In [1]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.insert(0, '..')

from collections import defaultdict, Counter
import string
import pandas as pd

from paradeller.samples import sample1, sample2, sample3
from paradeller.analysis import tokenize

### Get into standard format

In [2]:
# Next id to hand out; datafy() stores it on each line record and then
# increments it, so ids keep growing across successive datafy() calls.
ID_COUNTER = 0

In [3]:
def datafy(text, name):
    """Convert a poem's raw text into a list of line records.

    Every line of ``text`` is normalised by joining the output of
    ``tokenize`` with single spaces. Lines whose normalised text is empty
    (blank, whitespace-only, or reduced to nothing by ``tokenize``) are
    skipped. Repeated lines are kept: no de-duplication is performed.

    Side effect: advances the module-level ``ID_COUNTER`` by the number of
    records produced, so ids continue across successive calls.

    Args:
        text: Newline-separated poem text.
        name: Label stored under the ``poem`` key of every record.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``poem`` and ``line``,
        where ``line`` is the 0-based position among the kept lines.
    """
    global ID_COUNTER

    # Normalise every line first, then keep only those with actual content,
    # so no record ever ends up with an empty ``text``.
    normalised = (' '.join(tokenize(x)) for x in text.split("\n"))
    lines = [line for line in normalised if line.strip()]

    data = [
        dict(
            id=ID_COUNTER + i,
            text=line,
            poem=name,
            line=i
        )
        for i, line in enumerate(lines)
    ]
    # Reserve the ids just handed out.
    ID_COUNTER += len(data)

    return data

In [4]:
def tokenize(text):
    """Split text into lowercase tokens with ASCII punctuation removed.

    Note: this definition shadows the ``tokenize`` imported from
    ``paradeller.analysis`` in the notebook's import cell.

    Only the characters in ``string.punctuation`` are deleted; other
    punctuation marks (for example the curly apostrophe) are left in place.

    Args:
        text: String to split on whitespace.

    Returns:
        List of tokens in their original order. Tokens made up entirely of
        punctuation are dropped rather than returned as empty strings.
    """
    # Build the deletion table once instead of once per token.
    strip_punct = str.maketrans('', '', string.punctuation)
    # split() already discards surrounding whitespace, so no strip() needed.
    cleaned = (word.lower().translate(strip_punct) for word in text.split())
    return [word for word in cleaned if word]

In [5]:
# datafy() advances the global ID_COUNTER as a side effect, so restart it
# here: re-running this cell then yields the same ids (starting at 0)
# instead of continuing from wherever the previous run stopped.
ID_COUNTER = 0

data1 = datafy(sample1, "sample1")
data2 = datafy(sample2, "sample2")
data3 = datafy(sample3, "sample3")

# All line records from the three samples, in poem order.
data = data1 + data2 + data3

In [6]:
# Tabular view of the line records, indexed by their id.
df = pd.DataFrame(data=data).set_index(keys='id')
df.head(n=5)

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars
2,2,sample1,when the waves burn my skin i’ll remember
3,3,sample1,when the waves burn my skin i’ll remember
4,4,sample1,i’ll remember the burn on the darkest


### Load into dict

In [12]:
# First six lines belonging to the "sample1" poem.
df.loc[df['poem'].eq('sample1')].head(n=6)

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars
2,2,sample1,when the waves burn my skin i’ll remember
3,3,sample1,when the waves burn my skin i’ll remember
4,4,sample1,i’ll remember the burn on the darkest
5,5,sample1,sea of dead waves when my skin meet me stars


In [13]:
# Inverted index: token -> ids of the lines it occurs in
# (one entry per occurrence).
d = defaultdict(list)

for item in data:
    line_id = item['id']
    tokens = tokenize(item['text'])

    for token in tokens:
        d[token] += [line_id]

# Plain-dict copy, so unknown tokens raise KeyError instead of being created.
db = {**d}

In [14]:
# Pick the record whose id equals i (first match in data).
i = 0

matching_records = (record for record in data if record['id'] == i)
item = next(matching_records)
item

{'id': 0,
 'text': 'meet me on the darkest sea of dead stars',
 'poem': 'sample1',
 'line': 0}

In [15]:
# Tokens of the selected record's text, in order.
tokens = tokenize(item['text'])
tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [16]:
# Number of tokens in the selected line.
len(tokens)

9

In [18]:
# Tally how often every *other* line id shows up in the index entries of
# the selected line's tokens.
c = Counter()
current_id = item['id']

for word in tokens:
    c.update(line_id for line_id in db[word] if line_id != current_id)

c.most_common(10)

[(1, 9),
 (5, 6),
 (18, 6),
 (4, 4),
 (23, 2),
 (24, 2),
 (25, 2),
 (43, 2),
 (60, 2),
 (61, 2)]

In [16]:
# Every line of the "sample1" poem.
df.loc[df['poem'].eq('sample1')]

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars
2,2,sample1,when the waves burn my skin i’ll remember
3,3,sample1,when the waves burn my skin i’ll remember
4,4,sample1,i’ll remember the burn on the darkest
5,5,sample1,sea of dead waves when my skin meet me stars
6,6,sample1,fall into this faulty trap of myself
7,7,sample1,fall into this faulty trap of myself
8,8,sample1,explain me without understanding why
9,9,sample1,explain me without understanding why


In [3]:
# Two toy sentences used below to try out set operations on their words.
s1 = "the dog is in the room"
s2 = "the dog he walks outside"

In [17]:
# Distinct whitespace-separated words of s1.
set1 = {word for word in s1.split()}
set1

{'dog', 'in', 'is', 'room', 'the'}

In [18]:
# Distinct whitespace-separated words of s2.
set2 = {word for word in s2.split()}
set2

{'dog', 'he', 'outside', 'the', 'walks'}

In [19]:
# Words that appear in s1 but not in s2.
set1.difference(set2)

{'in', 'is', 'room'}

In [23]:
# Words common to both sentences.
set1.intersection(set2)

{'dog', 'the'}

In [20]:
# Words that appear in exactly one of the two sentences.
set1.symmetric_difference(set2)

{'he', 'in', 'is', 'outside', 'room', 'walks'}