In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.insert(0, '..')

from collections import defaultdict, Counter
import string
import pandas as pd

from paradeller.samples import sample1, sample2, sample3
from paradeller.analysis import tokenize, get_combos, load_all, create_word_dict
from paradeller.helper import load_archive

### Get samples into standard format

In [3]:
# Running id shared by every call to datafy_poem, so ids stay unique across poems.
ID_COUNTER = 0

def datafy_poem(text, name):
    """
    Convert a poem's raw text into a list of line records.

    Each non-blank line is normalized by passing it through `tokenize`
    and re-joining the tokens with single spaces. Repeated lines are
    kept: every line gets its own record and its own id.

    Parameters
    ----------
    text : str
        Full poem text, one poem line per newline-separated line.
    name : str
        Label stored in the "poem" field of every record.

    Returns
    -------
    list of dict
        One dict per kept line with the keys "id" (taken from the
        module-level ID_COUNTER, which is incremented once per record),
        "text", "poem" and "line" (0-based position among the kept
        lines of this poem).
    """
    global ID_COUNTER
    # normalize each line; skip blank lines, including whitespace-only ones,
    # so they do not become records with empty text
    lines = [
        ' '.join(tokenize(x)) for x in text.split("\n")
        if x.strip()
    ]
    # convert to dict
    data = []
    for i, line in enumerate(lines):
        data.append(dict(
            id=ID_COUNTER,
            text=line,
            poem=name,
            line=i
        ))
        ID_COUNTER += 1
    return data

# build the records of each sample poem in order (ids keep counting up
# from one poem to the next), then merge them into a single list
data1, data2, data3 = (
    datafy_poem(poem_text, label)
    for poem_text, label in (
        (sample1, "sample1"),
        (sample2, "sample2"),
        (sample3, "sample3"),
    )
)
data = [*data1, *data2, *data3]

In [4]:
# one row per poem line, indexed by the record id
df = (
    pd.DataFrame(data)
    .set_index('id')
)

# peek at both ends of the table
display(df.head(2), df.tail(2))

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars


Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
70,22,sample3,earthbound limitations on celestial wings
71,23,sample3,and as twilight falls explore wealth untapped


## Load into dict

In [5]:
import itertools
from itertools import permutations, combinations

In [6]:
# inverted index: word -> ids of the lines it appears in
# (an id is appended once per occurrence, so it can repeat within a list)
d = defaultdict(list)

for item in data:
    line_id = item['id']
    tokens = tokenize(item['text'])
    for token in tokens:
        d[token].append(line_id)

# plain-dict copy: looking up an unknown word raises KeyError
# instead of silently creating an empty entry
db = dict(d)

print(db)

{'meet': [0, 1, 5, 22], 'me': [0, 1, 5, 8, 9, 11, 19, 23, 48, 49, 52, 66], 'on': [0, 1, 4, 18, 54, 55, 58, 70], 'the': [0, 1, 2, 3, 4, 4, 18, 20, 24, 24, 25, 25, 28, 29, 42, 43, 60, 61, 64, 68], 'darkest': [0, 1, 4, 18], 'sea': [0, 1, 5, 18], 'of': [0, 1, 5, 6, 7, 10, 18, 21, 30, 31, 34, 38, 39, 40, 43, 46, 60, 61, 64, 68], 'dead': [0, 1, 5, 18], 'stars': [0, 1, 5, 23], 'when': [2, 3, 5, 20], 'waves': [2, 3, 5, 20], 'burn': [2, 3, 4, 20], 'my': [2, 3, 5, 12, 13, 16, 20, 21], 'skin': [2, 3, 5, 20], 'i’ll': [2, 3, 4, 20], 'remember': [2, 3, 4, 20], 'fall': [6, 7, 10, 23], 'into': [6, 7, 11, 23], 'this': [6, 7, 10, 21], 'faulty': [6, 7, 10, 21], 'trap': [6, 7, 10, 21], 'myself': [6, 7, 10, 21], 'explain': [8, 9, 10, 19], 'without': [8, 9, 11, 19], 'understanding': [8, 9, 11, 19], 'why': [8, 9, 11, 23], 'buries': [12, 13, 17, 22], 'your': [12, 13, 16, 19, 62, 63, 65, 67], 'sadness': [12, 13, 17, 22], 'in': [12, 13, 16, 22], 'abstraction': [12, 13, 16, 18], 'because': [14, 15, 17, 21], 'tim

### Count how many words other lines share with a given line

In [7]:
# pick the record with a given id and split its text into tokens
i = 0
item = next(filter(lambda rec: rec['id'] == i, data))
tokens = tokenize(item['text'])

tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [8]:
# for every word of the chosen line, tally the ids of the *other* lines
# that word appears in; a high tally means many shared words
c = Counter()
own_id = item['id']
for word in tokens:
    line_ids = [other for other in db[word] if not other == own_id]
    c.update(line_ids)

c.most_common(4)

[(1, 9), (5, 6), (18, 6), (4, 4)]

In [9]:
# keep only the line ids of the four best matches (drop the tallies)
ids = [line_id for line_id, _count in c.most_common(4)]
ids

[1, 5, 18, 4]

In [10]:
# show the rows for the best-matching line ids, in ranked order
df.loc[ids]

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,sample1,meet me on the darkest sea of dead stars
5,5,sample1,sea of dead waves when my skin meet me stars
18,18,sample1,on the darkest sea of dead abstraction
4,4,sample1,i’ll remember the burn on the darkest


### Combinations

In [11]:
# records that belong to the first sample poem only
sample1_data = [rec for rec in data if rec['poem'] == 'sample1']

In [12]:
# first six lines of sample1
df.loc[df['poem'] == 'sample1'].head(6)

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars
2,2,sample1,when the waves burn my skin i’ll remember
3,3,sample1,when the waves burn my skin i’ll remember
4,4,sample1,i’ll remember the burn on the darkest
5,5,sample1,sea of dead waves when my skin meet me stars


In [13]:
# build the combo-tuple -> list-of-line-ids mapping for sample1,
# then show how many distinct combos it holds
my_data = load_all(sample1_data)
len(my_data)

2484

In [14]:
# preview a slice of the mapping: keys number i up to (not including) j,
# in insertion order

i, j = 10, 20

some_keys = list(my_data)[i:j]

for k in some_keys:
    print(f"{str(k):25} {my_data[k]}")

('darkest', 'me')         [0, 1]
('darkest', 'meet')       [0, 1]
('darkest', 'of')         [0, 1, 18]
('darkest', 'on')         [0, 1, 4, 18]
('darkest', 'sea')        [0, 1, 18]
('darkest', 'stars')      [0, 1]
('darkest', 'the')        [0, 1, 4, 4, 18]
('dead', 'me')            [0, 1, 5]
('dead', 'meet')          [0, 1, 5]
('dead', 'of')            [0, 1, 5, 18]


In [59]:
def load_all(data: list):
    """
    Index every word combination found in `data`.

    NOTE: defining this function here shadows the `load_all` imported
    from `paradeller.analysis` at the top of the notebook.

    Parameters
    ----------
    data : list
        Records (dicts) that each provide a "text" string and an "id".

    Returns
    -------
    defaultdict(list)
        Maps each combo yielded by `get_combos` to the ids of the
        records that produced it, in input order.
    """
    index = defaultdict(list)

    for record in data:
        # split the text into tokens ...
        words = tokenize(record["text"])
        # ... and file the record's id under every combo of those tokens
        for combo in get_combos(words):
            index[combo].append(record["id"])
    return index

In [60]:
# take the text of the first sample1 record as a worked example
text = sample1_data[0]['text']
text

'meet me on the darkest sea of dead stars'

In [61]:
# split the example line into its word tokens
tokens = tokenize(text)
tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [64]:
# put the tokens in alphabetical order
# (presumably so combos do not depend on word order -- confirm against get_combos)
tokens = sorted(tokens)

In [66]:
tokens

['darkest', 'dead', 'me', 'meet', 'of', 'on', 'sea', 'stars', 'the']

In [70]:
# print the sorted tokens, one per line
for i, token in enumerate(tokens):
    print(token)

darkest
dead
me
meet
of
on
sea
stars
the


In [28]:
# every word combination for the example line (displayed in the next cell)
combos = get_combos(tokens)
# combos

In [33]:
combos

[('darkest',),
 ('dead',),
 ('me',),
 ('meet',),
 ('of',),
 ('on',),
 ('sea',),
 ('stars',),
 ('the',),
 ('darkest', 'dead'),
 ('darkest', 'me'),
 ('darkest', 'meet'),
 ('darkest', 'of'),
 ('darkest', 'on'),
 ('darkest', 'sea'),
 ('darkest', 'stars'),
 ('darkest', 'the'),
 ('dead', 'me'),
 ('dead', 'meet'),
 ('dead', 'of'),
 ('dead', 'on'),
 ('dead', 'sea'),
 ('dead', 'stars'),
 ('dead', 'the'),
 ('me', 'meet'),
 ('me', 'of'),
 ('me', 'on'),
 ('me', 'sea'),
 ('me', 'stars'),
 ('me', 'the'),
 ('meet', 'of'),
 ('meet', 'on'),
 ('meet', 'sea'),
 ('meet', 'stars'),
 ('meet', 'the'),
 ('of', 'on'),
 ('of', 'sea'),
 ('of', 'stars'),
 ('of', 'the'),
 ('on', 'sea'),
 ('on', 'stars'),
 ('on', 'the'),
 ('sea', 'stars'),
 ('sea', 'the'),
 ('stars', 'the'),
 ('darkest', 'dead', 'me'),
 ('darkest', 'dead', 'meet'),
 ('darkest', 'dead', 'of'),
 ('darkest', 'dead', 'on'),
 ('darkest', 'dead', 'sea'),
 ('darkest', 'dead', 'stars'),
 ('darkest', 'dead', 'the'),
 ('darkest', 'me', 'meet'),
 ('darkest', 

In [32]:
# sorted(combos, key=lambda k: len(k))

In [17]:
# display the complete combo -> line-ids mapping for sample1 (long output)
load_all(sample1_data)

defaultdict(list,
            {('darkest',): [0, 1, 4, 18],
             ('dead',): [0, 1, 5, 18],
             ('me',): [0, 1, 5, 8, 9, 11, 19, 23],
             ('meet',): [0, 1, 5, 22],
             ('of',): [0, 1, 5, 6, 7, 10, 18, 21],
             ('on',): [0, 1, 4, 18],
             ('sea',): [0, 1, 5, 18],
             ('stars',): [0, 1, 5, 23],
             ('the',): [0, 1, 2, 3, 4, 4, 18, 20],
             ('darkest', 'dead'): [0, 1, 18],
             ('darkest', 'me'): [0, 1],
             ('darkest', 'meet'): [0, 1],
             ('darkest', 'of'): [0, 1, 18],
             ('darkest', 'on'): [0, 1, 4, 18],
             ('darkest', 'sea'): [0, 1, 18],
             ('darkest', 'stars'): [0, 1],
             ('darkest', 'the'): [0, 1, 4, 4, 18],
             ('dead', 'me'): [0, 1, 5],
             ('dead', 'meet'): [0, 1, 5],
             ('dead', 'of'): [0, 1, 5, 18],
             ('dead', 'on'): [0, 1, 18],
             ('dead', 'sea'): [0, 1, 5, 18],
             ('dead', 's

In [19]:
# single-word index for sample1: word -> ids of the lines it appears in
# NOTE: this rebinds `d`, which an earlier cell used for the defaultdict word index
d = create_word_dict(sample1_data)
d

{'meet': [0, 1, 5, 22],
 'me': [0, 1, 5, 8, 9, 11, 19, 23],
 'on': [0, 1, 4, 18],
 'the': [0, 1, 2, 3, 4, 4, 18, 20],
 'darkest': [0, 1, 4, 18],
 'sea': [0, 1, 5, 18],
 'of': [0, 1, 5, 6, 7, 10, 18, 21],
 'dead': [0, 1, 5, 18],
 'stars': [0, 1, 5, 23],
 'when': [2, 3, 5, 20],
 'waves': [2, 3, 5, 20],
 'burn': [2, 3, 4, 20],
 'my': [2, 3, 5, 12, 13, 16, 20, 21],
 'skin': [2, 3, 5, 20],
 'i’ll': [2, 3, 4, 20],
 'remember': [2, 3, 4, 20],
 'fall': [6, 7, 10, 23],
 'into': [6, 7, 11, 23],
 'this': [6, 7, 10, 21],
 'faulty': [6, 7, 10, 21],
 'trap': [6, 7, 10, 21],
 'myself': [6, 7, 10, 21],
 'explain': [8, 9, 10, 19],
 'without': [8, 9, 11, 19],
 'understanding': [8, 9, 11, 19],
 'why': [8, 9, 11, 23],
 'buries': [12, 13, 17, 22],
 'your': [12, 13, 16, 19],
 'sadness': [12, 13, 17, 22],
 'in': [12, 13, 16, 22],
 'abstraction': [12, 13, 16, 18],
 'because': [14, 15, 17, 21],
 'time': [14, 15, 17, 22],
 'worries': [14, 15, 17, 22],
 'us': [14, 15, 17, 22],
 'eternally': [14, 15, 16, 23]}

In [40]:
# order the combos from most to fewest associated line ids
sorted_keys = sorted(
    my_data.keys(),
    key=lambda combo: len(my_data[combo]),
    reverse=True,
)

## Real Data

In [35]:
import pickle

In [22]:
# load the archived data and report how many entries it holds
# (presumably records with "id" and "text", since it is fed to load_all below -- confirm)
a = load_archive()
print(f"{len(a):,}")

53,898


In [23]:
# build the combo index for the full archive (slow: minutes for ~54k entries)
# NOTE: this rebinds `my_data`, replacing the sample1 index built earlier
my_data = load_all(a)

100%|██████████| 53898/53898 [05:42<00:00, 157.29it/s]  


In [25]:
# number of distinct combos in the archive index, with thousands separators
L = len(my_data)
print(f"{L:,}")

21,234,158


In [34]:
# persist the combo index to disk so it does not have to be rebuilt
with open('test.pickle', "wb") as fh:
    pickle.dump(my_data, fh)