In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.insert(0, '..')

from collections import defaultdict, Counter
import string
import pandas as pd

from paradeller.samples import sample1, sample2, sample3
from paradeller.analysis import tokenize
from paradeller.helper import load_archive

### Get into standard format

In [3]:
ID_COUNTER = 0

def datafy_poem(text, name):
    """Convert a poem's raw text into a list of line records.

    Every line with visible content is normalized by joining its
    ``tokenize`` output with single spaces. Repeated lines are kept;
    only blank lines are dropped.

    Args:
        text: Full poem as one string, lines separated by ``"\\n"``.
        name: Label stored in each record's ``poem`` field.

    Returns:
        A list of dicts with keys ``id`` (taken from the module-level
        ``ID_COUNTER``, which is advanced by one per record), ``text``
        (the normalized line), ``poem`` (``name``) and ``line`` (0-based
        position among the kept lines).
    """
    global ID_COUNTER
    # Test the stripped line rather than `x != ''` so whitespace-only
    # lines, and the lone "\r" left by Windows line endings, do not become
    # empty records that shift the line numbering.
    lines = [
        ' '.join(tokenize(x)) for x in text.split("\n")
        if x.strip()
    ]
    # Named `records` so the notebook-level `data` list is not shadowed.
    records = []
    for i, line in enumerate(lines):
        records.append(dict(
            id=ID_COUNTER,
            text=line,
            poem=name,
            line=i
        ))
        ID_COUNTER += 1
    return records

# Convert the three sample poems in order (ids keep counting up across
# calls), then pool every record into one flat list.
data1, data2, data3 = (
    datafy_poem(poem_text, poem_name)
    for poem_text, poem_name in (
        (sample1, "sample1"),
        (sample2, "sample2"),
        (sample3, "sample3"),
    )
)
data = [*data1, *data2, *data3]

In [4]:
# Tabular view of the line records, indexed by their global id.
df = pd.DataFrame(data).set_index('id')

# Sanity check: first two and last two rows, each rendered separately.
display(df.head(2), df.tail(2))

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars


Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
70,22,sample3,earthbound limitations on celestial wings
71,23,sample3,and as twilight falls explore wealth untapped


## Load into dict (inverted index: token → line ids)

In [5]:
import itertools
from itertools import permutations, combinations

In [6]:
# Inverted index: token -> ids of the lines it occurs in. An id is added
# once per occurrence, so a word repeated inside a line lists that line's
# id more than once.
d = defaultdict(list)

for item in data:
    for token in tokenize(item['text']):
        d[token].append(item['id'])

# Freeze into a plain dict so later lookups of unknown words raise
# instead of silently inserting empty lists.
db = dict(d)

print(db)

{'meet': [0, 1, 5, 22], 'me': [0, 1, 5, 8, 9, 11, 19, 23, 48, 49, 52, 66], 'on': [0, 1, 4, 18, 54, 55, 58, 70], 'the': [0, 1, 2, 3, 4, 4, 18, 20, 24, 24, 25, 25, 28, 29, 42, 43, 60, 61, 64, 68], 'darkest': [0, 1, 4, 18], 'sea': [0, 1, 5, 18], 'of': [0, 1, 5, 6, 7, 10, 18, 21, 30, 31, 34, 38, 39, 40, 43, 46, 60, 61, 64, 68], 'dead': [0, 1, 5, 18], 'stars': [0, 1, 5, 23], 'when': [2, 3, 5, 20], 'waves': [2, 3, 5, 20], 'burn': [2, 3, 4, 20], 'my': [2, 3, 5, 12, 13, 16, 20, 21], 'skin': [2, 3, 5, 20], 'i’ll': [2, 3, 4, 20], 'remember': [2, 3, 4, 20], 'fall': [6, 7, 10, 23], 'into': [6, 7, 11, 23], 'this': [6, 7, 10, 21], 'faulty': [6, 7, 10, 21], 'trap': [6, 7, 10, 21], 'myself': [6, 7, 10, 21], 'explain': [8, 9, 10, 19], 'without': [8, 9, 11, 19], 'understanding': [8, 9, 11, 19], 'why': [8, 9, 11, 23], 'buries': [12, 13, 17, 22], 'your': [12, 13, 16, 19, 62, 63, 65, 67], 'sadness': [12, 13, 17, 22], 'in': [12, 13, 16, 22], 'abstraction': [12, 13, 16, 18], 'because': [14, 15, 17, 21], 'tim

### Count # times other lines share a word

In [7]:
# Pick one line record by id and recover its word list.
i = 0
item = next(filter(lambda rec: rec['id'] == i, data))
tokens = tokenize(item['text'])

tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [8]:
# For each of this line's tokens, walk the token's postings and tally
# every other line id found there (the line itself is excluded).
c = Counter(
    other_id
    for word in tokens
    for other_id in db[word]
    if other_id != item['id']
)

c.most_common(4)

[(1, 9), (5, 6), (18, 6), (4, 4)]

In [9]:
# Keep only the line ids of the four highest counts, best first.
ids = [line_id for line_id, _count in c.most_common(4)]
ids

[1, 5, 18, 4]

In [10]:
# Show the records of the best-matching lines, in ranking order.
df.loc[ids]

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,sample1,meet me on the darkest sea of dead stars
5,5,sample1,sea of dead waves when my skin meet me stars
18,18,sample1,on the darkest sea of dead abstraction
4,4,sample1,i’ll remember the burn on the darkest


### Combinations

In [11]:
def get_combos(tokens, max_size=None):
    """Return the non-empty combinations of the sorted tokens.

    Tokens are sorted once so that every combination is a tuple in a
    canonical order, usable as a dictionary key regardless of the word
    order in the source line. Duplicate tokens are treated as distinct
    positions, so the same tuple can appear more than once in the result.

    Without a size limit the result has ``2**n - 1`` entries for ``n``
    tokens, so long inputs grow very quickly; use ``max_size`` to bound it.

    Args:
        tokens: Iterable of sortable tokens (e.g. a list of words).
        max_size: Largest combination length to generate. ``None``
            (the default) generates every length up to ``len(tokens)``.

    Returns:
        A list of tuples, grouped by increasing length; within one length
        the tuples follow ``itertools.combinations`` order over the
        sorted tokens.
    """
    ordered = sorted(tokens)  # sort once instead of once per length
    longest = len(ordered) if max_size is None else min(max_size, len(ordered))

    total = []
    for size in range(1, longest + 1):
        total.extend(combinations(ordered, size))
    return total

In [12]:
# Accumulator: word-combination tuple -> ids of the lines that produced it.
FULL = defaultdict(list)

In [13]:
# choose id
text_id = 0

# get tokens for that string
item = next(x for x in data if x['id'] == text_id)
tokens = tokenize(item['text'])
print(tokens)

# find combos
combos = get_combos(tokens)
print("# combos:", len(combos))

# add to record
# The loop variable is `combo`, not `c`, so the Counter named `c` built in
# the word-count cells is not overwritten by a tuple.
for combo in combos:
    FULL[combo].append(text_id)

print("# FULL  :", len(FULL))

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']
# combos: 511
# FULL  : 511


In [14]:
# choose id
text_id = 4

# get tokens for that string
item = next(x for x in data if x['id'] == text_id)
tokens = tokenize(item['text'])
print(tokens)

# find combos
combos = get_combos(tokens)
print("# combos:", len(combos))

# add to record
# The loop variable is `combo`, not `c`, so the Counter named `c` built in
# the word-count cells is not overwritten by a tuple.
for combo in combos:
    FULL[combo].append(text_id)

print("# FULL  :", len(FULL))

['i’ll', 'remember', 'the', 'burn', 'on', 'the', 'darkest']
# combos: 127
# FULL  : 599


In [15]:
def load_all(items=None):
    """Map every word combination to the ids of the records containing it.

    Args:
        items: Iterable of records with ``'id'`` and ``'text'`` keys.
            Defaults to the notebook-level ``data`` list, which is what
            this function always read before the parameter existed.

    Returns:
        A ``defaultdict(list)`` keyed by the combination tuples that
        ``get_combos`` produces for each record's tokens. Each value lists
        record ids in input order; an id is appended once per produced
        combination, so it repeats when ``get_combos`` yields the same
        tuple more than once for one record.
    """
    if items is None:
        items = data

    my_data = defaultdict(list)

    for item in items:
        item_id = item['id']
        # tokenize, expand into combos, and record this id under each one
        for combo in get_combos(tokenize(item['text'])):
            my_data[combo].append(item_id)

    return my_data

In [16]:
# Build the combination -> line-ids index for every record in `data`.
my_data = load_all()

In [17]:
# preview: print a small window of the combination index

i, j = 10, 20

# Take keys i..j-1 in insertion order without copying every key first.
some_keys = list(itertools.islice(my_data, i, j))

for k in some_keys:
    print(f"{str(k):25} {my_data[k]}")

('darkest', 'me')         [0, 1]
('darkest', 'meet')       [0, 1]
('darkest', 'of')         [0, 1, 18]
('darkest', 'on')         [0, 1, 4, 18]
('darkest', 'sea')        [0, 1, 18]
('darkest', 'stars')      [0, 1]
('darkest', 'the')        [0, 1, 4, 4, 18]
('dead', 'me')            [0, 1, 5]
('dead', 'meet')          [0, 1, 5]
('dead', 'of')            [0, 1, 5, 18]


In [18]:
# Load the archive through the project helper.
# NOTE(review): presumably a list of dicts with 'id' and 'text' keys — confirm against load_archive.
a = load_archive()

In [19]:
# Number of entries in the loaded archive.
len(a)

[{'id': 1144358142547046407,
  'text': '15 followers.....you are appreciated.  Thank you for being here',
  'author': 'jenndanza',
  'time': '2019-06-27 21:33:36'},
 {'id': 1144358142463160332,
  'text': 'Arjun felt perplexed. He is not her raven!',
  'author': 'PSteidle',
  'time': '2019-06-27 21:33:36'},
 {'id': 1144358142437969920,
  'text': 'I got so sun burnt while swimming today😩',
  'author': 'CrissmanMegan',
  'time': '2019-06-27 21:33:36'},
 {'id': 1144358142387662851,
  'text': 'I hate delayed trains',
  'author': 'maw_cungi',
  'time': '2019-06-27 21:33:36'},
 {'id': 1144358142341525517,
  'text': 'let me get that tiddy.. on the rocks bitch',
  'author': 'miliondollameat',
  'time': '2019-06-27 21:33:36'},
 {'id': 1144358142307794945,
  'text': 'Best news ever 😭😍💜',
  'author': 'ItsAngelBaby_',
  'time': '2019-06-27 21:33:36'},
 {'id': 1144358142219837440,
  'text': 'Stop the world and let me off',
  'author': '1jathan',
  'time': '2019-06-27 21:33:36'},
 {'id': 114435814220