In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.insert(0, '..')

from collections import defaultdict, Counter
import string
import pandas as pd

from paradeller.samples import sample1, sample2, sample3
from paradeller.analysis import tokenize
from paradeller.helper import load_archive

### Get samples into standard format

In [3]:
ID_COUNTER = 0

def datafy_poem(text, name):
    """Turn a poem's raw text into a list of per-line record dicts.

    Empty lines are dropped; every remaining line is normalised by joining
    the output of ``tokenize`` with single spaces. Repeated lines are kept
    (nothing is de-duplicated here).

    Each record carries the keys ``id`` (taken from the module-level
    ``ID_COUNTER``, which advances by one per record), ``text``, ``poem``
    (the given ``name``) and ``line`` (0-based position among kept lines).
    """
    global ID_COUNTER

    normalised = []
    for raw_line in text.split("\n"):
        if raw_line == '':
            continue
        normalised.append(' '.join(tokenize(raw_line)))

    first_id = ID_COUNTER
    records = [
        {"id": first_id + position, "text": line_text, "poem": name, "line": position}
        for position, line_text in enumerate(normalised)
    ]
    # move the shared counter past the ids handed out above
    ID_COUNTER = first_id + len(records)
    return records

data1 = datafy_poem(sample1, "sample1")
data2 = datafy_poem(sample2, "sample2")
data3 = datafy_poem(sample3, "sample3")
data = data1 + data2 + data3

In [4]:
df = pd.DataFrame(data).set_index('id')

df.head(6)

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars
2,2,sample1,when the waves burn my skin i’ll remember
3,3,sample1,when the waves burn my skin i’ll remember
4,4,sample1,i’ll remember the burn on the darkest
5,5,sample1,sea of dead waves when my skin meet me stars


In [5]:
data = load_archive()

## Do the algorithm

### Create adjacency lists

In [6]:
from paradeller.analysis import (
    make_adj_list_by_word, make_adj_list_by_id
)

In [7]:
adj_list_words = make_adj_list_by_word(data)

# print(adj_list_words)

In [8]:
adj_list_ids = make_adj_list_by_id(data)

# print(adj_list_ids)

### Choose Tweets

In [9]:
def get_tweet(i):
    """Return the first record in the global ``data`` whose ``'id'`` equals ``i``.

    Raises:
        KeyError: if no record has that id. A bare ``next()`` over a generator
            would raise ``StopIteration`` here instead, which gives no hint
            about the missing id and can silently end an enclosing iteration
            (e.g. ``map(get_tweet, ids)``).
    """
    # ``data`` is resolved at call time, so a re-loaded archive is honoured.
    for record in data:
        if record['id'] == i:
            return record
    raise KeyError(f"no tweet with id {i!r} in data")

In [10]:
# ID1 = 0
# ID2 = 2

In [11]:
# ID1 = 1144358142387662851
# ID2 = 1144358142219837440

In [12]:
# get_tweet(ID1)

In [13]:
# get_tweet(ID2)

### Find matches

In [14]:
from itertools import combinations
import random
from tqdm import tqdm, trange

In [15]:
from paradeller.analysis import (
    get_master_word_set,
    get_potential_tweets,
    filter_potential_tweets,
    find_valid_matches,
    find_maches
)

In [16]:
ids = list(adj_list_ids.keys())

In [17]:
len(ids)

45832

In [72]:
# Draw the ids WITHOUT replacement. Calling random.choice in a loop can pick
# the same id several times, and combinations() would then produce pairs of a
# tweet with itself as well as repeated pairs.
# min() keeps this working when fewer than 500 ids are available.
some_ids = random.sample(ids, min(500, len(ids)))

In [73]:
combos = list(combinations(some_ids, 2))
len(combos)

124750

In [74]:
# Map each candidate pair to the matches find_maches reports for it;
# pairs whose result is empty (falsy) are left out of the mapping.
all_valid = {}

for c in tqdm(combos):
    first_id, second_id = c[0], c[1]
    valid = find_maches(first_id, second_id, adj_list_ids, adj_list_words)
    if not valid:
        continue
    all_valid[c] = valid

100%|██████████| 124750/124750 [28:52<00:00, 72.00it/s] 


In [75]:
len(all_valid)

99

In [76]:
def not_exact_match(start_pair, found_pair):
    """Return True when the first found tweet's words differ from both start tweets.

    Only the first element of ``found_pair`` is compared; each pair must
    unpack into exactly two ids. Word collections are read from the global
    ``adj_list_ids``.
    """
    first_start, second_start = start_pair
    candidate, _ = found_pair
    if adj_list_ids[candidate] == adj_list_ids[first_start]:
        return False
    if adj_list_ids[candidate] == adj_list_ids[second_start]:
        return False
    return True

In [87]:
def exact_match(start_pair, found_pair):
    """Tell whether the first found tweet has the same words as either start tweet.

    Only the first element of ``found_pair`` is inspected; each pair must
    unpack into exactly two ids. Word collections are read from the global
    ``adj_list_ids``.
    """
    first_start, second_start = start_pair
    candidate, _ = found_pair
    candidate_words = adj_list_ids[candidate]
    return (
        candidate_words == adj_list_ids[first_start]
        or candidate_words == adj_list_ids[second_start]
    )

In [110]:
def find_start_lines(t1, t2, exact_matches, adj_list_ids):
    """Choose the four opening lines ``[line1, echo1, line2, echo2]`` of a stanza.

    Args:
        t1, t2: ids of the two starting tweets.
        exact_matches: iterable of ``(a, b)`` id pairs to search.
        adj_list_ids: mapping from tweet id to that tweet's words.

    Returns:
        ``[t1, x, t2, y]`` where ``x`` / ``y`` are the ids from the first pair
        whose words equal those of ``t1`` / ``t2`` respectively (the pair may
        be stored in either order). If no pair qualifies, each start tweet
        echoes itself: ``[t1, t1, t2, t2]``.
    """
    words1 = adj_list_ids[t1]
    words2 = adj_list_ids[t2]

    for a, b in exact_matches:
        words_a = adj_list_ids[a]
        words_b = adj_list_ids[b]
        if words1 == words_a and words2 == words_b:
            return [t1, a, t2, b]
        if words1 == words_b and words2 == words_a:
            # Pair is stored swapped: b echoes t1, so a (not b) echoes t2.
            return [t1, b, t2, a]

    return [t1, t1, t2, t2]

In [124]:
# Print a six-line stanza for every starting pair that has at least one
# match which is not an exact match (per exact_match) of the starting tweets.
for pair, matches in all_valid.items():
    # pair is the (t1, t2) key stored in all_valid; matches is the value
    # that find_maches returned for that pair.
    t1, t2 = pair
    
    # Not read again inside this loop. These are notebook globals, and a later
    # cell that filters `matches` by words1/words2 reads them after the loop.
    words1 = adj_list_ids[t1]
    words2 = adj_list_ids[t2]
    
    exact_matches = [m for m in matches if exact_match(pair, m)]
    # NOTE: going through set() discards the original order of `matches`, so
    # which non-exact match ends up at index 0 below is arbitrary.
    non_exact_matches = list(set(matches) - set(exact_matches))
    
    if non_exact_matches:
        # look for reframing of first lines
        stanza = find_start_lines(t1, t2, exact_matches, adj_list_ids)
            
        # Lines 5 and 6: the two tweets of one non-exact match.
        stanza.extend([
            non_exact_matches[0][0],
            non_exact_matches[0][1]
        ])
        
        print("~"*50)
        for t in stanza:
            tweet = get_tweet(t)
            # author is left-aligned and padded to 20 characters
            print(f"@{tweet['author']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@DiamondMangum        I’m tired 😓 
@DiamondMangum        I’m tired 😓 
@She_Daaeee           I’m so hungry 😩 
@She_Daaeee           I’m so hungry 😩 
@Miyakelashaee_       I’m hungry . 
@shanice_thusi        I’m so tired 😓 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@DiamondMangum        I’m tired 😓 
@DiamondMangum        I’m tired 😓 
@RJIVE                MONI IS SO CUTE 
@RJIVE                MONI IS SO CUTE 
@shanice_thusi        I’m so tired 😓 
@_lychiii_            moni is so cute 🥺🥺🥺🥺 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@DiamondMangum        I’m tired 😓 
@DiamondMangum        I’m tired 😓 
@MostHATED_Noump      I’m so fucking irritated 
@MostHATED_Noump      I’m so fucking irritated 
@JossiGainza          So fucking tired 
@tayyysiaaaaa         I’m so irritated 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@lysaaaab             WHAT IT DO BABYYYYYYYY 
@lysaaaab             WHAT IT DO BABYYYYYYYY 
@mikes

In [37]:
# For every starting pair, print the two starting tweets followed by each
# match (a, b) whose first tweet's words differ from both starting tweets.
# NOTE: t1, t2, words1 and words2 are notebook globals; after the loop they
# keep the values of the last pair, and later cells read them.
for pair, matches in all_valid.items():
    t1, t2 = pair
    
    words1 = adj_list_ids[t1]
    words2 = adj_list_ids[t2]
    
    for m in matches:
        a, b = m
        # Only `a` is compared against the starting tweets; `b` is not checked.
        if (adj_list_ids[a] != words1) and (adj_list_ids[a] != words2):
            print(get_tweet(t1)['text'])
            print(get_tweet(t2)['text'])
            print(get_tweet(a)['text'])
            print(get_tweet(b)['text'])
            print("~"*30)

Am i so hard to love ?
Am drunk 😤
i am dRunk
Am i so hard to love?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Am i so hard to love ?
God is good 🤞🏾
God is so good
Am i so hard to love?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I keep thinking today is Saturday
Am drunk 😤
i am dRunk
I keep thinking today is Saturday 😂😭
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Acid Rap is on Spotify now. 🙌🏻🙌🏻🙌🏻
it’s above me!
it’s above me now.
Acid Rap is on Spotify 🙌🏽🙌🏽🙌🏽
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Acid Rap is on Spotify now. 🙌🏻🙌🏻🙌🏻
it’s above me!
It’s above me now 🤷🏽‍♀️
Acid Rap is on Spotify 🙌🏽🙌🏽🙌🏽
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
God is good 🤞🏾
LETS GO USA ⚽️🇺🇸
lets go USA
God is good 🙏🏼❤️
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
God is good 🤞🏾
I am so hungry.
God is so good
i am hungry
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [64]:
matches = find_maches(t1, t2, adj_list_ids, adj_list_words)

In [65]:
for m in matches:
    a, b = m
    if not ((adj_list_ids[a] == words1) or (adj_list_ids[a] == words2)):
        print(get_tweet(a)['text'])
        print(get_tweet(b)['text'])
        print("~"*30)

In [None]:
(1144708038831550468, 1145534442682945537)

In [38]:
get_tweet(1144708038831550468)

{'id': 1144708038831550468,
 'text': 'Ovie 😍😘😘😘 #LoveIsland',
 'author': 'ehidodo',
 'time': '2019-06-28 20:43:58'}

In [39]:
get_tweet(1145534442682945537)

{'id': 1145534442682945537,
 'text': 'Hello July 👋🏽🖤.',
 'author': 'AObaid23',
 'time': '2019-07-01 03:27:48'}

In [40]:
1145542253177425920, 1144708101590962176

(1145542253177425920, 1144708101590962176)

In [41]:
get_tweet(1145542253177425920)

{'id': 1145542253177425920,
 'text': 'Hello July 💙',
 'author': 'ulll97',
 'time': '2019-07-01 03:58:50'}

In [42]:
get_tweet(1144708101590962176)

{'id': 1144708101590962176,
 'text': 'Ovie 😰 #loveisland',
 'author': 'GeorgiaNunnx',
 'time': '2019-06-28 20:44:13'}

In [27]:
master_word_set = get_master_word_set(ID1, ID2, adj_list_ids)
# master_word_set

In [28]:
pot_ids = get_potential_tweets(ID1, ID2, adj_list_words, adj_list_ids)

In [17]:
pot_ids = filter_potential_tweets(
    pot_ids, adj_list_ids, master_word_set
)

In [18]:
len(pot_ids)

1

In [21]:
pot_ids

[1144677537576079362]

In [23]:
get_tweet(1144677537576079362)

{'id': 1144677537576079362,
 'text': 'I hate I hate I hate',
 'author': 'valeria_not',
 'time': '2019-06-28 18:42:46'}

In [19]:
valid = find_valid_matches(pot_ids, adj_list_ids, master_word_set)

In [20]:
valid

[]

### Count how many times other lines share a word

In [7]:
# get tokens for id
i = 0
item = next(x for x in data if x['id'] == i)
tokens = tokenize(item['text'])

tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [8]:
# Tally, for each other line id, how many of this line's tokens it is listed
# under in `db`; then show the four ids with the highest counts.
# NOTE(review): `db` is not defined in any visible cell. It is presumably a
# word -> line-ids mapping (such as adj_list_words); confirm before re-running.
c = Counter()
for word in tokens:
    # ids listed under this word, excluding the line being examined
    line_ids = [
        x for x in db[word]
        if x != item['id']
    ]
    c.update(line_ids)
    
c.most_common(4)

[(1, 9), (5, 6), (18, 6), (4, 4)]

In [9]:
ids = [x[0] for x in c.most_common(4)]
ids

[1, 5, 18, 4]

In [10]:
df.loc[ids]

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,sample1,meet me on the darkest sea of dead stars
5,5,sample1,sea of dead waves when my skin meet me stars
18,18,sample1,on the darkest sea of dead abstraction
4,4,sample1,i’ll remember the burn on the darkest


## Real Data

In [35]:
import pickle

In [22]:
a = load_archive()
print(f"{len(a):,}")

53,898


In [23]:
my_data = load_all(a)

100%|██████████| 53898/53898 [05:42<00:00, 157.29it/s]  


In [25]:
L = len(my_data)
print(f"{L:,}")

21,234,158


In [34]:
with open('test.pickle', "wb") as file:
    pickle.dump(my_data, file)