In [1]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.insert(0, '..')

import pickle
import random
from collections import Counter
from itertools import combinations

import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange
import altair as alt

from paradeller.helper import load_archive
from paradeller.samples import load_samples
from paradeller.dataprep import (
    tokenize,
    find_duplicates,
    filter_out_duplicates,
    filter_out_short,
    filter_out_oddballs,
    filter_out_oddballs_recursive,
    restructure_data,
    create_adj_list_by_word,
    create_adj_list_by_id
)
from paradeller.analysis import (
    find_matches,
    find_matches_for_start_pairs
)

In [2]:
def get_tweet(i):
    """Find tweet with given id.

    Scans the module-level ``data`` list in order and returns the first item
    whose ``"id"`` equals ``i``.

    Returns:
        The matching item, or ``None`` (after printing an error message)
        when no item has that id.
    """
    # next() with a default avoids using try/except for ordinary control flow
    tweet = next((x for x in data if x["id"] == i), None)
    if tweet is None:
        print("Error: No tweet with that ID")
    return tweet

def showlen(data):
    """Print the number of items in ``data`` with thousands separators."""
    count = len(data)
    print(f"Length: {count:,}")

## Get Data

In [3]:
# Load the archive (paradeller.helper.load_archive) and report how many items it holds.
data = load_archive()
showlen(data)

Length: 156,374


In [4]:
####################
USE_SAMPLE = False
####################

# When USE_SAMPLE is True, rebind `data` to the result of load_samples()
# instead of the archive; otherwise `data` is left untouched.
if USE_SAMPLE:
    data = load_samples()
    
showlen(data)

Length: 156,374


### Cleanup

In [5]:
# Remove entries that are too short (threshold n=3 — presumably a token count; confirm in filter_out_short).
# Rebinds `data`, so re-running this cell filters the already-filtered list.
data = filter_out_short(data, n=3)
showlen(data)

HBox(children=(IntProgress(value=0, max=156374), HTML(value='')))


Length: 155,584


In [6]:
# Identify duplicate phrases first; the result is handed to the filter below
# and is kept around for later use.
duplicates = find_duplicates(data)

# Remove the duplicate phrases (rebinds `data` to the filtered result).
data = filter_out_duplicates(data, duplicates)
showlen(data)

HBox(children=(IntProgress(value=0, max=155584), HTML(value='')))


Length: 152,247


In [7]:
# Remove oddballs (tweets with too few matches). Per the logged output, the
# filter is re-run until a pass removes nothing.
data = filter_out_oddballs_recursive(data)
showlen(data)

HBox(children=(IntProgress(value=0, max=152247), HTML(value='')))


39,654 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=112593), HTML(value='')))


5,268 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=107325), HTML(value='')))


1,011 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=106314), HTML(value='')))


179 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=106135), HTML(value='')))


43 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=106092), HTML(value='')))


Nothing removed. Done filtering.
Length: 106,092


### Create Adj. Lists

In [8]:
# Build both adjacency lists from the cleaned data. Later cells iterate
# adj_list_ids[tweet_id] as that tweet's words and take len() of
# adj_list_words[word].
adj_list_words, adj_list_ids = restructure_data(data)

HBox(children=(IntProgress(value=0, max=106092), HTML(value='')))




### Summary

In [9]:
# Summarize the container type and size of the cleaned dataset.
print(f"data {type(data)} {len(data):,}")

data <class 'list'> 106,092


## Do the algorithm

### Choose Some Ids

In [10]:
from statistics import mean

In [11]:
# Rank tweet ids by the average popularity of their words, where a word's
# popularity is the number of entries adj_list_words holds for it.
def _avg_word_popularity(words):
    return mean([len(adj_list_words[w]) for w in words])

pop = sorted(
    [
        (tweet_id, _avg_word_popularity(words))
        for tweet_id, words in tqdm(adj_list_ids.items())
    ],
    key=lambda entry: entry[1],
    reverse=True,
)

HBox(children=(IntProgress(value=0, max=106092), HTML(value='')))




In [12]:
# Tabulate the (tweet id, average word popularity) ranking.
# The 'pop' column must be read as df['pop']: df.pop is a DataFrame method.
df = pd.DataFrame(pop, columns=['id', 'pop'])
# df['pop'].describe()

In [13]:
# Take the 300 ids ranked first in `pop` and build every unordered pair of them.
some_ids = [x[0] for x in pop[:300]]
pairs = list(combinations(some_ids, 2))
# `comma_print` is never defined or imported in this notebook (NameError on a
# fresh kernel), so format the count directly.
print(f"{len(pairs):,}")

44,850


### Find matches

In [14]:
# Search for matches for every start pair. Later cells iterate the result
# with .items() as {start_pair: matches}.
all_valid = find_matches_for_start_pairs(pairs, adj_list_ids, adj_list_words)

HBox(children=(IntProgress(value=0, max=44850), HTML(value='')))




In [47]:
# Persist the matches so they can be reloaded without recomputing.
with open("../data/found.pickle", 'wb') as f:
    pickle.dump(all_valid, f)

In [49]:
# Reload the previously saved matches.
# NOTE: pickle.load can execute arbitrary code — only load files this
# notebook wrote itself, never a pickle from an untrusted source.
with open("../data/found.pickle", 'rb') as f:
    all_valid = pickle.load(f)

In [51]:
# Print every stanza found: each start line twice, then the matched pair.
for pair, matches in all_valid.items():
    t1, t2 = pair
    stanza_start = [t1, t1, t2, t2]
    for match in matches:
        a, b = match
        stanza = stanza_start + [a, b]
        print("~"*50)
        for t in stanza:
            tweet = get_tweet(t)
            # The f-prefix is required: without it the {…} placeholders are
            # printed literally instead of the author and text.
            print(f"\n@{tweet['author']:20} {tweet['text']}")

In [27]:
# Assemble the whole report in memory (one separator line per stanza, one
# "@author text" line per tweet), then print it in one go.
chunks = []
separator = "\n" + "~"*50
for (t1, t2), matches in all_valid.items():
    for a, b in matches:
        chunks.append(separator)
        for tweet_id in (t1, t1, t2, t2, a, b):
            tweet = get_tweet(tweet_id)
            chunks.append(f"\n@{tweet['author']:20} {tweet['text']}")
output = "".join(chunks)

print(output)


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@mnabna_              i love my 👀💦❤
@mnabna_              i love my 👀💦❤
@jnlsbb               I miss you.
@jnlsbb               I miss you.
@excerptsofelly       i love you
@chesterslegacy       I miss my ** :(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@mnabna_              i love my 👀💦❤
@mnabna_              i love my 👀💦❤
@SamaSami99           I miss me.
@SamaSami99           I miss me.
@chesterslegacy       I miss my ** :(
@Sonizzle21           I love me.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@mnabna_              i love my 👀💦❤
@mnabna_              i love my 👀💦❤
@yonggurk             So, I miss you.
@yonggurk             So, I miss you.
@mizukinana777        So I love you
@chesterslegacy       I miss my ** :(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@mnabna_              i love my 👀💦❤
@mnabna_              i love my 👀💦❤
@thiwinnn             I NEED A JOB!
@thiwinnn             I NEED A JOB!
@KaySo_Sw

In [24]:
# Save the report as text. The tweets contain emoji and other non-ASCII
# characters, so set the encoding explicitly rather than relying on the
# platform default (which can raise UnicodeEncodeError, e.g. on Windows).
with open("found.txt", "w", encoding="utf-8") as f:
    f.write(output)

In [26]:
# Same report as before, restricted to start pairs whose first entries in
# adj_list_ids differ.
report_parts = []
for pair, matches in all_valid.items():
    t1, t2 = pair
    if adj_list_ids[t1][0] == adj_list_ids[t2][0]:
        continue
    for a, b in matches:
        report_parts.append("\n" + "~"*50)
        for tweet_id in [t1, t1, t2, t2, a, b]:
            tweet = get_tweet(tweet_id)
            report_parts.append(f"\n@{tweet['author']:20} {tweet['text']}")
output = "".join(report_parts)
print(output)


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@mnabna_              i love my 👀💦❤
@mnabna_              i love my 👀💦❤
@yonggurk             So, I miss you.
@yonggurk             So, I miss you.
@mizukinana777        So I love you
@chesterslegacy       I miss my ** :(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@chesterslegacy       I miss my ** :(
@chesterslegacy       I miss my ** :(
@mizukinana777        So I love you
@mizukinana777        So I love you
@mnabna_              i love my 👀💦❤
@yonggurk             So, I miss you.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@excerptsofelly       i love you
@excerptsofelly       i love you
@yonggurk             So, I miss you.
@yonggurk             So, I miss you.
@jnlsbb               I miss you.
@mizukinana777        So I love you
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@Sonizzle21           I love me.
@Sonizzle21           I love me.
@yonggurk             So, I miss you.
@yonggurk             So, I miss yo

#### Filter out shared words

The best paradelles often have no words in common between their two start lines.

In [31]:
def no_common_words(pair):
    """Return True when the two tweets in ``pair`` share no words.

    ``pair`` holds two tweet ids; each id's words are looked up in the
    module-level ``adj_list_ids``.
    """
    first_id, second_id = pair[0], pair[1]
    first_words = set(adj_list_ids[first_id])
    second_words = set(adj_list_ids[second_id])
    return first_words.isdisjoint(second_words)

In [32]:
# pairs = [
#     p for p in pairs
#     if no_common_words(p)
# ]

In [33]:
# `comma_print` is never defined or imported in this notebook (NameError on a
# fresh kernel), so format the count directly.
print(f"{len(pairs):,}")

44,850


#### Find matches!!

In [34]:
from paradeller.analysis import find_matches, find_matches_for_start_pairs

In [35]:
# Re-run the match search over the start pairs; this rebinds `all_valid`,
# replacing the earlier result.
all_valid = find_matches_for_start_pairs(pairs, adj_list_ids, adj_list_words)

HBox(children=(IntProgress(value=0, max=44850), HTML(value='')))




In [36]:
# Number of entries in the result (presumably one per start pair that had matches — confirm).
# NOTE(review): the recorded output is 0 even though an earlier run produced matches.
len(all_valid)

0

In [37]:
# Print each stanza: a separator line, both start lines doubled, then the
# matched pair.
for (t1, t2), matches in all_valid.items():
    for a, b in matches:
        print("~"*50)
        for tweet_id in [t1, t1, t2, t2, a, b]:
            tweet = get_tweet(tweet_id)
            print(f"@{tweet['author']:20} {tweet['text']} ")

#### Parallel?

In [28]:
from math import sqrt
from joblib import Parallel, delayed

In [30]:
# matches = Parallel(n_jobs=2)(delayed(find_matches)(
#     p[0], p[1], adj_list_ids, adj_list_words
# ) for p in tqdm(pairs))

In [31]:
# %%timeit
# matches = Parallel(n_jobs=2)(delayed(find_matches)(
#     p[0], p[1], adj_list_ids, adj_list_words
# ) for p in pairs)

In [None]:
%%timeit
# Time the serial (non-parallel) run of find_matches over every start pair.
# NOTE: names assigned inside a %%timeit cell are not kept in the notebook
# namespace, so `matches` is not available to later cells.
matches = [
    find_matches(p[0], p[1], adj_list_ids, adj_list_words)
    for p in pairs
]

In [156]:
# NOTE(review): `combos` is not defined anywhere in the visible notebook;
# this cell only works with state left over from another session.
pairs = combos

# Keep only the start pairs for which find_matches returns something truthy.
all_valid = {}
for p in pairs:
    valid = find_matches(p[0], p[1], adj_list_ids, adj_list_words)
    if valid:
        all_valid[p] = valid

In [83]:
# all_valid

#### Display results

In [84]:
# Show every stanza: a separator line, then one "@author text" line per tweet.
for start_pair, found in all_valid.items():
    first, second = start_pair
    for closing in found:
        a, b = closing
        print("~"*50)
        for line_id in (first, first, second, second, a, b):
            tweet = get_tweet(line_id)
            print(f"@{tweet['author']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@unknown              where the cypress bow tis shady trees 
@unknown              where the cypress bow tis shady trees 
@unknown              beneath the bend and branches breeze 
@unknown              beneath the bend and branches breeze 
@unknown              tis the breeze beneath the cypress trees 
@unknown              where shady branches bend and bow 


### Including duplicates again

In [120]:
def display_stanza(ids):
    """Print one "@author text" line for each tweet id in ``ids``.

    Each id is resolved with ``get_tweet``; the author is padded to a
    minimum width of 20 characters.
    """
    # map() is lazy, so lookups and prints stay interleaved as in a plain loop
    for tweet in map(get_tweet, ids):
        print(f"@{tweet['author']:20} {tweet['text']} ")

In [217]:
# Hand-picked tweet ids to display together as a stanza.
lines = [
    1144358993399353354,
    1144688724996952064,
    1144692637733138432,
    1144688724506284033
]

In [218]:
# Show the author and text for each hand-picked id.
display_stanza(lines)

@BriannaS_            I’m tired asf 
@mamican_tarik        AAPL: 197.34 at Jun 28, 2019 7:27 PM 
@only_1diamond98      I’m tired asf 
@A7la7yati            AAPL: 197.34 at Jun 28, 2019 7:27 PM 


In [52]:
def exact_match(start_pair, found_pair):
    """Report whether the first found line repeats one of the start lines.

    Returns True when the ``adj_list_ids`` entry of the first id in
    ``found_pair`` equals the entry of either id in ``start_pair``. Only the
    first id of ``found_pair`` is compared; the second is unpacked but unused.
    """
    start_a, start_b = start_pair
    found_first, _ = found_pair
    if adj_list_ids[found_first] == adj_list_ids[start_a]:
        return True
    return adj_list_ids[found_first] == adj_list_ids[start_b]

In [53]:
def find_start_lines(t1, t2, exact_matches, adj_list_ids):
    """Choose the four opening lines of a stanza.

    Looks through ``exact_matches`` for the first pair whose two entries in
    ``adj_list_ids`` equal those of ``t1`` and ``t2`` (in either order) and
    uses that pair as the repeated lines.

    Returns:
        ``[t1, x, t2, y]`` where ``x`` repeats ``t1`` and ``y`` repeats
        ``t2``; ``[t1, t1, t2, t2]`` when no such pair exists.
    """
    first_words, second_words = adj_list_ids[t1], adj_list_ids[t2]

    for a, b in exact_matches:
        words_a, words_b = adj_list_ids[a], adj_list_ids[b]
        if first_words == words_a and second_words == words_b:
            return [t1, a, t2, b]
        if first_words == words_b and second_words == words_a:
            return [t1, b, t2, a]

    # No usable repeat found: fall back to doubling each start line.
    return [t1, t1, t2, t2]

In [54]:
# For each start pair, print one stanza. The opening lines use an exact
# repeat by another tweet when one exists; the closing lines are the first
# match that is not an exact repeat.
for pair, matches in all_valid.items():
    t1, t2 = pair

    # Split the matches in a single pass, keeping their original order. A set
    # difference would give an arbitrary order, making the stanza picked by
    # [0] below an accident of hashing.
    exact_matches = []
    non_exact_matches = []
    for m in matches:
        (exact_matches if exact_match(pair, m) else non_exact_matches).append(m)

    if non_exact_matches:
        stanza_start = find_start_lines(t1, t2, exact_matches, adj_list_ids)
        stanza_end = [
            non_exact_matches[0][0],
            non_exact_matches[0][1]
        ]
        stanza = stanza_start + stanza_end
        print("~"*50)
        # display_stanza prints the same "@author text" line per tweet.
        display_stanza(stanza)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nicksparks94         I am..........so tired 
@nicksparks94         I am..........so tired 
@videotpes            my head hurts SO BAD 
@videotpes            my head hurts SO BAD 
@jaymaleeth           i am tired . 
@taylorrrrxoxo_       My head hurts so bad 


### Count how many times other lines share a word

In [7]:
# get tokens for id
# Pick the item with id `i` and tokenize its text.
# next() has no default here, so it raises StopIteration if no item has that id.
i = 0
item = next(x for x in data if x['id'] == i)
tokens = tokenize(item['text'])

tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [8]:
# For every other line id, count how many of this line's tokens it is listed
# under in `db`. A token repeated in `tokens` is counted once per occurrence.
# `Counter` comes from collections (imported at the top of the notebook).
# NOTE(review): `db` is not defined in the visible notebook — presumably a
# word -> line-ids mapping like adj_list_words; confirm before running.
c = Counter()
for word in tokens:
    c.update(x for x in db[word] if x != item['id'])

c.most_common(4)

[(1, 9), (5, 6), (18, 6), (4, 4)]

In [9]:
# Keep just the ids from the four most common (id, count) entries.
ids = [x[0] for x in c.most_common(4)]
ids

[1, 5, 18, 4]

In [10]:
# Look up the rows labelled by the most similar line ids.
# NOTE(review): the recorded output has columns line/poem/text, which does not
# match the `df` built earlier from `pop` (columns 'id', 'pop'); this cell
# appears to depend on a different `df` from another session.
df.loc[ids]

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,sample1,meet me on the darkest sea of dead stars
5,5,sample1,sea of dead waves when my skin meet me stars
18,18,sample1,on the darkest sea of dead abstraction
4,4,sample1,i’ll remember the burn on the darkest


## Real Data

In [35]:
import pickle

In [22]:
# Load the archive again under a separate name and print its size.
a = load_archive()
print(f"{len(a):,}")

53,898


In [23]:
# NOTE(review): `load_all` is neither defined nor imported in the visible
# notebook; this cell only works with state from another session. The
# recorded progress bar shows the call took about 5m42s.
my_data = load_all(a)

100%|██████████| 53898/53898 [05:42<00:00, 157.29it/s]  


In [25]:
# Report how many entries load_all produced.
L = len(my_data)
print(f"{L:,}")

21,234,158


In [34]:
# Cache the loaded result on disk (written relative to the current working directory).
with open('test.pickle', "wb") as file:
    pickle.dump(my_data, file)