In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.insert(0, '..')

from collections import defaultdict, Counter
import string
import pandas as pd

from paradeller.helper import comma_print

## Load data

In [3]:
from paradeller.helper import load_archive
from paradeller.samples import load_samples

In [4]:
# from samples
sample_data = load_samples()
df = pd.DataFrame(sample_data).set_index('id')

In [5]:
# from archive
archive_data = load_archive()
comma_print(len(archive_data))

197,352


### Choose a source to use

In [6]:
##################
use_samples = False
##################

if use_samples:
    data = sample_data
else:
    data = archive_data

comma_print(len(data))

197,352


### Access data

In [7]:
def get_tweet(i):
    """Find tweet with given id"""
    try:
        return next(x for x in data if x["id"] == i)
    except StopIteration as e:
        print("Error: No tweet with that ID")
        return None

## Do the algorithm

### Filter down start tweets to choose from

#### Remove duplicate ids

Tweets with exactly the same id

In [8]:
d = {}
for item in data:
    d[item['id']] = item
data = list(d.values())
comma_print(len(data))

101,321


#### Remove duplicates phrases

Temporarily remove tweets with duplicate word sets

In [9]:
from paradeller.analysis import find_duplicates, filter_out_duplicates

In [10]:
# get duplicates { sorted words tuple: ids }
duplicates = find_duplicates(data)

# only keep first
data = filter_out_duplicates(data, duplicates)

HBox(children=(IntProgress(value=0, max=101321), HTML(value='')))




#### Every word must appear in at least 3 lines

In [11]:
from paradeller.analysis import filter_out_oddballs

In [12]:
data = filter_out_oddballs(data)

HBox(children=(IntProgress(value=0, max=99269), HTML(value='')))




### Create Adj. Lists

In [13]:
from paradeller.analysis import restructure_data

In [14]:
adj_list_words, adj_list_ids = restructure_data(data)

HBox(children=(IntProgress(value=0, max=69527), HTML(value='')))




In [15]:
comma_print(len(adj_list_words))
comma_print(len(adj_list_ids))

12,467
69,527


### Find matches

In [16]:
from itertools import combinations
import random
from tqdm import tqdm, trange

from paradeller.analysis import find_matches, get_potential_tweets, find_valid_matches

In [17]:
ids = list(adj_list_ids.keys())
comma_print(len(ids))

69,527


#### Randomly choose start pairs

In [30]:
some_ids = [
    random.choice(ids)
    for _ in range(300)
]

pairs = list(combinations(some_ids, 2))
comma_print(len(pairs))

44,850


#### Filter out shared words

The best paradelles often have no shared words between start lines

In [31]:
def no_common_words(pair):    
    s1 = set(adj_list_ids[pair[0]])
    s2 = set(adj_list_ids[pair[1]])
    return len(s1.intersection(s2)) == 0

In [32]:
# pairs = [
#     p for p in pairs
#     if no_common_words(p)
# ]

In [33]:
comma_print(len(pairs))

44,850


#### Find matches!!

In [34]:
from paradeller.analysis import find_matches, find_matches_for_start_pairs

In [35]:
all_valid = find_matches_for_start_pairs(pairs, adj_list_ids, adj_list_words)

HBox(children=(IntProgress(value=0, max=44850), HTML(value='')))




In [36]:
len(all_valid)

0

In [37]:
for pair, matches in all_valid.items():
    t1, t2 = pair
    stanza_start = [t1, t1, t2, t2]
    for match in matches:
        a, b = match
        stanza = stanza_start + [a, b]
        print("~"*50)
        for t in stanza:
            tweet = get_tweet(t)
            print(f"@{tweet['author']:20} {tweet['text']} ")

#### Parallel?

In [28]:
from math import sqrt
from joblib import Parallel, delayed

In [30]:
# matches = Parallel(n_jobs=2)(delayed(find_matches)(
#     p[0], p[1], adj_list_ids, adj_list_words
# ) for p in tqdm(pairs))

In [31]:
# %%timeit
# matches = Parallel(n_jobs=2)(delayed(find_matches)(
#     p[0], p[1], adj_list_ids, adj_list_words
# ) for p in pairs)

In [None]:
%%timeit
matches = [
    find_matches(p[0], p[1], adj_list_ids, adj_list_words)
    for p in pairs
]

In [156]:
pairs = combos

all_valid = {}
for p in pairs:
    valid = find_matches(p[0], p[1], adj_list_ids, adj_list_words)
    if valid:
        all_valid[p] = valid

In [83]:
# all_valid

#### Display results

In [84]:
for pair, matches in all_valid.items():
    t1, t2 = pair
    stanza_start = [t1, t1, t2, t2]
    for match in matches:
        a, b = match
        stanza = stanza_start + [a, b]
        print("~"*50)
        for t in stanza:
            tweet = get_tweet(t)
            print(f"@{tweet['author']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@unknown              where the cypress bow tis shady trees 
@unknown              where the cypress bow tis shady trees 
@unknown              beneath the bend and branches breeze 
@unknown              beneath the bend and branches breeze 
@unknown              tis the breeze beneath the cypress trees 
@unknown              where shady branches bend and bow 


### Including duplicates again

In [120]:
def display_stanza(ids):
    for t in ids:
        tweet = get_tweet(t)
        print(f"@{tweet['author']:20} {tweet['text']} ")

In [217]:
lines = [
    1144358993399353354,
    1144688724996952064,
    1144692637733138432,
    1144688724506284033
]

In [218]:
display_stanza(lines)

@BriannaS_            I’m tired asf 
@mamican_tarik        AAPL: 197.34 at Jun 28, 2019 7:27 PM 
@only_1diamond98      I’m tired asf 
@A7la7yati            AAPL: 197.34 at Jun 28, 2019 7:27 PM 


In [52]:
def exact_match(start_pair, found_pair):
    t1, t2 = start_pair
    ta, tb = found_pair
    return (
        (adj_list_ids[ta] == adj_list_ids[t1]) or
        (adj_list_ids[ta] == adj_list_ids[t2]))

In [53]:
def find_start_lines(t1, t2, exact_matches, adj_list_ids):
    
    words1 = adj_list_ids[t1]
    words2 = adj_list_ids[t2]
    
    for pair in exact_matches:
        a, b = pair
        if (words1 == adj_list_ids[a]) and (words2 == adj_list_ids[b]):
            return [t1, a, t2, b]
        if (words1 == adj_list_ids[b]) and (words2 == adj_list_ids[a]):
            return [t1, b, t2, a]
    
    return [t1, t1, t2, t2]

In [54]:
for pair, matches in all_valid.items():
    t1, t2 = pair
    
    words1 = adj_list_ids[t1]
    words2 = adj_list_ids[t2]
    
    exact_matches = [m for m in matches if exact_match(pair, m)]
    non_exact_matches = list(set(matches) - set(exact_matches))
    
    if non_exact_matches:
        stanza_start = find_start_lines(t1, t2, exact_matches, adj_list_ids)            
        stanza_end = [
            non_exact_matches[0][0],
            non_exact_matches[0][1]
        ]
        stanza = stanza_start + stanza_end
        print("~"*50)
        for t in stanza:
            tweet = get_tweet(t)
            print(f"@{tweet['author']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nicksparks94         I am..........so tired 
@nicksparks94         I am..........so tired 
@videotpes            my head hurts SO BAD 
@videotpes            my head hurts SO BAD 
@jaymaleeth           i am tired . 
@taylorrrrxoxo_       My head hurts so bad 


### Count # times other lines share a word

In [7]:
# get tokens for id
i = 0
item = next(x for x in data if x['id'] == i)
tokens = tokenize(item['text'])

tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [8]:
c = Counter()
for word in tokens:
    line_ids = [
        x for x in db[word]
        if x != item['id']
    ]
    c.update(line_ids)
    
c.most_common(4)

[(1, 9), (5, 6), (18, 6), (4, 4)]

In [9]:
ids = [x[0] for x in c.most_common(4)]
ids

[1, 5, 18, 4]

In [10]:
df.loc[ids]

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,sample1,meet me on the darkest sea of dead stars
5,5,sample1,sea of dead waves when my skin meet me stars
18,18,sample1,on the darkest sea of dead abstraction
4,4,sample1,i’ll remember the burn on the darkest


## Real Data

In [35]:
import pickle

In [22]:
a = load_archive()
print(f"{len(a):,}")

53,898


In [23]:
my_data = load_all(a)

100%|██████████| 53898/53898 [05:42<00:00, 157.29it/s]  


In [25]:
L = len(my_data)
print(f"{L:,}")

21,234,158


In [34]:
with open('test.pickle', "wb") as file:
    pickle.dump(my_data, file)