In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.insert(0, '..')

from collections import defaultdict
from datetime import datetime
from itertools import combinations
import json
import os
from multiprocessing import Pool
from collections import Counter

from tqdm.auto import tqdm

from paradeller.helper import DATE_FMT, load_archive
from paradeller.dataprep import (
    load_and_prep,
    sort_ids_by_popularity,
    restructure_data,
    create_adj_list_by_id,
    create_adj_list_by_word,
    filter_out_oddballs_recursive,
)   
from paradeller.analysis import (
    consolidate_stanzas,
    find_matches,
    find_matches_for_start_pairs,
    get_num_combos
)
from paradeller.postprocess import (
    stanza_sorter_maker, print_stanzas, print_poems, print_stanza
)

In [3]:
def get_tweet(i):
    """Return the first tweet in the notebook-level `data` whose "id" equals `i`.

    Prints an error message and returns None when no tweet matches.
    """
    for tweet in data:
        if tweet["id"] == i:
            return tweet
    print("Error: No tweet with that ID")
    return None

def showlen(data):
    """Print the length of `data` formatted with thousands separators."""
    print(f"{len(data):,}")

In [5]:
# Fast path: load the already-processed data from the pickle instead of
# rebuilding it (the cell output reports "Loading processed data from pickle...").
data, duplicates, adj_list_words, adj_list_ids = load_and_prep(use_pickle=True, update_pickle=False)


Loading processed data from pickle...
--------------------------------------------------
DONE

data            type: <class 'list'>	len: 270,885
duplicates      type: <class 'dict'>	len: 5,365
adj_list_words  type: <class 'dict'>	len: 22,505
adj_list_ids    type: <class 'dict'>	len: 270,885


In [4]:
# Full rebuild: load the raw archive, rerun the cleaning passes and save a new pickle
# (see the cell output for the individual stages).
# NOTE(review): this cell sits below the pickle-loading cell but carries an earlier
# execution count; on Restart & Run All both run and this one rebinds the same names.
data, duplicates, adj_list_words, adj_list_ids = load_and_prep(use_pickle=False, update_pickle=True)


Loading raw data from archive.json...
Length: 394,026

Cleaning up data...
> Remove too short


HBox(children=(IntProgress(value=0, max=394026), HTML(value='')))


Length: 361,234
> Remove duplicate phrases


HBox(children=(IntProgress(value=0, max=361234), HTML(value='')))


Length: 349,960
> Recursively remove oddballs


HBox(children=(IntProgress(value=0, max=349960), HTML(value='')))


69,794 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=280166), HTML(value='')))


7,780 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=272386), HTML(value='')))


1,208 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=271178), HTML(value='')))


211 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=270967), HTML(value='')))


51 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=270916), HTML(value='')))


14 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=270902), HTML(value='')))


13 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=270889), HTML(value='')))


4 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=270885), HTML(value='')))


Nothing removed. Done filtering.
Length: 270,885

Creating adjacency lists...


HBox(children=(IntProgress(value=0, max=270885), HTML(value='')))



Restructing duplicates...

Saving new data to pickle...
--------------------------------------------------
DONE

data            type: <class 'list'>	len: 270,885
duplicates      type: <class 'dict'>	len: 5,365
adj_list_words  type: <class 'dict'>	len: 22,505
adj_list_ids    type: <class 'dict'>	len: 270,885


## More filtering??

### Word Count Brilliance

In [None]:
before = len(data)
# before = 254123
before

In [None]:
def can_be_completed(line, overall_wc):
    """Tell whether every word of `line` passes the corpus-wide count check.

    Args:
        line: key into the notebook-level `adj_list_ids`, whose value is the
            collection of words for that line.
        overall_wc: mapping from word to its total count (a Counter in this
            notebook).

    Returns:
        True when, for each distinct word of the line, the total count minus
        the line's own count is at least twice the line's own count.
    """
    line_counts = Counter(adj_list_ids[line])
    # Generator rather than a list so all() can stop at the first failing word.
    return all(
        overall_wc[word] - n >= n * 2
        for word, n in line_counts.items()
    )

In [None]:
def recusively_remove_losers(data, adj_list_ids):
    """Repeatedly drop tweets that fail the word-count check until none fail.

    Each pass tallies every word over the current `adj_list_ids`, then removes
    each id having a word whose total count minus the id's own count is less
    than twice the id's own count. Removals change the totals, so passes repeat
    until one removes nothing.

    The check is evaluated inline against the `adj_list_ids` argument. It does
    not go through the notebook-level `can_be_completed`, which reads the
    global adjacency list and is redefined further down with a different
    signature.

    Args:
        data: list of tweet dicts, each with an "id" key.
        adj_list_ids: mapping from tweet id to that tweet's collection of words.

    Returns:
        (data, adj_list_words, adj_list_ids): the surviving tweets and the
        adjacency lists rebuilt from them by `restructure_data`.
    """
    while True:
        showlen(data)

        # Corpus-wide word tally for the tweets still in play.
        overall_wc = Counter()
        for words in adj_list_ids.values():
            overall_wc.update(words)

        # Ids with at least one word that fails the count check.
        rm_ids = {
            id_
            for id_, words in adj_list_ids.items()
            if any(overall_wc[word] - n < n * 2 for word, n in Counter(words).items())
        }
        if not rm_ids:
            break

        print(f"Removing {len(rm_ids)} ids")
        data = [x for x in data if x["id"] not in rm_ids]
        adj_list_ids = create_adj_list_by_id(data)

    print("All done filtering! Updating adj lists")
    adj_list_words, adj_list_ids = restructure_data(data)
    return data, adj_list_words, adj_list_ids

In [None]:
data, adj_list_words, adj_list_ids  = recusively_remove_losers(data, adj_list_ids)

In [None]:
data = filter_out_oddballs_recursive(data)

In [None]:
after = len(data)
after

In [None]:
before - after

In [None]:
b_num = get_num_combos(before, 2)
print(f"{b_num:,}")

In [None]:
a_num = get_num_combos(after, 2)
print(f"{a_num:,}")

#### Search least popular

In [None]:
from statistics import mean

In [None]:
sorted_ids = sort_ids_by_popularity(adj_list_ids, adj_list_words)

some_ids = sorted_ids[:100]
pairs = list(combinations(some_ids, 2))

len(pairs)

In [None]:
ids = list(adj_list_ids.keys())

In [None]:
least_pop_ids = sorted_ids[-3000:]

In [None]:
pairs = list(combinations(least_pop_ids, 2))

In [None]:
len(pairs)

In [None]:
# ids = least_pop_ids
# pairs = combinations(ids, 2)

# num_combos = get_num_combos(len(ids), 2)
# print(f"{num_combos:,}")

In [None]:
l = least_pop_ids[-1]

In [None]:
len(ids)

In [None]:
# Pair the chosen line with every other id; a line paired with itself is skipped.
pairs = [(l, i) for i in ids if i != l]

In [None]:
len(pairs)

In [None]:
pairs[0]

In [None]:
# Tally how often each word occurs across every line's word list.
overall_wc = Counter()
for line_words in adj_list_ids.values():
    overall_wc.update(line_words)

In [None]:
def valid_pair(id1, id2):
    """Apply the word-count check to the combined words of two lines.

    Looks both ids up in the notebook-level `adj_list_ids`, counts the words of
    the two lines together, and returns True only when every word's total in
    the notebook-level `overall_wc`, minus the pair's own count, is at least
    twice the pair's own count.
    """
    tally = Counter(adj_list_ids[id1] + adj_list_ids[id2])
    verdicts = [
        overall_wc[word] - n >= n * 2
        for word, n in tally.items()
    ]
    return all(verdicts)

In [None]:
shorter = [p for p in tqdm(pairs) if valid_pair(*p)]

In [None]:
len(pairs) - len(shorter)

In [None]:
id1, id2 = pairs[0]

In [None]:
id1

In [None]:
get_tweet(id1)

In [None]:
stanza_words = adj_list_ids[id1] + adj_list_ids[id2]

In [None]:
counts = Counter(stanza_words)

In [None]:
counts

In [None]:
stanza_words

In [None]:
overall_wc['15']

In [None]:
[
    overall_wc[word] - counts[word]
    for word in counts
]

In [None]:
for i in adj_list_words['liat']:
    print(adj_list_ids[i])

In [None]:
[
    overall_wc[word] - counts[word] >= (counts[word] * 2)
    for word in counts
]

In [None]:
253690 - len(shorter)

In [None]:
for id1, id2 in pairs:
    # Rebuild the combined word list for this pair before counting it, so the
    # tally reflects id1/id2 rather than whatever stanza_words held before.
    stanza_words = adj_list_ids[id1] + adj_list_ids[id2]
    counts = Counter(stanza_words)

In [None]:
# sorted_ids = sort_ids_by_popularity(adj_list_ids, adj_list_words)

some_ids = sorted_ids[:100]
pairs = list(combinations(some_ids, 2))

len(pairs)

In [None]:
def find_matches_for_pair(p):
    """Run find_matches on the first two items of `p`.

    Single-argument wrapper (suitable for Pool.imap) that supplies the
    notebook-level `adj_list_ids` and `adj_list_words` as the remaining
    arguments.
    """
    line_a = p[0]
    line_b = p[1]
    return find_matches(line_a, line_b, adj_list_ids, adj_list_words)

In [None]:
# Spread the per-pair lookups over a pool sized to the CPU count; imap yields
# results in the same order as `pairs`.
with Pool(os.cpu_count()) as pool:
    lookups = pool.imap(find_matches_for_pair, pairs)
    res = list(tqdm(lookups, total=len(pairs)))

# Keep only the (pair, result) entries whose result is truthy, then merge them.
valid_stanzas = [entry for entry in zip(pairs, res) if entry[1]]
stanzas = consolidate_stanzas(valid_stanzas)

print(f"Found {len(stanzas)} results.")

In [None]:
# Group pairs under the sorted tuple of their combined words, so pairs made of
# exactly the same words land in the same bucket.
d = defaultdict(list)
# No explicit total: `pairs` is a list here, so tqdm sizes the bar from it, and
# `num_combos` is only assigned in a later section of the notebook.
for pair in tqdm(pairs):
    w1 = adj_list_ids[pair[0]]
    w2 = adj_list_ids[pair[1]]
    words = tuple(sorted(w1 + w2))
    d[words].append(pair)

In [None]:
found = {k:v for k, v in d.items() if len(v) > 1}
len(found)

In [None]:
valid_stanzas = []
for matching_pairs in tqdm(found.values()):
    # Every two pairs sharing a bucket are concatenated into one 4-id stanza.
    valid_stanzas.extend(
        first + second for first, second in combinations(matching_pairs, 2)
    )

print("valid_stanzas:", len(valid_stanzas))

In [None]:
keep_ids = set().union(*valid_stanzas)

In [None]:
len(keep_ids)

In [None]:
# remove_ids = set(least_pop_ids) - keep_ids

In [None]:
print_stanzas(valid_stanzas, data, n=3)

### Filter down found stanzas

Using word counts

In [None]:
# Corpus-wide tally of every word over all lines' word lists.
overall_wc = Counter()
for line_words in adj_list_ids.values():
    overall_wc.update(line_words)

In [None]:
def can_be_completed(stanza):
    """Apply the word-count check to the first two lines of a 4-id stanza.

    `stanza` must unpack into exactly four ids; only the first two are looked
    up in the notebook-level `adj_list_ids`. Returns True when, for every word
    of those two lines, the total in the notebook-level `overall_wc` minus
    twice the pair's count is at least the pair's count.

    NOTE(review): this reuses the name of the earlier two-argument
    `can_be_completed(line, overall_wc)` and shadows it once this cell runs.
    """
    first_id, second_id, _third, _fourth = stanza

    tally = Counter(adj_list_ids[first_id] + adj_list_ids[second_id])

    return all([
        overall_wc[word] - (n * 2) >= n
        for word, n in tally.items()
    ])

In [None]:
shorter = [x for x in valid_stanzas if can_be_completed(x)]

In [None]:
len(valid_stanzas)

In [None]:
elim = list(set(valid_stanzas) - set(shorter))

In [None]:
print_stanzas(elim, data)

In [None]:
overall_wc['toon']

In [None]:
40 - 15 - 15

In [None]:
# The corpus-wide tally in this notebook is `overall_wc`; `word_counts` is not
# assigned by any visible cell.
overall_wc['toon'] > 15 * 2

In [None]:
15 * 2

In [None]:
g = Counter()

In [None]:
# NOTE(review): no visible cell assigns `c` at top level, and `d` is the
# defaultdict of grouped pairs built earlier, so neither is a usable key into
# adj_list_ids here — presumably both were ids unpacked from a stanza; confirm.
g.update(adj_list_ids[c]+adj_list_ids[d])

In [None]:
g

In [None]:
# NOTE(review): `stanza` is not assigned by any visible cell above — presumably
# left over from a removed cell; confirm before relying on this cell.
stanza

In [None]:
print_stanza(stanza, data)

#### count words

## Alt Approach

In [None]:
ids = list(adj_list_ids.keys())
len(ids)

In [None]:
pairs = combinations(ids, 2)

num_combos = get_num_combos(len(ids), 2)
print(f"{num_combos:,}")

In [None]:
# Bucket every pair of ids under the sorted tuple of their combined words.
d = defaultdict(list)
for pair in tqdm(pairs, total=num_combos):
    key = tuple(sorted(adj_list_ids[pair[0]] + adj_list_ids[pair[1]]))
    d[key].append(pair)

### Find Stanzas

#### Old

In [None]:
def find_matches_for_pair(p):
    """Look up matches for the two ids at the front of `p` via find_matches.

    The notebook-level `adj_list_ids` and `adj_list_words` are passed along as
    the third and fourth arguments.
    """
    id_a, id_b = p[0], p[1]
    return find_matches(id_a, id_b, adj_list_ids, adj_list_words)

In [None]:
with Pool(os.cpu_count()) as pool:
    # `pairs` can be an itertools.combinations iterator here, which has no
    # len(); only hand tqdm a total when the container is sized.
    # NOTE(review): if `pairs` is an iterator that an earlier cell already
    # consumed, this yields no results — confirm which `pairs` is intended.
    total = len(pairs) if hasattr(pairs, "__len__") else None
    res = list(tqdm(pool.imap(find_matches_for_pair, pairs), total=total))

In [None]:
# Match each input pair with its result and keep the entries whose result is truthy.
# NOTE(review): zip() re-iterates `pairs`; this only lines up with `res` when
# `pairs` is a re-iterable sequence rather than a one-shot iterator — confirm.
valid_stanzas = [entry for entry in zip(pairs, res) if entry[1]]
stanzas = consolidate_stanzas(valid_stanzas)
print(f"Found {len(stanzas)} results.")

In [None]:
old_stanzas = stanzas

#### New

In [None]:
ids = list(adj_list_ids.keys())
# pairs = list(combinations(ids, 2))

# len(pairs)

In [None]:
d = defaultdict(list)
# Iterate a fresh combinations() over `ids`: the `pairs` iterator created in the
# earlier section is single-use and has already been consumed by the loops
# above, so reusing it here would group nothing.
for pair in tqdm(combinations(ids, 2), total=get_num_combos(len(ids), 2)):
    w1 = adj_list_ids[pair[0]]
    w2 = adj_list_ids[pair[1]]
    # Sorted tuple of the combined words: identical word multisets share a key.
    words = tuple(sorted(w1 + w2))
    d[words].append(pair)

In [None]:
found = {k:v for k, v in d.items() if len(v) > 1}
len(found)

In [None]:
# Concatenate every two pairs that share a bucket into one stanza tuple.
valid_stanzas = []
for matching_pairs in tqdm(found.values()):
    for left, right in combinations(matching_pairs, 2):
        valid_stanzas.append(left + right)

print("valid_stanzas:", len(valid_stanzas))

In [None]:
new_stanzas = valid_stanzas

In [None]:
len(old_stanzas)

In [None]:
len(new_stanzas)

In [None]:
print_stanzas(new_stanzas, data)

In [None]:
old_stanzas[0]

In [None]:
print_stanza(old_stanzas[0], data)

In [None]:
print_stanzas(old_stanzas, data)

In [None]:
# Peek at the first five bucket keys and the pairs grouped under each.
for k in list(found)[:5]:
    print(k, ":")
    print("  ", found[k])

In [None]:
print_stanzas(valid_stanzas, data)