In [14]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.insert(0, '..')
import pickle

from tqdm.auto import tqdm

from paradeller.analysis import get_stanzas
from paradeller.dataprep import load_and_prep

# TMP
from itertools import combinations

from paradeller.samples import load_samples
from paradeller.helper import (
    load_archive,
    save_to_pickle,
    read_from_pickle
)
from paradeller.dataprep import (
    tokenize,
    find_duplicates,
    filter_out_duplicates,
    filter_out_short,
    filter_out_oddballs,
    filter_out_oddballs_recursive,
    restructure_data,
    create_adj_list_by_word,
    create_adj_list_by_id
)
from paradeller.analysis import (
    find_matches,
    find_matches_for_start_pairs
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# data, duplicates, adj_list_words, adj_list_ids = load_and_prep(use_pickle=True)

# TO REFRESH:
# data, duplicates, adj_list_words, adj_list_ids = load_and_prep(use_pickle=False, update_pickle=True)

In [3]:
def get_tweet(i):
    """Return the first tweet in the global `data` whose "id" equals `i`.

    Prints an error message and returns None when no tweet matches.
    """
    match = next((tweet for tweet in data if tweet["id"] == i), None)
    if match is None:
        print("Error: No tweet with that ID")
    return match

def showlen(data):
    """Print the number of items in `data`, formatted with thousands separators.

    Returns None (the value is only printed).
    """
    print(f"Length: {len(data):,}")

In [4]:
#######################
# Run configuration
USE_PICKLE = False      # True: read already-processed data via read_from_pickle()
UPDATE_PICKLE = False   # True: save freshly processed data (only used when USE_PICKLE is False)
USE_SAMPLE = True       # True: load_samples(); False: load_archive()
#######################


if not USE_PICKLE:
    # --- choose the unprocessed data source ---
    if USE_SAMPLE:
        print("Loading unprocessed sample data...")
        data = load_samples()
    else:
        print("Loading unprocessed real data...")
        data = load_archive()
    showlen(data)

    # --- cleaning pipeline: each step replaces `data` and reports the new size ---
    print("\nCleaning up data...")

    print("> Remove too short")
    data = filter_out_short(data)
    showlen(data)

    print("> Remove duplicate phrases")
    duplicates = find_duplicates(data)
    data = filter_out_duplicates(data, duplicates)
    showlen(data)

    print("> Recursively remove oddballs")
    data = filter_out_oddballs_recursive(data)
    showlen(data)

    # --- adjacency lists built from the cleaned data ---
    print("\nCreating adjacency lists...")
    adj_list_words, adj_list_ids = restructure_data(data)

    if UPDATE_PICKLE:
        print("\nSaving new data to pickle...")
        save_to_pickle((data, duplicates, adj_list_words, adj_list_ids))
else:
    print("Loading real, processed data from pickle...")
    data, duplicates, adj_list_words, adj_list_ids = read_from_pickle()


# --- summary of the four objects the rest of the notebook uses ---
print("-" * 50)
print("DONE\n")
stuff = dict(
    data=data,
    duplicates=duplicates,
    adj_list_words=adj_list_words,
    adj_list_ids=adj_list_ids,
)
for k, v in stuff.items():
    print(f"{k:15} type: {type(v)}\tlen: {len(v):,}")

Loading unprocessed sample data...
Length: 72

Cleaning up data...
> Remove too short


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Length: 72
> Remove duplicate phrases


HBox(children=(IntProgress(value=0, max=72), HTML(value='')))


Length: 53
> Recursively remove oddballs


HBox(children=(IntProgress(value=0, max=53), HTML(value='')))


7 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=46), HTML(value='')))


7 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=39), HTML(value='')))


13 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=26), HTML(value='')))


8 tweets removed. Running again.


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


Nothing removed. Done filtering.
Length: 18

Creating adjacency lists...


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


--------------------------------------------------
DONE

data            type: <class 'list'>	len: 18
duplicates      type: <class 'dict'>	len: 53
adj_list_words  type: <class 'dict'>	len: 33
adj_list_ids    type: <class 'dict'>	len: 18


In [5]:
# NOTE: this repeats the get_tweet definition from an earlier cell.
def get_tweet(i):
    """Look up a tweet by id.

    Scans the global `data` in order and returns the first entry whose "id"
    equals `i`; prints an error and returns None when there is none.
    """
    for candidate in data:
        if candidate["id"] == i:
            return candidate
    print("Error: No tweet with that ID")
    return None

### Read Results

In [12]:
ids = list(adj_list_ids.keys())

In [15]:
pairs = list(combinations(ids, 2))

In [22]:
all_valid = find_matches_for_start_pairs(pairs, adj_list_ids, adj_list_words)

HBox(children=(IntProgress(value=0, max=153), HTML(value='')))




In [24]:
stanzas = get_stanzas(all_valid)

In [25]:
stanzas

[(24, 26, 28, 29),
 (28, 29, 24, 26),
 (30, 32, 34, 35),
 (34, 35, 32, 30),
 (36, 38, 40, 41),
 (40, 41, 36, 38)]

In [6]:
# Replace `all_valid` with the object stored in this pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('../data/found_2019-07-09-00-32.pickle', 'rb') as f:
    all_valid = pickle.load(f)

In [5]:
# all_valid = {}
# for item in all_valid_lst:
#     all_valid[item[0]] = item[1]

In [6]:
len(all_valid)

2023

In [7]:
stanzas = get_stanzas(all_valid)
# stanzas_text = [[get_tweet(x)['text'] for x in stanza] for stanza in tqdm(stanzas)]

len(stanzas)

2023

In [26]:
stanzas[0]

(24, 26, 28, 29)

In [27]:
def stanza_sorter(stanza, diff_weight=8, start_weight=20):
    """
    Score a stanza by interesting-ness (higher means more interesting).

    Usable directly as a sort key: ``sorted(stanzas, key=stanza_sorter)``.
    Each id in `stanza` is looked up in the global ``adj_list_ids``.

    Parameters
    ----------
    stanza : sequence of at least four ids that are keys of ``adj_list_ids``
    diff_weight : points per element found in exactly one of the first two
        lines (default 8, the previously hard-coded value)
    start_weight : points per distinct first element across the four lines
        (default 20, the previously hard-coded value)

    Returns
    -------
    The total score: length points + weighted difference points
    + weighted start points.

    Raises
    ------
    KeyError if an id is missing from ``adj_list_ids``; IndexError if the
    stanza has fewer than four entries or a line is empty.
    """
    # --- points for length ---
    # Each distinct id counts once, even if it appears twice in the stanza.
    len_pts = sum(len(adj_list_ids[line_id]) for line_id in set(stanza))

    # --- points for variance ---
    line_a, line_b, line_c, line_d = (adj_list_ids[stanza[pos]] for pos in range(4))

    # diff b/w A and B: elements present in exactly one of the two lines
    diff_pts = len(set(line_a) ^ set(line_b))

    # points for different starts: distinct first elements of the four lines
    # (presumably the first word of each line -- confirm against adj_list_ids)
    start_pts = len({line[0] for line in (line_a, line_b, line_c, line_d)})

    return len_pts + diff_pts * diff_weight + start_pts * start_weight

In [28]:
sorted_stanzas = sorted(stanzas, key=stanza_sorter, reverse=True)

In [29]:
# low
stanza_sorter(sorted_stanzas[-1])

174

In [30]:
# high
stanza_sorter(sorted_stanzas[0])

210

In [31]:
# Show the first five stanzas of sorted_stanzas (sorted by score, highest first).
view_stanzas = sorted_stanzas[:5]

for stanza in view_stanzas:
    print("~"*50)
    #print(stanza_sorter(stanza))
    # Index pattern: lines 0 and 1 are printed twice each, then lines 2 and 3 once.
    for i in [0,0,1,1,2,3]:
        t = stanza[i]
        tweet = get_tweet(t)
        # NOTE(review): get_tweet returns None for an unknown id, which would
        # raise TypeError on the subscripts below.
        print(f"@{tweet['author']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@unknown              ink like stains of sap fold down 
@unknown              ink like stains of sap fold down 
@unknown              brown and dripping tears that keep 
@unknown              brown and dripping tears that keep 
@unknown              sap like ink and stains of brown 
@unknown              tears that fold keep dripping down 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@unknown              sap like ink and stains of brown 
@unknown              sap like ink and stains of brown 
@unknown              tears that fold keep dripping down 
@unknown              tears that fold keep dripping down 
@unknown              brown and dripping tears that keep 
@unknown              ink like stains of sap fold down 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@unknown              tis the breeze beneath the cypress trees 
@unknown              tis the breeze beneath the cypress trees 
@unknown              where shady b

### Find Complete Poems

In [52]:
from itertools import combinations, chain
from math import factorial as fact

from paradeller.analysis import find_final_stanzas_from_stanzas

In [53]:
stanzas = get_stanzas(all_valid)

# TMP: keep at most the first 50 stanzas (presumably to keep the number of
# 3-stanza combinations examined below manageable -- remove for a full run)
stanzas = stanzas[:50]

len(stanzas)

6

In [39]:
len(stanzas)

6

In [41]:
# Number of ways to choose r stanzas out of n: n! / (r! * (n - r)!)
n = len(stanzas)
r = 3
# NOTE(review): fact(n - r) raises ValueError when n < r (fewer than 3 stanzas).
num_combos = fact(n) // (fact(r) * fact(n - r))
num_combos

20

In [79]:
stanzas

[(24, 26, 28, 29),
 (28, 29, 24, 26),
 (30, 32, 34, 35),
 (34, 35, 32, 30),
 (36, 38, 40, 41),
 (40, 41, 36, 38)]

In [80]:
# Every unordered triple of stanzas (a one-shot iterator).
all_combos = combinations(stanzas, 3)
# Keep only triples whose union holds 12 distinct ids, i.e. three 4-id
# stanzas that share no id with each other.
combos = [c for c in all_combos if len(set().union(*c)) == 12]
combos

[((24, 26, 28, 29), (30, 32, 34, 35), (36, 38, 40, 41)),
 ((24, 26, 28, 29), (30, 32, 34, 35), (40, 41, 36, 38)),
 ((24, 26, 28, 29), (34, 35, 32, 30), (36, 38, 40, 41)),
 ((24, 26, 28, 29), (34, 35, 32, 30), (40, 41, 36, 38)),
 ((28, 29, 24, 26), (30, 32, 34, 35), (36, 38, 40, 41)),
 ((28, 29, 24, 26), (30, 32, 34, 35), (40, 41, 36, 38)),
 ((28, 29, 24, 26), (34, 35, 32, 30), (36, 38, 40, 41)),
 ((28, 29, 24, 26), (34, 35, 32, 30), (40, 41, 36, 38))]

In [None]:
from multiprocessing import Pool
import os
from paradeller.analysis import find_final_stanzas

In [81]:
def find_final_stanzas_helper(stanzas):
    """Run find_final_stanzas on one combination of stanzas.

    Single-argument wrapper so the call can be mapped over an iterable of
    combinations (e.g. with Pool.imap). The stanzas in `stanzas` are unpacked
    as the leading positional arguments, followed by the global
    `adj_list_ids` and `adj_list_words`.

    Returns whatever find_final_stanzas returns. Without the explicit return
    every mapped result would be None.
    """
    return find_final_stanzas(*stanzas, adj_list_ids, adj_list_words)

In [85]:
# Apply find_final_stanzas_helper to every combination using a pool of
# os.cpu_count() worker processes; tqdm shows progress and `res` collects the
# results in the order of `combos`.
with Pool(processes=os.cpu_count()) as pool:
    res = list(tqdm(pool.imap(find_final_stanzas_helper, combos), total=len(combos)))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

NameError: name 'find_final_stanzas' is not defined

In [54]:
found = find_final_stanzas_from_stanzas(stanzas, adj_list_ids, adj_list_words)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [45]:
found

{((24, 26, 28, 29),
  (30, 32, 34, 35),
  (36, 38, 40, 41)): [(42, 43, 44, 45, 46, 47)],
 ((24, 26, 28, 29),
  (30, 32, 34, 35),
  (40, 41, 36, 38)): [(42, 43, 44, 45, 46, 47)],
 ((24, 26, 28, 29),
  (34, 35, 32, 30),
  (36, 38, 40, 41)): [(42, 43, 44, 45, 46, 47)],
 ((24, 26, 28, 29),
  (34, 35, 32, 30),
  (40, 41, 36, 38)): [(42, 43, 44, 45, 46, 47)],
 ((28, 29, 24, 26),
  (30, 32, 34, 35),
  (36, 38, 40, 41)): [(42, 43, 44, 45, 46, 47)],
 ((28, 29, 24, 26),
  (30, 32, 34, 35),
  (40, 41, 36, 38)): [(42, 43, 44, 45, 46, 47)],
 ((28, 29, 24, 26),
  (34, 35, 32, 30),
  (36, 38, 40, 41)): [(42, 43, 44, 45, 46, 47)],
 ((28, 29, 24, 26),
  (34, 35, 32, 30),
  (40, 41, 36, 38)): [(42, 43, 44, 45, 46, 47)]}

In [51]:
# Print every entry of `found`: the starting stanzas (the dict key), then the
# final stanza(s) found for them (the dict value).
for start_stanzas, end_stanzas in found.items():
    print("~"*50)
    for stanza in start_stanzas:
        # Index pattern: lines 0 and 1 are printed twice each, then lines 2 and 3 once.
        for i in [0,0,1,1,2,3]:
            t = stanza[i]
            tweet = get_tweet(t)
            # NOTE(review): get_tweet returns None for an unknown id, which
            # would raise TypeError on the subscripts below.
            print(f"@{tweet['author']:20} {tweet['text']} ")
        print("")
    # Final stanzas are printed line by line, with no repetition.
    for stanza in end_stanzas:
        for line in stanza:
            tweet = get_tweet(line)
            print(f"@{tweet['author']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@unknown              tis the breeze beneath the cypress trees 
@unknown              tis the breeze beneath the cypress trees 
@unknown              where shady branches bend and bow 
@unknown              where shady branches bend and bow 
@unknown              beneath the bend and branches breeze 
@unknown              where the cypress bow tis shady trees 

@unknown              ink like stains of sap fold down 
@unknown              ink like stains of sap fold down 
@unknown              brown and dripping tears that keep 
@unknown              brown and dripping tears that keep 
@unknown              sap like ink and stains of brown 
@unknown              tears that fold keep dripping down 

@unknown              will such variegated colors blend 
@unknown              will such variegated colors blend 
@unknown              away within envelope of leaves 
@unknown              away within envelope of leaves 
@unknown            

In [81]:
stanzas[0]

(1146571689473523714,
 1146265241896009730,
 1146613407090483200,
 1146715924226908160)

Number of combos:

- $n$ = types to choose from
- $r$ = number chosen

$\frac{n!}{r!(n-r)!}$

In [82]:
# Evaluate the formula above for choosing r = 3 stanzas out of n.
n = len(stanzas)
r = 3

# NOTE(review): fact(n-r) raises ValueError when n < r (fewer than 3 stanzas).
num_combos = fact(n) // (fact(r) * fact(n-r))
num_combos

19600

In [86]:
all_combos = combinations(stanzas, 3)

In [87]:
# filtered generator: lazily yields only the triples from `all_combos` whose
# stanzas together contain 12 distinct ids (no id shared between stanzas).
# Both `all_combos` and this generator are one-shot iterators: once consumed
# they are empty, and len() is not available on them.
combos = (
    c for c in all_combos
    if len(set().union(*c)) == 12
)