In [2]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.insert(0, '..')
import pickle

from tqdm.auto import tqdm

from paradeller.analysis import consolidate_stanzas
from paradeller.dataprep import load_and_prep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the already-processed dataset and adjacency lists from the cached pickle.
data, duplicates, adj_list_words, adj_list_ids = load_and_prep(use_pickle=True)

# TO REFRESH (re-process the raw data and overwrite the pickle):
# data, duplicates, adj_list_words, adj_list_ids = load_and_prep(use_pickle=False, update_pickle=True)

Loading real, processed data from pickle...
--------------------------------------------------
DONE

data            type: <class 'list'>	len: 212,381
duplicates      type: <class 'dict'>	len: 282,571
adj_list_words  type: <class 'dict'>	len: 19,314
adj_list_ids    type: <class 'dict'>	len: 212,381


In [3]:
def get_tweet(i, tweets=None):
    """Find tweet with given id.

    i      : id to look for; compared with ``==`` against each item's ``"id"``.
    tweets : optional iterable of tweet dicts to search. Defaults to the
             notebook-level ``data`` list as bound at call time.

    Returns the first matching tweet dict. When nothing matches, prints an
    error message and returns ``None`` - callers must handle that case.
    """
    if tweets is None:
        # Looked up at call time so a later re-load of `data` is honoured.
        tweets = data
    # Linear scan; the None default replaces the try/except StopIteration dance.
    match = next((x for x in tweets if x["id"] == i), None)
    if match is None:
        print("Error: No tweet with that ID")
    return match

def showlen(data):
    """Print the length of ``data`` with thousands separators, e.g. ``Length: 1,234``."""
    print(f"Length: {len(data):,}")

In [4]:
len(data)

212381

### Read Results

In [5]:
# Load `all_valid` from a pickle file written by an earlier run.
# NOTE: pickle.load can execute arbitrary code - only open files produced by this project.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# path - confirm the file name/location before re-running.
with open('../data/found_2019-07-09-00-32.pickle', 'rb') as f:
    all_valid = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../data/found_2019-07-09-00-32.pickle'

In [None]:
# all_valid = {}
# for item in all_valid_lst:
#     all_valid[item[0]] = item[1]

In [None]:
len(all_valid)

In [None]:
# NOTE(review): `get_stanzas` is neither imported nor defined in the visible part
# of this notebook, so this line raises NameError on a fresh kernel.
# `consolidate_stanzas` is imported from paradeller.analysis but never used -
# possibly the renamed function; confirm before swapping.
stanzas = get_stanzas(all_valid)
# stanzas_text = [[get_tweet(x)['text'] for x in stanza] for stanza in tqdm(stanzas)]

len(stanzas)

In [None]:
stanzas[0]

In [None]:
def stanza_sorter(stanza, adj_list=None, diff_weight=8, start_weight=20):
    """
    Sort by interesting-ness (higher score = more interesting).

    The score is the sum of three parts:
      * length   - total ``len(adj_list[i])`` over the distinct ids in the stanza
      * variance - size of the symmetric difference between the entries of the
                   first two lines, multiplied by ``diff_weight``
      * starts   - number of distinct first elements among the entries of the
                   first four lines, multiplied by ``start_weight``

    stanza       : indexable sequence of at least four ids present in ``adj_list``
    adj_list     : mapping id -> sequence (presumably the tweet's words - confirm);
                   defaults to the notebook-level ``adj_list_ids``
    diff_weight  : multiplier for the variance part (default 8)
    start_weight : multiplier for the starts part (default 20)

    Raises KeyError for an id missing from ``adj_list`` and IndexError when the
    stanza has fewer than four lines or a line's entry is empty.
    """
    if adj_list is None:
        # Fall back to the notebook global so `sorted(..., key=stanza_sorter)` keeps working.
        adj_list = adj_list_ids

    # --- points for length (each distinct id counted once) ---
    len_pts = sum(len(adj_list[i]) for i in set(stanza))

    # --- points for variance ---
    line_a, line_b, line_c, line_d = (adj_list[stanza[k]] for k in range(4))

    # diff b/w A and B: elements that appear in exactly one of the two lines
    diff_pts = len(set(line_a) ^ set(line_b))

    # points for different start words
    start_pts = len({line[0] for line in (line_a, line_b, line_c, line_d)})

    return len_pts + diff_pts * diff_weight + start_pts * start_weight

In [None]:
sorted_stanzas = sorted(stanzas, key=stanza_sorter, reverse=True)

In [None]:
# low
stanza_sorter(sorted_stanzas[-1])

In [None]:
# high
stanza_sorter(sorted_stanzas[0])

In [None]:
# Show the five highest-scoring stanzas.
view_stanzas = sorted_stanzas[:5]

for stanza in view_stanzas:
    print("~"*50)
    # Lines at index 0 and 1 are each printed twice, then index 2 and 3 once.
    for i in [0,0,1,1,2,3]:
        tweet = get_tweet(stanza[i])
        if tweet is None:
            # get_tweet already printed an error; skip rather than crash on None['author'].
            continue
        print(f"@{tweet['author']:20} {tweet['text']} ")

### Find Complete Poems

In [None]:
from itertools import combinations, chain
from math import factorial as fact

from paradeller.analysis import find_final_stanzas_from_stanzas

In [None]:
# NOTE(review): `get_stanzas` is not imported or defined in the visible part of
# this notebook - confirm where it comes from.
stanzas = get_stanzas(all_valid)

# TMP: keep only the first 50 stanzas (presumably so the triple search in the
# cells that follow stays small - remove for a full run)
stanzas = stanzas[:50]

len(stanzas)

In [None]:
len(stanzas)

In [None]:
# Number of ways to choose r stanzas out of n: n! / (r! * (n - r)!)
n = len(stanzas)
r = 3
# factorial() rejects negative arguments, so report 0 combinations when fewer than r stanzas exist.
num_combos = fact(n) // (fact(r) * fact(n - r)) if n >= r else 0
num_combos

In [None]:
stanzas

In [None]:
# Lazily enumerate every unordered triple of stanzas.
all_combos = combinations(stanzas, 3)
# Keep a triple only when the ids of its three stanzas union to exactly 12
# distinct values (i.e. no id is shared, assuming 4 ids per stanza - confirm).
combos = list(filter(lambda trio: len(set().union(*trio)) == 12, all_combos))
combos

In [None]:
found = find_final_stanzas_from_stanzas(stanzas, adj_list_ids, adj_list_words)

In [None]:
found

In [None]:
# For every entry of `found`, print the starting stanzas followed by the final stanzas.
for start_stanzas, end_stanzas in found.items():
    print("~"*50)
    for stanza in start_stanzas:
        # Lines at index 0 and 1 are each printed twice, then index 2 and 3 once.
        for i in [0,0,1,1,2,3]:
            tweet = get_tweet(stanza[i])
            if tweet is None:
                # get_tweet already printed an error; skip rather than crash on None['author'].
                continue
            print(f"@{tweet['author']:20} {tweet['text']} ")
        print("")
    for stanza in end_stanzas:
        for line in stanza:
            tweet = get_tweet(line)
            if tweet is None:
                continue
            print(f"@{tweet['author']:20} {tweet['text']} ")

In [None]:
stanzas[0]

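Number of combinations (unordered, without repetition):

- $n$ = number of items to choose from (here, the number of stanzas)
- $r$ = number of items chosen (here, 3 stanzas)

$$\binom{n}{r} = \frac{n!}{r!\,(n-r)!}$$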

In [None]:
# Number of ways to choose r stanzas out of n: n! / (r! * (n - r)!)
n = len(stanzas)
r = 3

# factorial() rejects negative arguments, so report 0 combinations when fewer than r stanzas exist.
num_combos = fact(n) // (fact(r) * fact(n - r)) if n >= r else 0
num_combos

In [None]:
all_combos = combinations(stanzas, 3)

In [None]:
# filtered generator: lazily yields only the triples whose stanzas' ids union
# to exactly 12 distinct values. Nothing is evaluated until `combos` is
# iterated, and it can be consumed only once (it drains `all_combos`).
combos = (
    c for c in all_combos
    if len(set().union(*c)) == 12
)

### Using Samples

In [None]:
from itertools import combinations

from paradeller.samples import load_samples
from paradeller.helper import (
    load_archive,
    save_to_pickle,
    read_from_pickle
)
from paradeller.dataprep import (
    tokenize,
    find_duplicates,
    filter_out_duplicates,
    filter_out_short,
    filter_out_oddballs,
    filter_out_oddballs_recursive,
    restructure_data,
    create_adj_list_by_word,
    create_adj_list_by_id
)
from paradeller.analysis import (
    find_matches,
    find_matches_for_start_pairs
)

In [None]:
# Build (or reload) the dataset and both adjacency lists, driven by the flags below.
# NOTE: this cell rebinds `data`, `duplicates`, `adj_list_words` and `adj_list_ids`,
# replacing whatever an earlier cell loaded into those names.
#######################
USE_PICKLE = False      # True: read everything from the pickle and skip the processing below
UPDATE_PICKLE = False   # True: write the freshly processed results back to the pickle
USE_SAMPLE = True       # True: start from load_samples() instead of load_archive()
#######################


if USE_PICKLE:
    print("Loading real, processed data from pickle...")
    data, duplicates, adj_list_words, adj_list_ids = read_from_pickle()
else:
    if USE_SAMPLE:
        print("Loading unprocessed sample data...")
        data = load_samples()
    else:
        print("Loading unprocessed real data...")
        data = load_archive()
    
    showlen(data)
    print("\nCleaning up data...")

    # remove too short
    print("> Remove too short")
    data = filter_out_short(data)
    showlen(data)
    
    # remove duplicate phrases
    print("> Remove duplicate phrases")
    duplicates = find_duplicates(data)
    data = filter_out_duplicates(data, duplicates)
    showlen(data)

    # remove oddballs (too few matches)
    print("> Recursively remove oddballs")
    data = filter_out_oddballs_recursive(data)
    showlen(data)

    print("\nCreating adjacency lists...")
    # make adj lists
    adj_list_words, adj_list_ids = restructure_data(data)
    
    if UPDATE_PICKLE:
        print("\nSaving new data to pickle...")
        save_to_pickle((data, duplicates, adj_list_words, adj_list_ids))
        
    
print("-"*50)
print("DONE\n")
# Summarise the type and size of each resulting object.
stuff = {
    "data": data, "duplicates": duplicates, "adj_list_words": adj_list_words, "adj_list_ids": adj_list_ids
}
for k, v in stuff.items():
    print(f"{k:15} type: {type(v)}\tlen: {len(v):,}")

In [None]:
def get_tweet(i, tweets=None):
    """Find tweet with given id.

    i      : id to look for; compared with ``==`` against each item's ``"id"``.
    tweets : optional iterable of tweet dicts to search. Defaults to the
             notebook-level ``data`` list as bound at call time.

    Returns the first matching tweet dict. When nothing matches, prints an
    error message and returns ``None`` - callers must handle that case.
    """
    if tweets is None:
        # Looked up at call time so the most recently loaded `data` is searched.
        tweets = data
    # Linear scan; the None default replaces the try/except StopIteration dance.
    match = next((x for x in tweets if x["id"] == i), None)
    if match is None:
        print("Error: No tweet with that ID")
    return match

def showlen(data):
    """Print the length of ``data`` with thousands separators, e.g. ``Length: 1,234``."""
    print(f"Length: {len(data):,}")

In [None]:
ids = list(adj_list_ids.keys())

In [None]:
pairs = list(combinations(ids, 2))

In [None]:
all_valid = find_matches_for_start_pairs(pairs, adj_list_ids, adj_list_words)

In [None]:
stanzas = get_stanzas(all_valid)