In [1]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.insert(0, '..')
import pickle

from tqdm.auto import tqdm

from paradeller.analysis import get_stanzas
from paradeller.dataprep import load_and_prep

In [2]:
data, duplicates, adj_list_words, adj_list_ids = load_and_prep(use_pickle=True)

Loading real, processed data from pickle...
--------------------------------------------------
DONE

data            type: <class 'list'>	len: 197,754
duplicates      type: <class 'dict'>	len: 265,345
adj_list_words  type: <class 'dict'>	len: 18,491
adj_list_ids    type: <class 'dict'>	len: 197,754


In [3]:
def get_tweet(i, tweets=None):
    """Find the tweet with the given id.

    Parameters
    ----------
    i : int
        Tweet id, compared against each tweet's ``"id"`` key.
    tweets : iterable of dict, optional
        Tweets to search. Defaults to the notebook-level ``data`` list.

    Returns
    -------
    dict or None
        The first tweet whose ``"id"`` equals ``i``. When no tweet matches,
        an error message is printed and ``None`` is returned.
    """
    if tweets is None:
        tweets = data
    # Linear scan that stops at the first match; the default avoids having to
    # catch StopIteration.
    match = next((x for x in tweets if x["id"] == i), None)
    if match is None:
        print("Error: No tweet with that ID")
    return match

### Read Results

In [4]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load result files that were produced by this project.
with open('../data/found_2019-07-09-00-32.pickle', 'rb') as f:
    all_valid = pickle.load(f)

In [5]:
# all_valid = {}
# for item in all_valid_lst:
#     all_valid[item[0]] = item[1]

In [17]:
len(all_valid)

2023

In [18]:
stanzas = get_stanzas(all_valid)
# stanzas_text = [[get_tweet(x)['text'] for x in stanza] for stanza in tqdm(stanzas)]

len(stanzas)

2023

In [19]:
stanzas[0]

(1146571689473523714,
 1146265241896009730,
 1146613407090483200,
 1146715924226908160)

In [20]:
def stanza_sorter(stanza, diff_weight=8, start_weight=20, word_lists=None):
    """
    Score a stanza by interesting-ness (higher scores sort first when used
    as a ``sorted(..., reverse=True)`` key).

    The score is the sum of three parts:

    - length: total number of words over the distinct lines of the stanza
    - variance: number of words found in exactly one of the first two lines,
      multiplied by ``diff_weight``
    - start words: number of distinct first words among the four lines,
      multiplied by ``start_weight``

    Parameters
    ----------
    stanza : sequence
        Line ids; the first four entries are used for the variance and
        start-word parts.
    diff_weight : int, optional
        Multiplier for the variance part (default 8).
    start_weight : int, optional
        Multiplier for the start-word part (default 20).
    word_lists : mapping, optional
        Maps a line id to that line's sequence of words. Defaults to the
        notebook-level ``adj_list_ids``.

    Returns
    -------
    int
        The combined score.
    """
    if word_lists is None:
        word_lists = adj_list_ids

    # --- points for length ---
    # A line id that appears more than once in the stanza is counted once.
    len_pts = sum(len(word_lists[i]) for i in set(stanza))

    # --- points for variance ---
    lineA, lineB, lineC, lineD = (word_lists[stanza[k]] for k in range(4))

    # diff b/w A and B
    diff_pts = len(set(lineA) ^ set(lineB))

    # points for different start words; a line with no words has no start
    # word and is skipped instead of raising IndexError
    start_words = {line[0] for line in (lineA, lineB, lineC, lineD) if line}
    start_pts = len(start_words)

    return len_pts + diff_pts * diff_weight + start_pts * start_weight

In [21]:
# Rank stanzas from highest to lowest stanza_sorter score
sorted_stanzas = sorted(stanzas, key=stanza_sorter, reverse=True)

In [22]:
# low
stanza_sorter(sorted_stanzas[-1])

52

In [23]:
# high
stanza_sorter(sorted_stanzas[0])

134

In [24]:
view_stanzas = sorted_stanzas[:5]

for stanza in view_stanzas:
    print("~"*50)
    #print(stanza_sorter(stanza))
    # Line order: the first two lines are each printed twice, followed by the
    # third and fourth lines.
    for i in [0,0,1,1,2,3]:
        t = stanza[i]
        tweet = get_tweet(t)
        if tweet is None:
            # get_tweet has already printed an error for the missing id;
            # skip the line instead of failing on tweet['author'].
            continue
        print(f"@{tweet['author']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@jawbreakerbot        And I miss you. 
@jawbreakerbot        And I miss you. 
@BrianaTommo1         I just... oof 
@BrianaTommo1         I just... oof 
@deliicatebaby        oof i miss * 
@csszxx               just you and i 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@Taken_By_DaBest      I want it 
@Taken_By_DaBest      I want it 
@jennierubyjaene      And I love you so. 
@jennierubyjaene      And I love you so. 
@mizukinana777        So I love you 
@Zuzile_Zu            And I want it 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@csszxx               just you and i 
@csszxx               just you and i 
@exosaibot            i want to sleep ;-; 
@exosaibot            i want to sleep ;-; 
@catchmyfirehand      i just. want to sleep. 
@i4vxy                you and i 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@csszxx               just you and i 
@csszxx               just you and i 
@raquelarreguin6      I love 

### Find Complete Poems

In [25]:
from itertools import combinations, chain
from math import factorial as fact

In [134]:
stanzas = get_stanzas(all_valid)

# TMP: keep only the first 50 stanzas so the number of 3-stanza
# combinations examined below stays small
stanzas = stanzas[:50]

len(stanzas)

50

In [135]:
stanzas[0]

(1146571689473523714,
 1146265241896009730,
 1146613407090483200,
 1146715924226908160)

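Number of combinations (unordered selections without repetition):

- $n$ = number of stanzas to choose from
- $r$ = number of stanzas chosen per combination

$\binom{n}{r} = \frac{n!}{r!(n-r)!}$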

In [136]:
n = len(stanzas)
r = 3

# n! / (r! * (n-r)!), using integer division so the count stays exact.
# With fewer than r stanzas there are no combinations; guard explicitly because
# factorial() rejects the negative argument n - r.
num_combos = fact(n) // (fact(r) * fact(n-r)) if n >= r else 0
num_combos

19600

In [137]:
# Lazy iterator over every unordered group of r stanzas; uses the same r as
# the num_combos calculation so the count and the iterator cannot drift apart.
all_combos = combinations(stanzas, r)

In [138]:
# filtered generator
combos = (
    c for c in all_combos
    if len(set().union(*c)) == 12
)

In [139]:
stan1, stan2, stan3 = next(combos)

In [140]:
# First two line ids of each of the three stanzas
pair1, pair2, pair3 = stan1[:2], stan2[:2], stan3[:2]

In [141]:
pair1

(1146571689473523714, 1146265241896009730)

In [142]:
pair2

(1146094193594343424, 1144393117660930048)

In [143]:
pair3

(1145904412713525248, 1145546224239599616)

In [144]:
# Flatten the words of all six lines (two per stanza) into a single list,
# keeping line order and repeated words.
prev_stanza_words = [
    word
    for line in (*pair1, *pair2, *pair3)
    for word in adj_list_ids[line]
]

prev_stanza_words

['i',
 'think',
 'i',
 'do',
 'i',
 'am',
 'so',
 'i',
 'love',
 'my',
 'i',
 'miss',
 'you',
 'i',
 'need',
 'a',
 'i',
 'am',
 'a',
 'dog']

In [145]:
def get_potential_final_lines(stanza_words, adj_list_words):
    """Candidate lines for the final stanza.

    Looks up every word of ``stanza_words`` in ``adj_list_words`` and returns
    the union, as a set, of the ids listed under those words.
    """
    id_groups = (adj_list_words[word] for word in stanza_words)
    return set().union(*id_groups)

In [146]:
pot_ids = get_potential_final_lines(prev_stanza_words, adj_list_words)

In [147]:
len(pot_ids)

103725

In [149]:
# Candidates left after removing the ids already used by the three chosen stanzas
len(pot_ids - {*stan1, *stan2, *stan3})

103713