In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.insert(0, '..')

from collections import defaultdict, Counter
import string
import pandas as pd

from paradeller.samples import sample1, sample2, sample3
from paradeller.analysis import tokenize
from paradeller.helper import load_archive, comma_print

## Load data

### From samples

In [3]:
from paradeller.samples import load_samples

In [4]:
sample_data = load_samples()

In [5]:
df = pd.DataFrame(sample_data).set_index('id')
df.head(6)

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,sample1,meet me on the darkest sea of dead stars
1,1,sample1,meet me on the darkest sea of dead stars
2,2,sample1,when the waves burn my skin i’ll remember
3,3,sample1,when the waves burn my skin i’ll remember
4,4,sample1,i’ll remember the burn on the darkest
5,5,sample1,sea of dead waves when my skin meet me stars


### From archive

In [6]:
data = load_archive()
comma_print(len(data))

81,461


#### Remove duplicates

In [7]:
# Drop repeated tweets: key every record by its 'id' so that a later record
# with the same id replaces the earlier one, then keep only the records.
d = {item['id']: item for item in data}
data = [*d.values()]

comma_print(len(data))

52,281


In [8]:
# data = sample_data

### Find tweets in data

In [9]:
def get_tweet(i):
    """Return the first record in the notebook-level ``data`` whose 'id' is ``i``.

    ``data`` is scanned front to back. When no record matches, an error
    message is printed and ``None`` is returned.
    """
    for tweet in data:
        if tweet['id'] == i:
            return tweet
    print("Error: No tweet with that ID")
    return None

In [10]:
get_tweet(data[10]['id'])

{'id': 1144358141972418560,
 'text': 'hi i love shawn with my heart',
 'author': 'ificanthvyou',
 'time': '2019-06-27 21:33:36'}

In [11]:
get_tweet(1)

Error: No tweet with that ID


## Do the algorithm

### Create Adj. Lists

In [12]:
from paradeller.analysis import (
    restructure_data,
)

In [13]:
adj_list_words, adj_list_ids = restructure_data(data)

In [14]:
comma_print(len(adj_list_words))
comma_print(len(adj_list_ids))

31,088
52,281


### Find matches

In [15]:
from itertools import combinations
import random
from tqdm import tqdm, trange

In [16]:
from paradeller.analysis import (
    find_maches
)

In [17]:
ids = list(adj_list_ids.keys())
comma_print(len(ids))

52,281


#### Filter down start tweets to choose from

##### 1) Remove duplicates

Temporarily remove tweets with duplicate word sets

In [18]:
from paradeller.analysis import get_nondup_ids

In [19]:
unique_ids = get_nondup_ids(adj_list_ids)
comma_print(len(unique_ids))

51,407


In [20]:
unique_data = [x for x in data if x['id'] in unique_ids]
comma_print(len(unique_data))

51,407


In [21]:
unique_adj_list_words, unique_adj_list_ids = restructure_data(unique_data)

comma_print(len(unique_adj_list_words))
comma_print(len(unique_adj_list_ids))

31,088
51,407


##### 2) Every word must appear in at least 3 lines

In [38]:
# Words that occur in fewer than 3 entries of the word adjacency list;
# these are the words considered too rare to keep.
filtered_adj_list_words = dict(
    (word, tweet_ids)
    for word, tweet_ids in unique_adj_list_words.items()
    if len(tweet_ids) < 3
)

comma_print(len(filtered_adj_list_words))

22,866


In [39]:
# Collect the id of every tweet that contains at least one too-rare word.
# A single union over the value lists is used instead of a
# `for word, ids in ...` loop: that loop rebound the notebook-level `ids`
# (the full id list built earlier) to the id list of the last rare word.
rm_ids = set().union(*filtered_adj_list_words.values())

comma_print(len(rm_ids))

18,944


##### Remove tweets that contain rare words from data

In [42]:
use_data = [x for x in data if x['id'] not in rm_ids]
comma_print(len(use_data))

33,337


Rebuild the adjacency lists from the filtered data, then get the unique ids again

In [43]:
use_words, use_ids = restructure_data(use_data)

comma_print(len(use_words))
comma_print(len(use_ids))

8,002
33,337


In [46]:
search_ids = list(get_nondup_ids(use_ids))
comma_print(len(search_ids))

32,511


#### Randomly choose start pairs

In [47]:
# Pick up to 100 *distinct* start tweets. random.sample draws without
# replacement; calling random.choice in a loop could return the same id more
# than once, which produced degenerate (id, id) pairs and repeated pairs in
# `combos`. min() keeps the cell working when fewer than 100 ids are available.
some_ids = random.sample(search_ids, min(100, len(search_ids)))

# Every unordered pair of the chosen start tweets.
combos = list(combinations(some_ids, 2))
comma_print(len(combos))

4,950


In [48]:
# Run the matcher on every start pair; keep only the pairs for which it
# returned something truthy, keyed by the (id, id) start pair.
all_valid = {}

for c in tqdm(combos):
    first_id, second_id = c
    valid = find_maches(first_id, second_id, adj_list_ids, adj_list_words)
    if not valid:
        continue
    all_valid[c] = valid

100%|██████████| 4950/4950 [01:38<00:00, 50.20it/s]


In [49]:
len(all_valid)

3

In [50]:
all_valid

{(1145545828775473157,
  1145737690693951489): [(1145537747341340672,
   1145741264639451138), (1145741264639451138, 1144700443462950914)],
 (1145545828775473157,
  1144371731840360448): [(1145534321442217986, 1144364114715664384)],
 (1145737690693951489,
  1144371731840360448): [(1145534321442217986, 1145741264639451138)]}

In [51]:
def display_stanza(ids):
    """Print one line per tweet id: the author handle followed by the text.

    Each id is resolved with ``get_tweet``. An id that cannot be resolved is
    skipped instead of raising ``TypeError`` on the ``None`` that
    ``get_tweet`` returns for unknown ids.

    Args:
        ids: iterable of tweet ids, printed in the given order.
    """
    for tweet_id in ids:
        tweet = get_tweet(tweet_id)
        if tweet is None:
            continue
        # Left-align the handle in a 20-character column so the texts line up.
        print(f"@{tweet['author']:20} {tweet['text']} ")

In [217]:
lines = [
    1144358993399353354,
    1144688724996952064,
    1144692637733138432,
    1144688724506284033
]

In [218]:
display_stanza(lines)

@BriannaS_            I’m tired asf 
@mamican_tarik        AAPL: 197.34 at Jun 28, 2019 7:27 PM 
@only_1diamond98      I’m tired asf 
@A7la7yati            AAPL: 197.34 at Jun 28, 2019 7:27 PM 


In [52]:
def exact_match(start_pair, found_pair):
    """Tell whether the first tweet of ``found_pair`` repeats a start tweet.

    Both pairs are looked up in the notebook-level ``adj_list_ids`` mapping.
    Returns True when the entry for ``found_pair[0]`` equals the entry for
    either member of ``start_pair``. The second member of ``found_pair`` is
    unpacked but never compared.
    """
    first_start, second_start = start_pair
    candidate, _ = found_pair
    if adj_list_ids[candidate] == adj_list_ids[first_start]:
        return True
    return adj_list_ids[candidate] == adj_list_ids[second_start]

In [53]:
def find_start_lines(t1, t2, exact_matches, adj_list_ids):
    """Build the four opening line ids for the start tweets ``t1`` and ``t2``.

    Walks ``exact_matches`` in order and uses the first pair whose two entries
    in ``adj_list_ids`` equal the entries of ``t1`` and ``t2`` (in either
    order) as the repeat lines. Returns ``[t1, repeat_of_t1, t2, repeat_of_t2]``.
    When no such pair exists, each start tweet is used as its own repeat.
    """
    first_words, second_words = adj_list_ids[t1], adj_list_ids[t2]

    for left, right in exact_matches:
        left_words = adj_list_ids[left]
        right_words = adj_list_ids[right]
        # Pair is already in (t1-like, t2-like) order.
        if first_words == left_words and second_words == right_words:
            return [t1, left, t2, right]
        # Pair is in the opposite order.
        if first_words == right_words and second_words == left_words:
            return [t1, right, t2, left]

    return [t1, t1, t2, t2]

In [54]:
# Assemble and print one six-line stanza for every start pair that has at
# least one match which is not a plain repeat of the start tweets.
for pair, matches in all_valid.items():
    t1, t2 = pair

    # Split the matches in one pass, keeping the order in which they were
    # found. The previous set difference made the choice of "first" non-exact
    # match arbitrary; a list keeps it the first one found.
    exact_matches = []
    non_exact_matches = []
    for m in matches:
        if exact_match(pair, m):
            exact_matches.append(m)
        else:
            non_exact_matches.append(m)

    if non_exact_matches:
        # Four opening lines: each start tweet followed by its repeat.
        stanza_start = find_start_lines(t1, t2, exact_matches, adj_list_ids)
        # Two closing lines: the first non-exact match.
        stanza_end = [
            non_exact_matches[0][0],
            non_exact_matches[0][1]
        ]
        stanza = stanza_start + stanza_end
        print("~"*50)
        # Reuse the shared printer instead of repeating its body here.
        display_stanza(stanza)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@nicksparks94         I am..........so tired 
@nicksparks94         I am..........so tired 
@videotpes            my head hurts SO BAD 
@videotpes            my head hurts SO BAD 
@jaymaleeth           i am tired . 
@taylorrrrxoxo_       My head hurts so bad 


In [37]:
# For every start pair, print each match whose first tweet differs from both
# start tweets: the two start texts, the two matched texts, then a separator.
# NOTE: t1, t2, words1 and words2 stay bound to the last pair after this loop;
# later cells call find_maches(t1, t2, ...) and compare against words1/words2.
for pair, matches in all_valid.items():
    t1, t2 = pair
    words1, words2 = adj_list_ids[t1], adj_list_ids[t2]

    for m in matches:
        a, b = m
        is_new_wording = (adj_list_ids[a] != words1) and (adj_list_ids[a] != words2)
        if not is_new_wording:
            continue
        for tweet_id in (t1, t2, a, b):
            print(get_tweet(tweet_id)['text'])
        print("~"*30)

Am i so hard to love ?
Am drunk 😤
i am dRunk
Am i so hard to love?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Am i so hard to love ?
God is good 🤞🏾
God is so good
Am i so hard to love?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I keep thinking today is Saturday
Am drunk 😤
i am dRunk
I keep thinking today is Saturday 😂😭
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Acid Rap is on Spotify now. 🙌🏻🙌🏻🙌🏻
it’s above me!
it’s above me now.
Acid Rap is on Spotify 🙌🏽🙌🏽🙌🏽
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Acid Rap is on Spotify now. 🙌🏻🙌🏻🙌🏻
it’s above me!
It’s above me now 🤷🏽‍♀️
Acid Rap is on Spotify 🙌🏽🙌🏽🙌🏽
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
God is good 🤞🏾
LETS GO USA ⚽️🇺🇸
lets go USA
God is good 🙏🏼❤️
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
God is good 🤞🏾
I am so hungry.
God is so good
i am hungry
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [64]:
# NOTE(review): t1 and t2 are not assigned in this cell; they are whatever the
# most recently executed `for pair, matches in all_valid.items()` loop left
# behind (its last start pair). This also overwrites that loop's `matches`.
matches = find_maches(t1, t2, adj_list_ids, adj_list_words)

In [65]:
# Print both texts of every match whose first tweet equals neither start tweet.
# NOTE: words1 and words2 are not assigned here; they come from an earlier loop.
for a, b in matches:
    words_a = adj_list_ids[a]
    if (words_a == words1) or (words_a == words2):
        continue
    for tweet_id in (a, b):
        print(get_tweet(tweet_id)['text'])
    print("~"*30)

In [None]:
(1144708038831550468, 1145534442682945537)

In [38]:
get_tweet(1144708038831550468)

{'id': 1144708038831550468,
 'text': 'Ovie 😍😘😘😘 #LoveIsland',
 'author': 'ehidodo',
 'time': '2019-06-28 20:43:58'}

In [39]:
get_tweet(1145534442682945537)

{'id': 1145534442682945537,
 'text': 'Hello July 👋🏽🖤.',
 'author': 'AObaid23',
 'time': '2019-07-01 03:27:48'}

In [40]:
1145542253177425920, 1144708101590962176

(1145542253177425920, 1144708101590962176)

In [41]:
get_tweet(1145542253177425920)

{'id': 1145542253177425920,
 'text': 'Hello July 💙',
 'author': 'ulll97',
 'time': '2019-07-01 03:58:50'}

In [42]:
get_tweet(1144708101590962176)

{'id': 1144708101590962176,
 'text': 'Ovie 😰 #loveisland',
 'author': 'GeorgiaNunnx',
 'time': '2019-06-28 20:44:13'}

In [27]:
# NOTE(review): get_master_word_set, ID1 and ID2 are not defined or imported in
# any cell visible above, so this cell fails on a fresh kernel. Presumably the
# helper comes from paradeller.analysis and ID1/ID2 were set in a since-deleted
# cell — confirm and restore both.
master_word_set = get_master_word_set(ID1, ID2, adj_list_ids)
# master_word_set

In [28]:
pot_ids = get_potential_tweets(ID1, ID2, adj_list_words, adj_list_ids)

In [17]:
pot_ids = filter_potential_tweets(
    pot_ids, adj_list_ids, master_word_set
)

In [18]:
len(pot_ids)

1

In [21]:
pot_ids

[1144677537576079362]

In [23]:
get_tweet(1144677537576079362)

{'id': 1144677537576079362,
 'text': 'I hate I hate I hate',
 'author': 'valeria_not',
 'time': '2019-06-28 18:42:46'}

In [19]:
valid = find_valid_matches(pot_ids, adj_list_ids, master_word_set)

In [20]:
valid

[]

### Count # times other lines share a word

In [7]:
# get tokens for id
# NOTE(review): the output shown for this cell is a line from the sample
# poems, so it was presumably run with `data = sample_data` (commented out in
# an earlier cell) — confirm. `next` has no default here, so an id that is not
# present in `data` raises StopIteration.
i = 0
item = next(x for x in data if x['id'] == i)
tokens = tokenize(item['text'])

tokens

['meet', 'me', 'on', 'the', 'darkest', 'sea', 'of', 'dead', 'stars']

In [8]:
# For each token of the chosen line, count the other line ids listed under
# that token, so a line scores one point per token it shares.
# NOTE(review): `db` is not defined in any visible cell; presumably a
# word -> line-ids mapping like the word adjacency list — confirm.
c = Counter()
for word in tokens:
    c.update(line_id for line_id in db[word] if line_id != item['id'])

c.most_common(4)

[(1, 9), (5, 6), (18, 6), (4, 4)]

In [9]:
ids = [x[0] for x in c.most_common(4)]
ids

[1, 5, 18, 4]

In [10]:
df.loc[ids]

Unnamed: 0_level_0,line,poem,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,sample1,meet me on the darkest sea of dead stars
5,5,sample1,sea of dead waves when my skin meet me stars
18,18,sample1,on the darkest sea of dead abstraction
4,4,sample1,i’ll remember the burn on the darkest


## Real Data

In [35]:
import pickle

In [22]:
a = load_archive()
print(f"{len(a):,}")

53,898


In [23]:
my_data = load_all(a)

100%|██████████| 53898/53898 [05:42<00:00, 157.29it/s]  


In [25]:
L = len(my_data)
print(f"{L:,}")

21,234,158


In [34]:
# Serialize my_data to 'test.pickle' (relative path, so it lands in the
# kernel's current working directory). The `with` block closes the file.
# NOTE(review): pickle files must only be loaded back from trusted sources.
with open('test.pickle', "wb") as file:
    pickle.dump(my_data, file)