In [1]:
import os

# Run the notebook from the parent of the directory the kernel started in,
# because every path below is written relative to that parent ('./data/...').
# The start directory is remembered on first execution so that re-running this
# cell is a no-op instead of climbing one more level each time.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
import pandas as pd
import numpy as np
import pickle
from functools import partial
import tqdm

from tweet_recommendations.data_processing.data_loader import convert_hashtags_dicts_to_list
from tweet_recommendations.data_processing.split_train_test import split_by_user
from tweet_recommendations.graphs.graph_builder import (build_base_tweets_graph, 
                                                        add_tweet_embeddings_to_graph, 
                                                        calculate_hashtag_embeddings, 
                                                        calculate_edge_weights, 
                                                        calculate_pagerank, 
                                                        calculate_hashtag_popularity_mean_retweets_heuristic)

# Tweets

In [3]:
# Load the original tweets, drop those whose 'hashtags' entry is empty, then
# hand the remainder to the project helper that converts the hashtag dicts.
original_tweets = pd.read_pickle('./data/source_data/original_tweets.p')
has_hashtags = original_tweets['hashtags'].str.len() > 0
tweets_df = convert_hashtags_dicts_to_list(original_tweets[has_hashtags])

# Fasttext embeddings

In [4]:
# Read the fastText tweet embeddings and normalise the schema to the
# ('id', 'embedding') column names, with 'id' cast to int64.
fasttext_df = (
    pd.read_pickle('./data/embeddings/fasttext_embeddings.pkl')
    .rename(columns={'tweet_id': 'id', 'embeddings': 'embedding'})
    .astype({'id': np.int64})
)
fasttext_df.head(5)

Unnamed: 0,id,embedding
0,1055013541181353987,"[0.066088036, 0.06916399, -0.08772982, 0.02842..."
1,1052688633918963713,"[0.042748816, 0.018327478, -0.026713202, 0.038..."
2,1054102180486172673,"[0.04925826, 0.12695895, -0.08234913, 0.047574..."
3,1051723165829918720,"[0.07929088, 0.052764755, -0.102878, 0.0189335..."
4,1034736277084688384,"[0.05510524, 0.07667969, -0.09653922, 0.048123..."


# Skipgram embeddings

In [5]:
# Read the skip-gram tweet embeddings and normalise the schema to the
# ('id', 'embedding') column names, with 'id' cast to int64.
skipgram_df = (
    pd.read_pickle('./data/embeddings/skipgram_embeddings.pkl')
    .rename(columns={'tweet_id': 'id', 'tweet_embedding': 'embedding'})
    .astype({'id': np.int64})
)
skipgram_df.head(5)

Unnamed: 0,id,embedding
0,1055013541181353987,"[-0.02563778146635741, -0.43154129571281374, -..."
1,1052688633918963713,"[-0.06042357006420692, -0.36830542680053485, 0..."
2,1054102180486172673,"[-0.18836596173544726, -0.36910818586194954, -..."
3,1051723165829918720,"[-0.07193121433790241, -0.3486243937430637, -0..."
4,1034736277084688384,"[-0.21816051351587948, -0.3266038287703584, 0...."


# Filter Tweets
Keep only the tweets that have both a skip-gram and a fastText embedding.

In [6]:
# A tweet is kept only when its id appears in both embedding tables.
tweet_ids = tweets_df['id']
skip_ok = tweet_ids.isin(skipgram_df['id'])
fast_ok = tweet_ids.isin(fasttext_df['id'])
tweets_df = tweets_df.loc[skip_ok & fast_ok]

# Split train/val/test

In [7]:
# Reuse a previously saved split when all three files are present, so repeated
# runs work on the same train/val/test partition; otherwise create the split
# with split_by_user() and save it for the next run.
processed_dir = './data/processed/'
split_files = {
    'train': './data/processed/train_tweets.pkl',
    'val': './data/processed/val_tweets.pkl',
    'test': './data/processed/test_tweets.pkl',
}

# On a fresh checkout the cache directory may not exist yet; create it up front
# so that neither the existence check nor the later to_pickle() calls fail.
os.makedirs(processed_dir, exist_ok=True)

if all(os.path.isfile(path) for path in split_files.values()):
    print("Reading Train/Val/Test from disk.")
    splitted_tweets = {name: pd.read_pickle(path) for name, path in split_files.items()}
else:
    print("Creating new Train/Val/Test.")
    splitted_tweets = split_by_user(tweets_df)
    for name, path in split_files.items():
        splitted_tweets[name].to_pickle(path)

train_tweets = splitted_tweets['train']
val_tweets = splitted_tweets['val']
test_tweets = splitted_tweets['test']

Reading Train/Val/Test from disk.


# Build Graph

In [8]:
def build_graph(tweets_df, skipgram_df, fasttext_df):
    """Build the tweet graph for one data split and enrich it step by step.

    The steps run in this order: base graph construction, fastText tweet
    embeddings followed by fastText hashtag embeddings, the same two steps
    for skip-gram, edge weights for fastText and then skip-gram, pagerank,
    and finally the mean-retweets hashtag popularity heuristic.

    Parameters
    ----------
    tweets_df : tweets of the split, passed to ``build_base_tweets_graph``.
    skipgram_df : skip-gram embedding table, passed to
        ``add_tweet_embeddings_to_graph`` under the name ``'skipgram'``.
    fasttext_df : fastText embedding table, passed to
        ``add_tweet_embeddings_to_graph`` under the name ``'fasttext'``.

    Returns
    -------
    The graph object returned by the last builder step.
    """
    def progress(desc):
        # Progress-bar factory with a fixed label, handed to the builder helpers.
        return partial(tqdm.tqdm_notebook, desc=desc)

    graph = build_base_tweets_graph(tweets_df, progress('Base graph'))

    # Tweet embeddings are attached first, then hashtag embeddings are computed.
    graph = add_tweet_embeddings_to_graph(graph, fasttext_df, 'fasttext')
    graph = calculate_hashtag_embeddings(graph, 'fasttext', progress('Fasttext # embed.'))
    graph = add_tweet_embeddings_to_graph(graph, skipgram_df, 'skipgram')
    graph = calculate_hashtag_embeddings(graph, 'skipgram', progress('Skipgram # embed.'))

    # One edge-weight pass per embedding type; the two string arguments after
    # the embedding name are presumably the output attribute names (verify in
    # graph_builder).
    graph = calculate_edge_weights(
        graph, 'fasttext', 'fasttext_distance', 'fasttext_similarity',
        progress('Fasttext weights'),
    )
    graph = calculate_edge_weights(
        graph, 'skipgram', 'skipgram_distance', 'skipgram_similarity',
        progress('Skipgram weights'),
    )

    graph = calculate_pagerank(graph)
    graph = calculate_hashtag_popularity_mean_retweets_heuristic(graph, progress('# popularity'))
    return graph

In [9]:
# Build the graph for the training split and display its order and size.
train_G = build_graph(
    tweets_df=train_tweets,
    skipgram_df=skipgram_df,
    fasttext_df=fasttext_df,
)
train_G.order(), train_G.size()

HBox(children=(IntProgress(value=0, description='Base graph', max=36189, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Fasttext # embed.', max=49665, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='Skipgram # embed.', max=49665, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='Fasttext weights', max=82111, style=ProgressStyle(description…

  ang_dist = np.arccos(similarity) / np.pi





HBox(children=(IntProgress(value=0, description='Skipgram weights', max=82111, style=ProgressStyle(description…

  dist = 1.0 - uv / np.sqrt(uu * vv)





HBox(children=(IntProgress(value=0, description='# popularity', max=49665, style=ProgressStyle(description_wid…




(49665, 82111)

In [11]:
# Build the graph for the validation split and display its order and size.
val_G = build_graph(
    tweets_df=val_tweets,
    skipgram_df=skipgram_df,
    fasttext_df=fasttext_df,
)
val_G.order(), val_G.size()

HBox(children=(IntProgress(value=0, description='Base graph', max=7430, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Fasttext # embed.', max=11495, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='Skipgram # embed.', max=11495, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='Fasttext weights', max=18094, style=ProgressStyle(description…




HBox(children=(IntProgress(value=0, description='Skipgram weights', max=18094, style=ProgressStyle(description…

  dist = 1.0 - uv / np.sqrt(uu * vv)
  ang_dist = np.arccos(similarity) / np.pi





HBox(children=(IntProgress(value=0, description='# popularity', max=11495, style=ProgressStyle(description_wid…




(11495, 18094)

In [12]:
# Build the graph for the test split and display its order and size.
test_G = build_graph(
    tweets_df=test_tweets,
    skipgram_df=skipgram_df,
    fasttext_df=fasttext_df,
)
test_G.order(), test_G.size()

HBox(children=(IntProgress(value=0, description='Base graph', max=7450, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Fasttext # embed.', max=11400, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='Skipgram # embed.', max=11400, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='Fasttext weights', max=17371, style=ProgressStyle(description…




HBox(children=(IntProgress(value=0, description='Skipgram weights', max=17371, style=ProgressStyle(description…

  dist = 1.0 - uv / np.sqrt(uu * vv)
  ang_dist = np.arccos(similarity) / np.pi





HBox(children=(IntProgress(value=0, description='# popularity', max=11400, style=ProgressStyle(description_wid…




(11400, 17371)

In [13]:
# Save each split's graph as a pickle, in train/val/test order.
for graph, path in (
    (train_G, './data/graphs/train_graph.p'),
    (val_G, './data/graphs/val_graph.p'),
    (test_G, './data/graphs/test_graph.p'),
):
    with open(path, 'wb') as out_file:
        pickle.dump(graph, out_file)

In [16]:
# List the graphs directory to check that the three pickles were written.
!ls data/graphs/

fasttext_graphs  test_graph.p  train_graph.p  val_graph.p


In [8]:
# List the processed-data directory (cached splits and embedding files).
!ls data/processed/

embeddings.pkl	tagged.zip	 train_tweets.pkl      val_tweets.pkl
processed.pkl	test_tweets.pkl  tweet_embeddings.pkl


In [9]:
# NOTE(review): this copy writes data/embeddings/skipgram_embeddings.pkl, the
# file that the "Skipgram embeddings" cell near the top reads. If that file is
# not already present, this command has to run before that cell even though it
# sits at the end of the notebook, so confirm and move it (or document the
# prerequisite) to keep Restart & Run All working.
!cp data/processed/tweet_embeddings.pkl data/embeddings/skipgram_embeddings.pkl