In [1]:
import os
# Move the working directory one level up (presumably from the notebook's own
# folder to the project root, so the relative "./data/..." paths resolve).
# The move is relative: running this cell twice climbs two levels.
os.chdir(os.pardir)

In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [4]:
import pandas as pd
import pickle as pkl
import networkx as nx
import numpy as np
import pickle

import fastText
import matplotlib.pyplot as plt
from functools import partial
import tqdm
import string

In [5]:
from tweet_recommendations.embeddings import word2vec, fast_text
from tweet_recommendations.other_methods.method import Method
from tweet_recommendations.data_processing.split_train_test import split_to_train_test_by_user

In [6]:
def load_udi_subsampled_by_users(udi_dataset, fraction=0.1, verbose=False,
                                 tweets_dir="./data/udi_dataset/UDI-TwitterCrawl-Aug2012-Tweets-Parsed/",
                                 random_state=None):
    """Keep only the tweets written by a random sample of users.

    Every file in ``tweets_dir`` is read as a CSV with an ``ID`` column; the
    part of the file name before the first dot is used as the username. A
    random ``fraction`` of those files is loaded and joined to ``udi_dataset``
    on ``ID``, which adds a ``username`` column and (inner join) drops every
    tweet whose ID is not listed in one of the sampled files.

    Parameters
    ----------
    udi_dataset : pd.DataFrame
        Tweets to subsample; must contain an ``ID`` column.
    fraction : float
        Share of the per-user files to load. The file count is rounded down.
    verbose : bool
        Show a progress bar and print progress messages.
    tweets_dir : str
        Directory holding the per-user files.
    random_state : int or None
        Seed for a private ``np.random.RandomState`` used to pick the files.
        ``None`` keeps the previous behaviour of shuffling with NumPy's
        global random state.

    Returns
    -------
    pd.DataFrame
        ``udi_dataset`` merged with the ``ID``/``username`` pairs.

    Raises
    ------
    ValueError
        If ``fraction`` selects no files at all.
    """
    # os.listdir returns entries in arbitrary order; sorting first makes the
    # shuffle depend only on the random state, not on the file system.
    user_files = sorted(os.listdir(tweets_dir))
    if random_state is None:
        np.random.shuffle(user_files)
    else:
        np.random.RandomState(random_state).shuffle(user_files)

    available = len(user_files)
    user_files = user_files[:int(fraction * available)]
    if not user_files:
        # pd.concat([]) would fail below with an unrelated-looking message.
        raise ValueError(
            "No user files selected: fraction={!r} of the {} files in {!r} "
            "rounds down to zero".format(fraction, available, tweets_dir))

    id_frames = []
    for file_name in tqdm.tqdm_notebook(user_files, disable=not verbose,
                                        desc="Loading user ids"):
        tweet_ids = pd.read_csv(os.path.join(tweets_dir, file_name), usecols=["ID"])
        tweet_ids["username"] = file_name.split(".")[0]
        id_frames.append(tweet_ids)

    usernames = pd.concat(id_frames, ignore_index=True)
    if verbose: print("Loaded tweets authors usernames")
    udi_dataset = udi_dataset.merge(usernames, on="ID")
    if verbose: print("Merged tweets authors usernames")
    return udi_dataset
    
    
def unify_columns_in_udi_dataset(udi_dataset, verbose=False):
    """Bring a UDI tweets frame to the column layout used by the experiments.

    Keeps only ``ID``, ``username``, ``RetCount``, ``Origin``, ``Text`` and
    ``Hashtags``, renames them to ``id``, ``username``, ``retweet_count``,
    ``text``, ``lemmas`` and ``hashtags``, and keeps the first row per ``id``.
    ``lemmas`` is split on whitespace into a list of tokens with punctuation
    removed; tokens that end up empty or purely numeric are dropped, and a
    value that cannot be split (e.g. missing text) becomes ``[]``.
    ``hashtags`` is parsed from its string form into a list of
    ``{"text": tag}`` dicts.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    udi_dataset : pd.DataFrame
        Frame containing at least the six source columns listed above.
    verbose : bool
        Print progress messages.

    Returns
    -------
    pd.DataFrame
        The cleaned frame.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    columns_to_leave = ["ID", "username", "RetCount",
                        "Origin", "Text", "Hashtags"]
    columns_to_drop = [column for column in udi_dataset.columns
                       if column not in columns_to_leave]
    # drop/rename/drop_duplicates without inplace=True: the caller's frame
    # must not lose columns or rows as a side effect.
    unified = (udi_dataset
               .drop(columns=columns_to_drop)
               .rename(columns={"ID": "id",
                                "RetCount": "retweet_count",
                                "Origin": "text",
                                "Text": "lemmas",
                                "Hashtags": "hashtags"}))
    if verbose: print("Filtered columns")
    unified = unified.drop_duplicates(["id"])
    if verbose: print("Dropped duplicates")

    if verbose: print("Removing punctuation and numbers...")
    # Built once instead of once per lemma.
    punctuation_remover = str.maketrans('', '', string.punctuation)

    def remove_punctuation(lemmas):
        # .str.split() yields a non-list (NaN/None) for missing text.
        if not isinstance(lemmas, list):
            return []
        stripped_lemmas = (lemma.translate(punctuation_remover).strip()
                           for lemma in lemmas)
        return [lemma for lemma in stripped_lemmas
                if lemma and not lemma.isdigit()]

    lemmas = unified["lemmas"].str.split().apply(remove_punctuation)
    if verbose: print("Converting hashtags...")
    # The hashtags come from a CSV file, so they are parsed with
    # ast.literal_eval (Python literals only) rather than eval, which would
    # execute any expression stored in the file.
    hashtags = unified["hashtags"].apply(
        lambda raw: [{"text": tag} for tag in ast.literal_eval(raw)])
    # assign() returns a new frame, so nothing is written into a slice.
    return unified.assign(lemmas=lemmas, hashtags=hashtags)


def load_udi_dataset(fraction=0.1, verbose=False):
    """Load the stemmed UDI tweets for a random sample of users.

    Reads ``./data/udi_processed/udi_stemmed.csv``, keeps the tweets of a
    random ``fraction`` of users, unifies the column layout, and returns only
    the tweets that have more than one lemma and at least one hashtag.
    ``verbose`` is forwarded to both processing steps.
    """
    unified = unify_columns_in_udi_dataset(
        load_udi_subsampled_by_users(
            pd.read_csv("./data/udi_processed/udi_stemmed.csv"),
            fraction,
            verbose,
        ),
        verbose,
    )
    has_several_lemmas = unified["lemmas"].str.len() > 1
    has_hashtags = unified["hashtags"].str.len() > 0
    return unified[has_several_lemmas & has_hashtags]


def load_our_dataset():
    """Load the pickled tweets-with-lemmas frame, restricted to the six
    columns used by the experiments (id, username, retweet_count, text,
    lemmas, hashtags)."""
    experiment_columns = ["id", "username", "retweet_count", "text", "lemmas", "hashtags"]
    return pd.read_pickle("data/source_data/original_tweets_with_lemmas.p")[experiment_columns]

In [7]:
def get_word_vectors(lemmas_list, fasttext_model):
    """Look up every lemma with ``fasttext_model.get_word_vector`` and stack
    the results with ``np.stack``, one row per lemma in input order."""
    vectors = []
    for lemma in lemmas_list:
        vectors.append(fasttext_model.get_word_vector(lemma))
    return np.stack(vectors)

In [8]:
def process_dataset(source_dataset, fasttext_model, verbose=False,
                    minimal_hashtag_occurrence=3):
    """Embed the tweets and split them into train and test sets.

    Steps: drop tweets without hashtags or without lemmas, drop tweets via
    ``Method.drop_tweets_with_hashtags_that_occurred_less_than``, add a
    ``word_embeddings`` column (one stacked array of word vectors per tweet)
    and an ``embedding`` column (mean of those vectors over the words), then
    split with ``split_to_train_test_by_user``.

    Parameters
    ----------
    source_dataset : pd.DataFrame
        Frame with list-valued ``hashtags`` and ``lemmas`` columns. It is not
        modified.
    fasttext_model
        Object exposing ``get_word_vector(word)``.
    verbose : bool
        Print progress messages.
    minimal_hashtag_occurrence : int
        Threshold forwarded to the hashtag-frequency filter (previously
        hard-coded to 3).

    Returns
    -------
    tuple
        ``(train_dataset, test_dataset)`` as returned by
        ``split_to_train_test_by_user``.
    """
    dataset = source_dataset[source_dataset["hashtags"].str.len() > 0]
    dataset = dataset[dataset["lemmas"].str.len() > 0]
    if verbose: print("Dropped empty tweets")
    if verbose: print("Filtering out hashtag below minimal frequency...")
    dataset = Method.drop_tweets_with_hashtags_that_occurred_less_than(
        dataset, minimal_hashtag_occurrence=minimal_hashtag_occurrence)
    if verbose: print("Getting word embeddings...")
    word_embeddings = dataset["lemmas"].apply(
        partial(get_word_vectors, fasttext_model=fasttext_model))
    # `dataset` derives from boolean-indexed slices of the caller's frame.
    # assign() always builds a new frame, so the new columns are never written
    # into a possible view of source_dataset.
    dataset = dataset.assign(word_embeddings=word_embeddings)
    if verbose: print("Calculating tweet embeddings...")
    # Average over axis 0 (the word axis of the stacked array).
    tweet_embeddings = word_embeddings.apply(lambda vectors: vectors.mean(axis=0))
    dataset = dataset.assign(embedding=tweet_embeddings)
    if verbose: print("Splitting to train & test")
    train_dataset, test_dataset = split_to_train_test_by_user(dataset)
    if verbose: print("Done!")
    return train_dataset, test_dataset

In [9]:
# load_udi_dataset picks the user files with np.random.shuffle, which draws
# from NumPy's global random state. Seeding it here makes that shuffle
# repeatable for a given directory listing, so the pickles written below can
# be regenerated.
UDI_SAMPLING_SEED = 42
np.random.seed(UDI_SAMPLING_SEED)

udi_dataset = load_udi_dataset(fraction=0.05, verbose=True)
eng_fasttext = fastText.load_model("/mnt/SAMSUNG/models/fasttext/english/cc.en.300.bin")
udi_train, udi_test = process_dataset(udi_dataset, eng_fasttext, verbose=True)
print(len(udi_train), len(udi_test))
# The model is no longer needed once the embeddings are computed.
del eng_fasttext

udi_train.to_pickle("./data/experiments_datasets/udi_train.pkl")
udi_test.to_pickle("./data/experiments_datasets/udi_test.pkl")

# Drop the frames before the next dataset is processed.
del udi_dataset, udi_train, udi_test

HBox(children=(IntProgress(value=0, description='Loading user ids', max=6693, style=ProgressStyle(description_…


Loaded tweets authors usernames
Merged tweets authors usernames
Filtered columns
Dropped duplicates
Removing punctuation and numbers...
Converting hashtags...
Dropped empty tweets
Filtering out hashtag below minimal frequency...
Getting word embeddings...
Calculating tweet embeddings...
Splitting to train & test


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  duplicates='drop').cat.codes


Done!
288223 120628


In [11]:
import gc
# Run a full collection after the `del` statements of the previous cell.
# The value displayed below is gc.collect()'s return: the number of
# unreachable objects it found.
gc.collect()

0

In [12]:
# Same pipeline as for the UDI data, run on our own tweets with the Polish
# fastText model; the train/test frames are pickled for the experiments.
pol_fasttext = fastText.load_model("/mnt/SAMSUNG/models/fasttext/polish/kgr10.plain.lemma.lower.skipgram.dim300.neg10.bin")
our_train, our_test = process_dataset(
    source_dataset=load_our_dataset(),
    fasttext_model=pol_fasttext,
    verbose=True,
)
print(*(len(split) for split in (our_train, our_test)))
# The model is only needed while the embeddings are computed.
del pol_fasttext
our_train.to_pickle("./data/experiments_datasets/our_train.pkl")
our_test.to_pickle("./data/experiments_datasets/our_test.pkl")

del our_train
del our_test

Dropped empty tweets
Filtering out hashtag below minimal frequency...
Getting word embeddings...
Calculating tweet embeddings...
Splitting to train & test


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  duplicates='drop').cat.codes


Done!
36699 11054
