In [None]:
import pandas as pd
import numpy as np
import snowballstemmer, sys

In [None]:
sys.path.insert(0,"../../python/")
import rg17.text_cleaning as tc

In [None]:
import sys
from datawand.parametrization import ParamHelper
# Build the parameter helper from the TrendApproximation pipeline description;
# sys.argv is passed along to it as well.
# NOTE(review): how ParamHelper combines the JSON file with sys.argv is defined in datawand — confirm there.
ph = ParamHelper("../../pipelines/TrendApproximation.json", sys.argv)

In [None]:
# Look up every file path used further down from the parameter helper.
unstemmed_file_path = ph.get("unstemmed_tweet_file_path")  # "|"-separated tweets read before stemming
stemmed_file_path = ph.get("stemmed_tweet_file_path")  # where the stemmed tweets are written and reloaded from
stop_words_file_path = ph.get("stop_words_file_path")  # one stop word per line
player_file = ph.get("player_screen_names_file_path")  # player screen names (parsed in section 0)
player_names_file_path = ph.get("player_names_file_path")  # one player name part per line

# 0. Load name parts and accounts

In [None]:
# Build the list of player Twitter handles in "@screen_name" form.
# Every line is parsed by cutting off its first two and last two characters and
# splitting the remainder on '", "'.
# NOTE(review): this presumably expects lines shaped like ["name1", "name2"] — confirm against the file.
player_accounts = []
with open(player_file) as f:
    for line in f:
        # Skip empty names: a blank line (or an empty "[]" record) would otherwise
        # add a bogus "@" entry to the account list.
        player_accounts.extend(
            "@%s" % name for name in line.rstrip()[2:-2].split('", "') if name
        )

In [None]:
# Sanity check: number of player handles loaded.
len(player_accounts)

In [None]:
# Read the player name parts: one entry per line, trailing whitespace removed.
name_parts = []
with open(player_names_file_path) as names_file:
    name_parts.extend(raw_line.rstrip() for raw_line in names_file)

In [None]:
# Sanity check: number of name parts loaded.
len(name_parts)

# 1. Stemming English words

Stemming distorts the following groups of words:
   
   * names
   * cities
   * We're - we'r

## i.) Collect the list of fixed words

In [None]:
# Domain vocabulary handed to the custom stemmer as "fixed" words.
list_of_fixed_words = [
    "tennis",
    "RolandGarros",
    "paris",
    "during",
    "roland",
    "garros",
    "title",
    "coverage",
]

## ii.) Add name parts to fixed words

In [None]:
# Player name parts are treated as fixed words too (appended in place).
list_of_fixed_words.extend(name_parts)

## iii.) Stemming...

In [None]:
# Instantiate the project's stemmer with the fixed-word list and the "english" language setting.
# NOTE(review): presumably fixed words are exempt from stemming — confirm in rg17.text_cleaning.
my_stemmer = tc.CustomStemmer(list_of_fixed_words, "english")

#### examples

In [None]:
# Example: a sentence containing fixed words ("Tennis", "#Paris") next to ordinary words.
my_stemmer.stem_words("Tennis being my favourite sport in #Paris".split())

In [None]:
# Example: how the stemmer treats the word family around "win".
my_stemmer.stem_words("win winning wins won winner champion victory".split())

In [None]:
# Example: how the stemmer treats the word family around "lose".
my_stemmer.stem_words("loser defeated lose lost beaten".split())

#### real deal...

In [None]:
# Load the unstemmed tweets from the "|"-separated file.
tweets_with_text = pd.read_csv(unstemmed_file_path, sep="|")

In [None]:
def _stem_tweet_text(raw_text):
    """Split a tweet on whitespace, stem its tokens (remove_hashtag=True) and re-join with spaces."""
    tokens = raw_text.split()
    stemmed_tokens = my_stemmer.stem_words(tokens, remove_hashtag=True)
    return " ".join(stemmed_tokens)


# Replace every tweet text with its stemmed version.
tweets_with_text["text"] = tweets_with_text["text"].apply(_stem_tweet_text)

In [None]:
# Persist the stemmed tweets ("|"-separated, without the index column).
tweets_with_text.to_csv(stemmed_file_path, sep="|", index=False)

# Reloading stemmed file

In [None]:
# Reload the stemmed tweets from disk; the remaining steps work on this frame.
tweets_with_text = pd.read_csv(stemmed_file_path, sep="|")

# 2. Dropping stop-words

## a.) Clean text

# TODO: numbers in account names and hashtags currently disappear during cleaning — see `clean_text()`

In [None]:
# Run the project's clean_text helper on every tweet text.
# NOTE(review): the exact cleaning rules live in rg17.text_cleaning.clean_text — see there.
tweets_with_text["text"] = tweets_with_text["text"].apply(tc.clean_text)

## b.) Stop words

source: http://www.ranks.nl/stopwords

In [None]:
# Stop-word list: one entry per line of the stop-word file, trailing whitespace removed.
stop_words = []
with open(stop_words_file_path) as stop_words_file:
    stop_words.extend(raw_line.rstrip() for raw_line in stop_words_file)

# "the" is added on top of the words loaded from the file.
stop_words.append("the")
stop_words

## c.) Dropping other incredibly frequent words

#### Note: no further words should be excluded here — the relevance score should handle these frequent words properly.

stop_words += ["@rolandgarros", "frenchopen", "rolandgarros", "tennis", "#rg", "#frenchopen", "#rolandgarros", "#tennis"]
stop_words += ["somuchsweetromanticfunnyyummiaromaticsweetsexyeatbreakfastfullhealthbat"]

In [None]:
def remove_stop_words(text, excluded_words=None):
    """Return `text` without its stop words.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is a stop word. Surviving tokens keep their original casing and are
    re-joined with single spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    excluded_words : iterable of str, optional
        Stop words to remove. Defaults to the notebook-level `stop_words`
        list, so `Series.apply(remove_stop_words)` keeps working unchanged.

    Returns
    -------
    str
        The filtered text ("" if every token was removed).
    """
    if excluded_words is None:
        excluded_words = stop_words
    # Hash-based membership test instead of scanning a list for every token.
    if not isinstance(excluded_words, (set, frozenset)):
        excluded_words = frozenset(excluded_words)
    return " ".join(word for word in text.split() if word.lower() not in excluded_words)

In [None]:
# Drop the stop words from every tweet text.
tweets_with_text["text"] = tweets_with_text["text"].apply(remove_stop_words)

# 3. Calculate TF-IDF for daily tweets

In [None]:
# Interpret the "time" column as Unix epoch seconds and convert it to pandas timestamps.
tweets_with_text["date"] = pd.to_datetime(tweets_with_text["time"],unit="s")

In [None]:
# Bucket key: the timestamp's string form with its last six characters removed.
# NOTE(review): for a "YYYY-MM-DD HH:MM:SS" rendering this leaves "YYYY-MM-DD HH",
# i.e. hourly buckets, while the section headings speak of daily documents — confirm the intended granularity.
tweets_with_text["date_id"] = tweets_with_text["date"].apply(lambda x: str(x)[:-6])

In [None]:
# value_counts() orders by frequency (descending), so the last five entries
# are the date_ids with the fewest tweets.
tweets_with_text["date_id"].value_counts()[-5:]

## i.) Concatenate tweet messages related to the same day

### Modify text before groupby: it is needed for the concatenation of messages

In [None]:
# Coerce every text to str and prefix it with a space, so that texts concatenated
# per date_id in the group-by below remain separated by whitespace.
tweets_with_text["text"] = tweets_with_text["text"].apply(lambda x: " " + str(x))

In [None]:
# Concatenate all tweet texts sharing a date_id into one document per date_id.
# "".join builds each document in a single pass, whereas summing a Series of
# strings concatenates pairwise and re-copies the growing string for every tweet.
# The texts already start with a space (see the preceding cell), so words stay separated.
grouped_docs = (
    tweets_with_text.groupby("date_id")["text"]
    .apply(lambda texts: "".join(texts))
    .reset_index()
)

In [None]:
# Drop the row labelled 0, i.e. the first date_id group after reset_index().
# NOTE(review): presumably the first bucket is incomplete — confirm why it is excluded.
# Re-running this cell alone raises KeyError because label 0 is already gone.
grouped_docs = grouped_docs.drop(0,axis=0)

## ii.) Cleaning text with word size

In [None]:
# Extract the words of every document via the project helper, called with a limit of 2.
# NOTE(review): whether the limit is inclusive is defined in tc.get_words_above_size_limit — confirm.
grouped_docs["words"] = grouped_docs["text"].apply(lambda x: tc.get_words_above_size_limit(x,2))
# Number of extracted words per document.
grouped_docs["num_words"] = grouped_docs["words"].apply(len)

In [None]:
# Preview the first grouped documents.
grouped_docs.head()

In [None]:
# Vocabulary: the set of distinct words over all grouped documents.
all_words = set().union(*grouped_docs["words"])

In [None]:
# Vocabulary size.
len(all_words)

## Generate tweet text representations

In [None]:
from collections import Counter

# One dense count mapping per grouped document: every vocabulary word starts at 0.0
# and the words present in the document are overwritten with their occurrence counts.
vocabulary_size = len(all_words)
word_representations = []
for doc_words in grouped_docs["words"]:
    dense_counts = dict(zip(all_words, np.zeros(vocabulary_size)))
    dense_counts.update(Counter(doc_words))
    word_representations.append(dense_counts)

In [None]:
# Document-by-word count table: one row per grouped document, one column per word.
word_counts = pd.DataFrame(word_representations)

In [None]:
# Plain NumPy array of the counts. DataFrame.as_matrix() was removed in pandas 1.0;
# .values returns the same array and is available in old and new pandas alike.
word_counts_arr = word_counts.values

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF weighting of the raw counts; smooth_idf=False turns off the add-one
# smoothing of document frequencies.
transformer = TfidfTransformer(smooth_idf=False)

In [None]:
# Fit the IDF weights on the count matrix and transform it in one step.
tfidf_arr = transformer.fit_transform(word_counts_arr)

In [None]:
# Densify the TF-IDF result and label its columns with the words of the count table.
tfidf_dense = tfidf_arr.toarray()
tf_ids_df = pd.DataFrame(tfidf_dense, columns=word_counts.columns.tolist())

In [None]:
# Turn zero scores into NaN so that words absent from a document count as missing.
tf_ids_df = tf_ids_df.replace(to_replace=0, value=np.nan)

# Preview the TF-IDF table.
tf_ids_df.head()

In [None]:
# Sum each word's TF-IDF score over all documents (NaN entries are skipped by default).
sum_tf_idf = tf_ids_df.sum(axis=0)

In [None]:
# Highest summed score first.
sum_tf_idf = sum_tf_idf.sort_values(ascending=False)

In [None]:
# Split the scored words into player handles and everything else.
is_player_account = sum_tf_idf.index.isin(player_accounts)
player_tf_idfs = sum_tf_idf[is_player_account]
non_player_tf_idfs = sum_tf_idf[~is_player_account]

In [None]:
# Peek at the first ten player handles.
player_accounts[:10]

# Summed TF-IDF score per word, sorted descending.
sum_tf_idf

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.subplots(figsize=(20,5))
plt.subplot(1,2,1)
sum_tf_idf.hist(bins=100)
plt.subplot(1,2,2)
player_tf_idfs.hist(bins=100)
plt.show()

In [None]:
# Print the (up to) 100 highest-scoring non-player words with their scores.
# .ix was removed in pandas 1.0, and a fixed range(100) raised IndexError whenever
# fewer than 100 words were available; head(100) handles both.
top_non_player = non_player_tf_idfs.head(100)
for word, score in zip(top_non_player.index, top_non_player.values):
    print(word, score)

# Make final list of words + Export

In [None]:
def export_words(top, output_dir="/mnt/idms/fberes/network/roland_garros/data"):
    """Export the `top` highest-scoring non-player words plus all player accounts.

    The words are written one per line to
    `<output_dir>/rg17_<top>_important_en_words_plus_players.txt`, and the number
    of exported entries is printed.

    Parameters
    ----------
    top : int
        Number of leading entries taken from `non_player_tf_idfs`
        (which is sorted by descending summed TF-IDF score).
    output_dir : str, optional
        Target directory. Defaults to the location that used to be hard-coded,
        so existing calls write to the same file as before.
    """
    import os  # local import keeps the function self-contained within the notebook

    non_player_words = list(non_player_tf_idfs.head(top).index)
    # Distinct local name: `all_words` is already the vocabulary set at notebook level.
    exported_words = non_player_words + player_accounts
    print(len(exported_words))
    out_path = os.path.join(output_dir, "rg17_%i_important_en_words_plus_players.txt" % top)
    with open(out_path, 'w') as f_out:
        f_out.writelines("%s\n" % w for w in exported_words)

In [None]:
# Write one export file per vocabulary size.
for top in [5000,8000,10000]:
    export_words(top)