In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the recoded player account names, one per line (trailing whitespace stripped).
player_file = "/mnt/idms/fberes/network/online_ranker/roland_garros_updated_schedule/recoded_player_accounts.txt"
with open(player_file) as accounts_in:
    player_accounts = [raw_line.rstrip() for raw_line in accounts_in]

In [None]:
# Number of player account lines read from the file.
len(player_accounts)

# Experimenting with TF-IDF

In [None]:
# Load the tweet table; the file uses "|" as the field separator.
tweets_with_text = pd.read_csv("/mnt/idms/fberes/network/roland_garros/data/rg17_tweets_eng.csv", sep="|")

In [None]:
# Preview the first rows of the loaded tweets.
tweets_with_text.head()

In [None]:
# Interpret the "time" column as seconds since the Unix epoch and store it as a datetime column.
tweets_with_text["date"] = pd.to_datetime(tweets_with_text["time"],unit="s")

In [None]:
# Bucket id: the timestamp's string form with its last six characters cut off.
# For a "YYYY-MM-DD HH:MM:SS" rendering this leaves "YYYY-MM-DD HH" (one bucket per hour).
# NOTE(review): timestamps that render differently (e.g. with fractional seconds, or NaT)
# produce other ids — confirm the "time" column holds whole seconds.
tweets_with_text["date_id"] = tweets_with_text["date"].apply(lambda x: str(x)[:-6])

In [None]:
# value_counts() orders by frequency (descending by default), so the last five
# entries are the date_id buckets with the fewest tweets.
tweets_with_text["date_id"].value_counts()[-5:]

#### Modify text before groupby

In [None]:
# Coerce every text to str and prefix it with a space, so that tweets stay
# separated when the texts of one date_id are concatenated in the groupby step.
# Not idempotent: re-running this cell adds another leading space.
tweets_with_text["text"] = tweets_with_text["text"].apply(lambda x: " " + str(x))

In [None]:
# Build one "document" per date_id by concatenating all of its tweet texts.
# "".join is used instead of Series.sum(): summing strings concatenates them
# pairwise, which is quadratic in the size of a bucket, while join is linear and
# yields the same string (every text is already a str at this point).
grouped_docs = tweets_with_text.groupby("date_id")["text"].apply(lambda texts: "".join(texts)).reset_index()

In [None]:
# Drop the row labelled 0, i.e. the first date_id group after reset_index().
# NOTE(review): presumably this bucket is a malformed/empty date_id — confirm.
# Not idempotent: once label 0 is gone, re-running this cell raises KeyError.
grouped_docs = grouped_docs.drop(0,axis=0)

In [None]:
import re
def clean_text(t):
    """Normalize raw text into a lowercase string of word-like pieces.

    Two passes are applied:
      1. keep only runs of word characters, commas and ``@``; everything else
         (punctuation, whitespace, ...) acts as a separator;
      2. split the result again on commas and digits, discarding both.

    Pieces are joined with a single space in each pass, so the returned string
    may contain consecutive, leading or trailing spaces.

    Parameters
    ----------
    t : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # Raw strings keep ``\w`` / ``\d`` from being parsed as (invalid) string
    # escapes; ``@`` and ``,`` need no escaping inside a character class.
    kept = ' '.join(re.findall(r"[\w,@]+", t))
    without_digits = ' '.join(re.findall(r"[^,\d]+", kept))
    return without_digits.lower()

def get_words(t, size_limit):
    """Return the space-separated tokens of ``t`` longer than ``size_limit``.

    ``t`` is split on single space characters, so consecutive spaces produce
    empty tokens; those are kept only if ``size_limit`` is negative.

    Parameters
    ----------
    t : str
        Text to tokenize.
    size_limit : int
        Tokens must have strictly more characters than this to be kept.

    Returns
    -------
    list of str
        Kept tokens, in their original order (duplicates preserved).
    """
    return [token for token in t.split(" ") if len(token) > size_limit]

In [None]:
# Cleaned, lowercased version of each date_id document.
grouped_docs["text_clean"] = grouped_docs["text"].apply(clean_text)

In [None]:
# Spot-check the first cleaned documents.
grouped_docs["text_clean"].head()

In [None]:
# Tokenize each cleaned document, keeping tokens with more than 2 characters,
# and record how many tokens each document ends up with.
grouped_docs["words"] = grouped_docs["text_clean"].apply(get_words, args=(2,))
grouped_docs["num_words"] = grouped_docs["words"].map(len)

In [None]:
# Vocabulary: the distinct tokens seen in any document.
all_words = set().union(*grouped_docs["words"])

In [None]:
# Vocabulary size (number of distinct tokens across all documents).
len(all_words)

## Generate tweet text representations

In [None]:
from collections import Counter

# One dict per document mapping every vocabulary word to its count in that
# document; words that do not occur keep the 0.0 from the template.
zero_counts = dict(zip(all_words, np.zeros(len(all_words))))
word_representations = []
for doc_words in grouped_docs["words"]:
    doc_counts = zero_counts.copy()
    doc_counts.update(Counter(doc_words))
    word_representations.append(doc_counts)

In [None]:
# Document-by-word count table: one row per document, one column per vocabulary word.
word_counts = pd.DataFrame(word_representations)

In [None]:
# Plain NumPy array of the counts (rows: documents, columns: words).
# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values returns the same array and works on old and new pandas alike.
word_counts_arr = word_counts.values

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF transformer with IDF smoothing switched off.
transformer = TfidfTransformer(smooth_idf=False)

In [None]:
# Fit the IDF weights on the count matrix and transform it in one step.
tfidf_arr = transformer.fit_transform(word_counts_arr)

In [None]:
# Densify the TF-IDF matrix and label its columns with the vocabulary words.
tf_ids_df = pd.DataFrame(
    tfidf_arr.toarray(),
    columns=word_counts.columns.tolist(),
)

In [None]:
# Mark zero scores as missing (NaN) instead of 0.
tf_ids_df = tf_ids_df.replace(to_replace=0, value=np.nan)

In [None]:
# Preview the TF-IDF table.
tf_ids_df.head()

### Dropping stop words

http://www.ranks.nl/stopwords

In [None]:
# Read the stop word list, one entry per line (trailing whitespace stripped).
with open("stop_words_en_long.txt") as stop_words_in:
    stop_words = [entry.rstrip() for entry in stop_words_in]
stop_words

In [None]:
# Stop words that actually occur as columns of the TF-IDF table.
tfidf_columns = tf_ids_df.columns
stop_words_intersection = [sw for sw in stop_words if sw in tfidf_columns]
len(stop_words_intersection)

In [None]:
# Remove the stop-word columns from the TF-IDF table.
# Not idempotent: re-running after the columns are gone raises KeyError.
tf_ids_df = tf_ids_df.drop(stop_words_intersection,axis=1)

In [None]:
# Per-word total: sum each column (word) over all rows (documents).
sum_tf_idf = tf_ids_df.sum(axis=0)

In [None]:
# Rank words by their summed TF-IDF, highest first.
sum_tf_idf = sum_tf_idf.sort_values(ascending=False)

In [None]:
# Split the ranked words into player accounts and everything else.
is_player = sum_tf_idf.index.isin(player_accounts)
player_tf_idfs = sum_tf_idf[is_player]
non_player_tf_idfs = sum_tf_idf[~is_player]

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Side-by-side histograms of the summed TF-IDF scores: all words (left) and
# player accounts only (right).
# The 1x2 grid is created in a single subplots() call. Calling
# plt.subplots(figsize=...) without a grid first creates a full-figure Axes that
# later plt.subplot(1, 2, k) calls merely overlap, leaving a stray empty axes
# behind on current matplotlib versions.
fig, (ax_all, ax_players) = plt.subplots(1, 2, figsize=(20, 5))
sum_tf_idf.hist(bins=100, ax=ax_all)
ax_all.set_title("Summed TF-IDF: all words")
player_tf_idfs.hist(bins=100, ax=ax_players)
ax_players.set_title("Summed TF-IDF: player accounts")
plt.show()

In [None]:
# Print the (up to) 100 highest-ranked non-player words with their scores.
# head(100) copes with fewer than 100 entries, and pairing index with values
# avoids the .ix indexer, which was removed in pandas 1.0.
top_non_players = non_player_tf_idfs.head(100)
for word, score in zip(top_non_players.index, top_non_players.values):
    print(word, score)

# Make final list of words

In [None]:
# Number of highest-ranked non-player words to keep; also embedded in the export file name.
top = 5000

In [None]:
# The `top` highest-ranked non-player words (sum_tf_idf is sorted in descending order).
non_player_words = list(non_player_tf_idfs.head(top).index)

In [None]:
# Final word list: selected non-player words followed by every player account.
# NOTE: this rebinds `all_words`, which earlier in the notebook was the vocabulary
# *set*; from here on it is a list.
all_words = non_player_words + player_accounts

In [None]:
# Size of the combined word list before exclusions.
len(all_words)

**The first 3-6 words should be eliminated:**

* @rolandgarros
* amp

**Should I delete these frequent words as well?**

* rolandgarros
* tennis
* french
* open
* day
* roland?
* garros?
In [None]:
# Words removed from the final list; the commented-out tail holds further candidates not currently excluded.
excluded_words = ["@rolandgarros","amp"]#,"frenchopen","rolandgarros","tennis","french","open"]

In [None]:
# Filter out the excluded words. Unlike list.remove(), this does not raise
# ValueError when a word is absent (e.g. when the cell is re-run), and it drops
# every occurrence rather than only the first one.
all_words = [w for w in all_words if w not in excluded_words]

In [None]:
# Size of the word list after removing the excluded words.
len(all_words)

### Export

In [None]:
# Write the final word list to disk, one word per line; the file name carries `top`.
export_path = "/mnt/idms/fberes/network/roland_garros/data/rg17_%i_important_en_words_plus_players.txt" % top
with open(export_path, 'w') as f_out:
    f_out.writelines("%s\n" % word for word in all_words)