# Preliminary data analysis

## Housekeeping

In [None]:
# # language-identification model (fastText lid.176): one-off download, kept commented out
# import requests
# ftm_url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
# r = requests.get(ftm_url, allow_redirects=True)
# open("../../data/lid.176.bin", "wb").write(r.content)

In [None]:

import pandas as pd
import matplotlib.pyplot as plt

# Load the augmented documents table and preview its first three rows.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
docs_path = "../../data/docs-augmented.pkl"
docs_augmented = pd.read_pickle(docs_path)
docs_augmented.head(n=3)

In [None]:
# Number of rows, followed by how many rows have a missing OutputText.
n_docs = len(docs_augmented)
n_missing_text = docs_augmented["OutputText"].isna().sum()
display(n_docs, n_missing_text)

In [None]:
# Number of rows per distinct Year value.
docs_augmented["Year"].value_counts()

In [None]:
# The 25 countries with the most non-null Participant entries, as a horizontal bar chart.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(8.5, 7))
country_counts = docs_augmented.groupby("Country")["Participant"].count()
country_counts.sort_values().tail(25).plot.barh(ax=ax)
ax.set_ylabel("country")
ax.set_xlabel("frequency");

In [None]:
# The 25 sectors with the most non-null Participant entries, as a horizontal bar chart.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 7))
sector_counts = docs_augmented.groupby("Sector")["Participant"].count()
sector_counts.sort_values().tail(25).plot.barh(ax=ax)
ax.set_ylabel("sector")
ax.set_xlabel("frequency");

In [None]:
# Average number of rows per participant (open question: concatenate each
# participant's documents?).
docs_per_participant = docs_augmented["Participant"].value_counts()
docs_per_participant.mean()

## Texts

In [None]:
# simple pre-proc
def purge_chars(string):
    """Normalise raw document text to a small lowercase character set.

    Steps, in order: lowercase the text; remove ``<...>`` tags; remove
    http(s)/ftp(s) URLs; replace every character other than ``a-z``, ``@``,
    space and ``' . , ? ! - :`` with a space; collapse whitespace runs into a
    single space.

    Parameters
    ----------
    string : str or None
        Raw text. Any non-string value (``None``, ``NaN``, ...) is treated as
        missing text instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    import re
    # Missing texts arrive as None/NaN, which have no .lower(); map them to empty text.
    if not isinstance(string, str):
        return ""
    cleaned = string.lower()
    cleaned = re.sub(r"<.*?>|</.*?>", "", cleaned)
    cleaned = re.sub(r"(s?)(f|ht)tp(s?)://\S+\b", "", cleaned)
    cleaned = re.sub(r"[^a-z@ '.,?!\-:]", " ", cleaned)
    return re.sub(r"\s+", " ", cleaned)

# Clean every document's text. Mapping over the single column avoids building
# a Series for each row, which a row-wise DataFrame.apply would do.
docs_augmented["OutputChars"] = docs_augmented["OutputText"].map(purge_chars)

In [None]:
# chars, no of tokens, no of sentences, languages
# Loaded fastText models keyed by file path, so a model is read from disk once
# per process instead of once per row.
_LID_MODELS = {}


def _load_lid_model(path):
    """Return the fastText model stored at ``path``, loading it on first use."""
    import fasttext as fs
    if path not in _LID_MODELS:
        _LID_MODELS[path] = fs.load_model(path)
    return _LID_MODELS[path]


def get_characteristics(df, model=None, model_path="../../data/lid.176.bin"):
    """Add text statistics and a language estimate to one row.

    Parameters
    ----------
    df : pandas.Series
        A single row (as passed by ``DataFrame.apply(..., axis=1)``) that has
        an ``OutputChars`` string entry.
    model : object, optional
        Language-identification model exposing ``predict(text, k=1)`` whose
        first return element is a sequence of labels. When omitted, the
        fastText model at ``model_path`` is loaded once and reused.
    model_path : str, optional
        Location of the fastText model used when ``model`` is not given.

    Returns
    -------
    pandas.Series
        A copy of the row with ``n_chars``, ``n_words``, ``n_sents`` and
        ``lang_est`` added. The input row is left untouched because
        ``DataFrame.apply`` does not support functions that mutate their
        argument.

    Notes
    -----
    Words are runs of ``\\w`` characters and sentences are non-blank pieces
    between ``.``, ``!`` and ``?``; empty pieces are not counted, so an empty
    text yields zero words and zero sentences.
    """
    import re
    string = df.OutputChars
    if model is None:
        model = _load_lid_model(model_path)

    out = df.copy()
    out["n_chars"] = len(string)
    out["n_words"] = len(re.findall(r"\w+", string))
    out["n_sents"] = sum(1 for piece in re.split("[.!?]", string) if piece.strip())

    labels = model.predict(string, k=1)[0]
    # Labels look like "__label__en"; keep the part after the last underscore.
    # Guard against an empty label sequence rather than raising IndexError.
    out["lang_est"] = labels[0].split("_")[-1] if len(labels) else None
    return out

# Run get_characteristics on every row and keep the returned rows.
docs_augmented = docs_augmented.apply(func=get_characteristics, axis="columns")

In [None]:
# The 25 most frequent estimated languages (non-null Participant entries per
# language), on a logarithmic x axis.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
lang_counts = docs_augmented.groupby("lang_est")["Participant"].count()
lang_counts.sort_values().tail(25).plot.barh(ax=ax, logx=True)
ax.set_ylabel("languages")
ax.set_xlabel("frequency");

In [None]:
# Keep only rows whose estimated language is English, then plot one histogram
# (log-scaled y axis) per length measure.
docs_augmented = docs_augmented[docs_augmented.lang_est == "en"]
f, axs = plt.subplots(1, 3, figsize=(15, 5))
# column name -> x-axis label ("no" = number of)
counts = {"n_chars": "no characters", "n_words": "no words", "n_sents": "no sentences"}
for (column, label), ax in zip(counts.items(), axs.flatten()):
    docs_augmented[column].plot(kind="hist", ax=ax, logy=True, rot=90)
    ax.set_ylabel("frequency")
    ax.set_xlabel(label);

In [None]:
#  the most frequent tokens
from sklearn.feature_extraction.text import CountVectorizer
# Count the 25 most frequent tokens (English stop words removed; tokens that
# appear in more than 95% of documents or in fewer than 2 are ignored).
tokvec = CountVectorizer(max_features=25, stop_words="english",
    max_df=0.95, min_df=2)
token_counts = tokvec.fit_transform(docs_augmented.OutputChars)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tokvec, "get_feature_names_out"):
    token_names = tokvec.get_feature_names_out()
else:
    token_names = tokvec.get_feature_names()

f, ax = plt.subplots(1, 1, figsize=(10, 10))
token_totals = pd.DataFrame(token_counts.toarray(), columns=token_names).sum()
token_totals.sort_values().plot(kind="barh", ax=ax)
ax.set_ylabel("tokens")
ax.set_xlabel("freq");

In [None]:
# 1,2,3-grams: total frequency of every n-gram, top 15 per n plotted side by side.
import numpy as np

gramvec = CountVectorizer(ngram_range=(1,3), stop_words="english",
    max_df=0.95, min_df=2)
gram_matrix = gramvec.fit_transform(docs_augmented.OutputChars)
# get_feature_names() was removed in scikit-learn 1.2; fall back on old releases.
if hasattr(gramvec, "get_feature_names_out"):
    gram_names = gramvec.get_feature_names_out()
else:
    gram_names = gramvec.get_feature_names()

# Sum the columns of the sparse matrix directly; converting the whole
# documents x n-grams matrix to a dense array first can exhaust memory.
gram_counts = pd.DataFrame({
    "ngram": gram_names,
    "frequency": np.asarray(gram_matrix.sum(axis=0)).ravel(),
})
# n = number of space-separated tokens in the n-gram
gram_counts["n"] = gram_counts.ngram.str.count(" ") + 1

f, axs = plt.subplots(1, 3, figsize=(25, 10))
# sorted() fixes the panel order to 1, 2, 3 regardless of vocabulary order
for ax, n in zip(axs, sorted(gram_counts.n.unique())):
    top_grams = gram_counts[gram_counts.n == n].sort_values("frequency").tail(15)
    # legend=False (a real boolean) so no legend is created in the first place
    top_grams.plot(y="frequency", x="ngram", kind="barh", ax=ax, legend=False)
    ax.set_title(str(n)+"-gram")
    ax.set_ylabel("")
    ax.set_xlabel("frequency")
f.tight_layout()

In [None]:
# upos
# Loaded spaCy pipelines keyed by model name, reused by later calls that can
# see this dict.
# NOTE(review): whether joblib worker processes keep this cache between tasks
# depends on how the function is shipped to them - confirm.
_SPACY_PIPELINES = {}


def get_upos(text, i, nlp=None):
    """Parse one document and return its tokens as a table.

    Parameters
    ----------
    text : str
        Document text to parse.
    i : hashable
        Identifier stored in the ``doc_id`` column of every returned row.
    nlp : callable, optional
        Pipeline that maps ``text`` to an iterable of tokens exposing
        ``text``, ``lemma_``, ``pos_``, ``tag_``, ``dep_``, ``shape_``,
        ``is_alpha`` and ``is_stop``. When omitted, spaCy's
        ``en_core_web_sm`` is loaded on first use and then reused instead of
        being reloaded on every call.

    Returns
    -------
    pandas.DataFrame
        One row per token with columns ``doc_id, text, lemma, pos, tag, dep,
        shape, is_alpha, is_stopword``; empty (columns only) when the
        pipeline yields no tokens.
    """
    if nlp is None:
        model_name = "en_core_web_sm"
        if model_name not in _SPACY_PIPELINES:
            import spacy
            _SPACY_PIPELINES[model_name] = spacy.load(model_name)
        nlp = _SPACY_PIPELINES[model_name]

    columns = ["doc_id", "text", "lemma", "pos", "tag",
               "dep", "shape", "is_alpha", "is_stopword"]
    rows = [
        (i, t.text, t.lemma_, t.pos_, t.tag_, t.dep_,
         t.shape_, t.is_alpha, t.is_stop)
        for t in nlp(text)
    ]
    return pd.DataFrame(rows, columns=columns)

from joblib import Parallel, delayed

# Parse every document in 4 parallel jobs. Iterating over (label, text) pairs
# gives each call exactly one text even when index labels repeat, where a
# .loc[label] lookup would return a Series.
upos_ls = Parallel(n_jobs=4)(
    delayed(get_upos)(text, doc_id)
    for doc_id, text in docs_augmented.OutputChars.items()
)
docs_upos = pd.concat(upos_ls)

# the five most common part-of-speech tags
docs_upos.pos.value_counts().sort_values().tail(5)

In [None]:
# Top 15 lemmas for nouns, verbs and adjectives, one horizontal bar chart each.
upos_subset = ["NOUN","VERB","ADJ"]
f, axs = plt.subplots(1, 3, figsize=(25, 10))
for ax, up in zip(axs, upos_subset):
    lemma_counts = docs_upos.loc[docs_upos.pos == up, "lemma"].value_counts()
    # legend=False (a real boolean): the string "false" is truthy and created
    # a legend that then had to be removed again.
    lemma_counts.sort_values().tail(15).plot(kind="barh", ax=ax, legend=False)
    ax.set_title(up)
    ax.set_xlabel("frequency")
f.tight_layout()

In [None]:
# rake - example
import spacy
from rake_spacy import Rake

def filter_tokens(token):
    """Predicate passed to Rake as ``stop_token_class``.

    Returns True only when all of the following hold: the token is a stop
    word, whitespace or punctuation; it is not number-like; its part of
    speech is not VERB, ADJ or NOUN; and its text is shorter than two
    characters.

    NOTE(review): because the length test is and-ed with the rest, tokens of
    two or more characters never match - confirm this is intended.
    """
    stop_like = any((token.is_stop, token.is_space, token.is_punct))
    content_pos = token.pos_ in ["VERB", "ADJ", "NOUN"]
    single_char = len(token.text) < 2
    return stop_like and not token.like_num and not content_pos and single_char

# Rake configured with the small English spaCy pipeline and phrase lengths 2-5.
rake_nlp = spacy.load("en_core_web_sm")
rake = Rake(nlp=rake_nlp, min_length=2, max_length=5,
    stop_token_class=filter_tokens)

# Take the first remaining document by position: once rows have been filtered
# (English only), the index label 0 is not guaranteed to exist any more.
example_text = docs_augmented.OutputChars.iloc[0]
rake.apply(example_text)[:10]

In [None]:
# co-occurrence
import numpy as np
# Rebuild each document from the lemmas of its nouns, verbs and adjectives only.
content_mask = docs_upos.pos.isin(["NOUN", "VERB", "ADJ"])
doc_reconstructed = docs_upos[content_mask].groupby("doc_id")["lemma"].agg(" ".join)

# Count adjacent lemma pairs (bigrams) over the reconstructed documents.
coovec = CountVectorizer(stop_words="english",
    ngram_range=(2,2), max_df=0.95, min_df=2)
bigram_matrix = coovec.fit_transform(doc_reconstructed)
# get_feature_names() was removed in scikit-learn 1.2; fall back on old releases.
if hasattr(coovec, "get_feature_names_out"):
    bigram_names = coovec.get_feature_names_out()
else:
    bigram_names = coovec.get_feature_names()

# Sum the sparse matrix column-wise; a dense copy of the full
# documents x bigrams matrix is not needed for the totals.
coo_counts = pd.DataFrame({
    "bigram": bigram_names,
    "frequency": np.asarray(bigram_matrix.sum(axis=0)).ravel(),
})
coo_counts.sort_values("frequency").tail(10)

In [None]:
# networkx
import networkx as nx
# Split every bigram into its two words: the endpoints of a graph edge.
bigram_words = coo_counts.bigram.str.split(" ")
coo_counts["from"] = bigram_words.str[0]
coo_counts["to"] = bigram_words.str[1]

# Graph of the 50 most frequent bigrams, with the frequency kept on each edge.
top_edges = coo_counts.sort_values("frequency").tail(50)
net = nx.from_pandas_edgelist(top_edges,
    source="from", target="to", edge_attr="frequency")

f, ax = plt.subplots(1, 1, figsize=(15, 7))
pos = nx.spring_layout(net, seed=1, iterations=15)
nx.draw_networkx_labels(net, pos, font_size=10,
    font_family="sans-serif", alpha=.9, ax=ax)
# Line width is proportional to the bigram frequency (factor 0.015).
edge_widths = [0.015 * freq for _, _, freq in net.edges(data="frequency")]
# ax=ax draws the edges on the same explicit axes as the labels instead of
# relying on whichever axes happens to be current.
nx.draw_networkx_edges(net, pos=pos, width=edge_widths, alpha=.25, ax=ax);

In [None]:
# lda topics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of top-weighted features per topic.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``components_``, one row of feature weights per topic.
    feature_names : sequence of str
        Feature name for each column of ``model.components_``.
    n_top_words : int
        Number of highest-weighted features shown per topic.
    title : str
        Figure title.

    The figure gets one panel per row of ``model.components_`` (6 inches wide
    each, i.e. 30 x 7 for five topics), so models with any number of topics
    can be plotted. The figure is shown with ``plt.show()``; nothing is
    returned.
    """
    n_topics = len(model.components_)
    # squeeze=False keeps a 2-D array of axes even for a single topic
    fig, axes = plt.subplots(1, n_topics, figsize=(6 * n_topics, 7),
                             sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, (ax, topic) in enumerate(zip(axes, model.components_)):
        # indices of the n_top_words largest weights, largest first
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 12})
        # largest weight at the top of each panel
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=12)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # the title does not depend on the topic, so set it once
    fig.suptitle(title, fontsize=14)
    fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

# TF-IDF features of the reconstructed documents (terms in at least 2 documents).
tfidfvec = TfidfVectorizer(min_df=2)
tfidf_counts = tfidfvec.fit_transform(doc_reconstructed)

# Five-topic LDA with online learning; random_state fixes the result.
lda = LatentDirichletAllocation(n_components=5, max_iter=10,
    learning_method='online', learning_offset=50., random_state=0)
lda.fit(tfidf_counts)

# get_feature_names() was removed in scikit-learn 1.2; fall back on old releases.
if hasattr(tfidfvec, "get_feature_names_out"):
    tfidf_names = tfidfvec.get_feature_names_out()
else:
    tfidf_names = tfidfvec.get_feature_names()
plot_top_words(lda, tfidf_names, 5, 'Topics in LDA model')