In [18]:
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from dotenv import load_dotenv
from sklearn.feature_extraction.text import CountVectorizer
from tokenizer import tokenizer
from gensim.corpora import Dictionary
from gensim.corpora.bleicorpus import BleiCorpus

In [19]:
def cum_dist_plot(array, quantiles = [0.6, 0.8, 0.9, 0.95, 0.99], xlabel="", ylabel=""):
    """
    cummulative distribution over series values
    """
    # cum dist
    serie = pd.Series(array)
    serie_sorted = serie.sort_values(ascending=False)
    indexs = range(1, len(serie_sorted)+1)
    cum_dist = []

    cum = 0
    for value in serie_sorted:
        cum += value
        cum_dist.append(cum)

    cum_dist = pd.Series(cum_dist)
    
    #quantiles
    q_indexs = [] 
    q_values = [] 

    labels = [f"q{int(100*q)}" for q in quantiles] 
    for q in quantiles:
        cut = cum_dist.max()*q
        slice = cum_dist[cum_dist<cut]
        i, value = slice.index[-1], slice.values[-1]
        q_indexs.append(i)
        q_values.append(value)

    fig = go.Figure()
    fig.update_layout(template="plotly_white", margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
        xaxis={'automargin': True}, yaxis={'automargin': True})
    fig.add_trace(go.Scatter(
        showlegend = False,
        x= list(indexs),
        y= cum_dist.values 
    ))
    fig.add_trace(go.Scatter(
        showlegend = False,
        x= q_indexs,
        y= q_values,
        text=labels,
        textposition="bottom right",
        mode='markers+text',
        marker=dict(color='royalblue')
    ))
    fig.update_xaxes(title_text=xlabel, title_font = {"size": 20}, tickfont=dict(size=14))
    fig.update_yaxes(title_text=ylabel, title_font = {"size": 20}, tickfont=dict(size=14))
    fig.update_traces(textfont_size=14)
    fig.show()   
    return fig
        
def get_stats(array, quantiles = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99]):
    """
    Return  statistis over the distribution of the serie
    """
    serie = pd.Series(array)
    stats = pd.DataFrame(serie.describe()).T
    stats = stats[["count", "mean", "std", "min"]]
    for q in quantiles:
        qq = serie.quantile(q)
        col = f"q{int(q*100)}"
        stats[col] = qq
    stats["max"] = serie.max()
    return stats

# 1. Loading Raw Data

In [20]:
# load environment variables
load_dotenv()

True

In [21]:
# load data
df = pd.read_pickle(os.getenv("DATA"))

In [22]:
tf_vectorizer = CountVectorizer(analyzer='word', lowercase=False, tokenizer=lambda text: text.split())
tf_vectorizer.fit(df["text"])
vocabulary1 = tf_vectorizer.get_feature_names()
frequency1 = np.array(tf_vectorizer.transform(df["text"]).sum(axis=0)).ravel()
corpus1 = [doc.split() for doc in df["text"]]

voc_size1 = len(vocabulary1)
corpus_size1 = sum([1 for doc in corpus1 if len(doc)>0])
tokens_size1 = sum([len(doc) for doc in corpus1])

print(f"corpus: {corpus_size1}")
print(f"vocabulary: {voc_size1}")
print(f"tokens: {tokens_size1}")

corpus: 49015
vocabulary: 93203
tokens: 2030980


In [23]:
get_stats(frequency1, quantiles = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99])

Unnamed: 0,count,mean,std,min,q10,q20,q25,q30,q40,q50,q60,q70,q75,q80,q90,q95,q99,max
0,93203.0,21.790929,596.109864,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,8.0,21.0,193.0,69388


In [24]:
fig1 = cum_dist_plot(array=frequency1, xlabel="Vocabulario", ylabel="Frecuencia acumulada" )

In [25]:
fig1.write_image("../../tesis/img/ch3/cum_dist_1.eps", scale=2)

# 2. Char elimination
To lowercase, elimination of symbols of accents, punctuation, mails, tokens with numbers and words with less than 4 chars.

In [26]:
tf_vectorizer = CountVectorizer(analyzer='word', lowercase=False, tokenizer=lambda text: tokenizer(text))
tf_vectorizer.fit(df["text"])
vocabulary2 = tf_vectorizer.get_feature_names()
frequency2 = np.array(tf_vectorizer.transform(df["text"]).sum(axis=0)).ravel()

corpus2 = [tokenizer(doc) for doc in df["text"]]

voc_size2 = len(vocabulary2)
corpus_size2 = sum([1 for doc in corpus2 if len(doc)>0])
tokens_size2 = sum([len(doc) for doc in corpus2])

print(f"corpus: {corpus_size2 }")
print(f"vocabulary: {voc_size2}")
print(f"tokens: {tokens_size2}")

corpus: 49003
vocabulary: 42921
tokens: 1028412


In [27]:
get_stats(frequency2, quantiles = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99])

Unnamed: 0,count,mean,std,min,q10,q20,q25,q30,q40,q50,q60,q70,q75,q80,q90,q95,q99,max
0,42921.0,23.960579,371.64457,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,4.0,13.0,40.0,338.8,47995


In [28]:
fig2 = cum_dist_plot(array=frequency2, xlabel="Vocabulario", ylabel="Frecuencia acumulada" )

In [29]:
fig2.write_image("../../tesis/img/ch3/cum_dist_2.eps", scale=2)

The 50% of words happens one time and the 80% have a frequency lower or equal to 4. The 95% of the cummulative distribution can be explain with 7837 words (9%). In conclusion, the distribution of tokens have a long tail.

# 3. Frequency

In [30]:
vocabulary3 = set()
epochs = df["epoch"].unique()
for epoch in epochs:

    # docs from actual epoch
    docs = df[df["epoch"] == epoch]["text"]

    # processing docs from actual epoch
    corpus = [tokenizer(doc) for doc in docs]
    
    # map each word to an id {id->word}
    dictionary = Dictionary(corpus)

    # remove words with higher and lower frequency into the corpus
    lb = int(float(os.getenv("NO_BELOW"))*len(corpus))
    ub = int(float(os.getenv("NO_ABOVE"))*len(corpus))
    dictionary.filter_extremes(no_below=lb, no_above=ub)
  
    vocabulary3 = vocabulary3.union(dictionary.token2id.keys())

In [31]:
tokenizer_args = {"vocabulary": vocabulary3}

tf_vectorizer = CountVectorizer(analyzer='word', lowercase=False, tokenizer=lambda text: tokenizer(text, **tokenizer_args))
tf_vectorizer.fit(df["text"])
vocabulary3 = tf_vectorizer.get_feature_names()
frequency3 = np.array(tf_vectorizer.transform(df["text"]).sum(axis=0)).ravel()
corpus3 = [tokenizer(doc, **tokenizer_args) for doc in df["text"]]

voc_size3 = len(vocabulary3)
corpus_size3 = sum([1 for doc in corpus3 if len(doc)>0])
tokens_size3 = sum([len(doc) for doc in corpus3])

print(f"corpus: {corpus_size3}")
print(f"vocabulary: {voc_size3}")
print(f"tokens: {tokens_size3}")

corpus: 48988
vocabulary: 3148
tokens: 925693


In [32]:
get_stats(frequency3, quantiles = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99])

Unnamed: 0,count,mean,std,min,q10,q20,q25,q30,q40,q50,q60,q70,q75,q80,q90,q95,q99,max
0,3148.0,294.057497,1343.428877,7.0,24.0,31.0,34.0,38.1,49.0,64.0,89.0,131.0,169.0,222.0,497.6,995.2,4071.07,47995


In [33]:
fig3 = cum_dist_plot(array=frequency3, xlabel="Vocabulario", ylabel="Frecuencia acumulada" )

In [34]:
fig3.write_image("../../tesis/img/ch3/cum_dist_3.eps", scale=2)

# 4. Word filtering

- vocabulary filter: remove word that are not in vocabulary of embeddings
- stopwords elimination: nltk spanish stopwords + contextual stopwords 

## Vocabulary

Vocabulary from Spanish Unannotated Corpora.

https://github.com/dccuchile/spanish-word-embeddings#fasttext-embeddings-from-sbwc

In [35]:
# load vocabulary of embeddings
with open(os.getenv("VOCABULARY"), "r") as f:
    vocabulary = [line.strip() for line in f]

In [36]:
vocabulary4 = set(vocabulary3).intersection(vocabulary)

In [37]:
tokenizer_args = {"vocabulary": vocabulary4}

tf_vectorizer = CountVectorizer(analyzer='word', lowercase=False, tokenizer=lambda text: tokenizer(text, **tokenizer_args))
tf_vectorizer.fit(df["text"])
vocabulary4 = tf_vectorizer.get_feature_names()
frequency4 = np.array(tf_vectorizer.transform(df["text"]).sum(axis=0)).ravel()
corpus4 = [tokenizer(doc, **tokenizer_args) for doc in df["text"]]

voc_size4 = len(vocabulary4)
corpus_size4 = sum([1 for doc in corpus4 if len(doc)>0])
tokens_size4 = sum([len(doc) for doc in corpus4])

print(f"corpus: {corpus_size4}")
print(f"vocabulary: {voc_size4}")
print(f"tokens: {tokens_size4}")

corpus: 48988
vocabulary: 2902
tokens: 901745


In [38]:
get_stats(frequency4, quantiles = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99])

Unnamed: 0,count,mean,std,min,q10,q20,q25,q30,q40,q50,q60,q70,q75,q80,q90,q95,q99,max
0,2902.0,310.732254,1394.851251,8.0,25.0,33.0,36.0,40.0,51.0,69.0,95.0,140.0,182.0,235.0,526.5,1044.95,4325.71,47995


In [39]:
fig4 = cum_dist_plot(array=frequency4, xlabel="Vocabulario", ylabel="Frecuencia acumulada" )

In [40]:
fig4.write_image("../../tesis/img/ch3/cum_dist_4.eps", scale=2)

## Stopwords



In [41]:
# load stopwords: 321 stopwords (nltk) + 929 contextual stopwords -> 1250 (set)
with open(os.getenv("STOPWORDS"), "r") as f:
    stopwords = [line.strip() for line in f]

In [42]:
tokenizer_args = {"vocabulary": vocabulary4, "stopwords": stopwords}

tf_vectorizer = CountVectorizer(analyzer='word', lowercase=False, tokenizer=lambda text: tokenizer(text, **tokenizer_args))
tf_vectorizer.fit(df["text"])
vocabulary5 = tf_vectorizer.get_feature_names()
frequency5 = np.array(tf_vectorizer.transform(df["text"]).sum(axis=0)).ravel()
corpus5 = [tokenizer(doc, **tokenizer_args) for doc in df["text"]]

voc_size5 = len(vocabulary5)
corpus_size5 = sum([1 for doc in corpus5 if len(doc)>0])
tokens_size5 = sum([len(doc) for doc in corpus5])

print(f"corpus: {corpus_size5}")
print(f"vocabulary: {voc_size5}")
print(f"tokens: {tokens_size5}")

corpus: 48566
vocabulary: 1960
tokens: 495182


In [43]:
get_stats(frequency5, quantiles = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99])

Unnamed: 0,count,mean,std,min,q10,q20,q25,q30,q40,q50,q60,q70,q75,q80,q90,q95,q99,max
0,1960.0,252.643878,906.508103,8.0,26.0,33.0,37.0,41.0,52.6,71.0,96.4,139.0,176.0,227.2,475.5,940.95,2922.82,26092


In [44]:
fig5 = cum_dist_plot(array=frequency5, xlabel="Vocabulario", ylabel="Frecuencia acumulada" )

In [45]:
fig5.write_image("../../tesis/img/ch3/cum_dist_5.eps", scale=2)

# Processing Completed

In [46]:
corpus_path = f'{os.getenv("CORPUS")}/'
dict_files = sorted([file for file in os.listdir(corpus_path) if ".dict" in file])
corpus_files = sorted([file for file in os.listdir(corpus_path) if ".mm" in file and ".index" not in file and ".vocab" not in file])

In [47]:
data = {}
for epoch in range(0,6):
    # load dictionary {word->id}
    dict_path = f"{corpus_path}{dict_files[epoch]}"
    dictionary = Dictionary.load(dict_path)
    # load corpus
    corpus_file = f"{corpus_path}{corpus_files[epoch]}"
    corpus = BleiCorpus(corpus_file)
    # save data in a dict
    data[epoch] = {"dict": dictionary, "corpus": corpus}
    
vocabulary = set()
for epoch in range(0,6):
    vocabulary = vocabulary.union(data[epoch]["dict"].token2id.keys())

In [48]:
corpus_size6 =  sum([len(data[epoch]["corpus"]) for epoch in range(0,6)])
vocab_size6 = len(vocabulary)
tokens_size6 = int(sum([freq for epoch in range(0,6) for doc in data[epoch]["corpus"] for (id, freq) in doc]))
print(f"corpus: {corpus_size6}")
print(f"vocabulary: {vocab_size6}")
print(f"tokens: {tokens_size6}")

corpus: 48555
vocabulary: 1960
tokens: 480605


There is a small difference with the previous statistics, because some words would have a lower occurrence at one epoch and therefore it was eliminated, however, at another epoch it could have a higher occurrence, thus implying an overestimation of the number of tokens per document.

In [49]:
frequency = {}
for epoch in epochs:
    id2token = {value: key for key, value in data[epoch]["dict"].token2id.items()}
    for doc in data[epoch]["corpus"]:
        for (id, freq) in doc:
            token = id2token[id]
            if token not in frequency.keys():
                frequency[token] = freq
            else:
                frequency[token] += freq
frequency = np.array(list(frequency.values()))

In [50]:
get_stats(frequency, quantiles = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 0.99])

Unnamed: 0,count,mean,std,min,q10,q20,q25,q30,q40,q50,q60,q70,q75,q80,q90,q95,q99,max
0,1960.0,245.206633,908.260838,7.0,9.0,16.0,20.0,26.0,42.0,64.5,94.0,138.3,175.25,227.0,475.5,940.95,2922.82,26092.0


In [51]:
cum_dist_plot(array=frequency, xlabel="Vocabulario", ylabel="Frecuencia acumulada" )

In [52]:
innovation_data = []
for epoch in range(0,5):
    vocabulary1 = set(data[epoch]["dict"].token2id.keys())
    vocabulary2= set(data[epoch+1]["dict"].token2id.keys())
    
    old_vocabulary = len(vocabulary1)
    new_vocabulary = len(vocabulary2)
    
    old_words = round(100*len(vocabulary1-vocabulary2)/old_vocabulary, 2)
    new_words = round(100*len(vocabulary2-vocabulary1)/old_vocabulary, 2)
    vocabulary_change = {"epoch":epoch, "old_vocabulary": old_vocabulary, "new_vocabulary": new_vocabulary, 
     "%old_words": old_words, "%new_words":new_words}
    innovation_data.append(vocabulary_change)


In [53]:
innovation_rate = pd.DataFrame(innovation_data)
innovation_rate

Unnamed: 0,epoch,old_vocabulary,new_vocabulary,%old_words,%new_words
0,0,1145,1187,14.41,18.08
1,1,1187,1281,13.56,21.48
2,2,1281,1329,13.35,17.1
3,3,1329,1405,12.57,18.28
4,4,1405,1537,10.25,19.64
