In [40]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import re
import seaborn as sns
import nltk 
import heapq
from nltk import RegexpTokenizer as rpt
from nltk.corpus import stopwords as sw
from string import punctuation 

nltk.download('punkt')
nltk.download('stopwords')
stopwords = sw.words('portuguese')

data_url="https://raw.githubusercontent.com/gabrielsvinha/text_processing/master/inputs.csv"
data = pd.read_csv(data_url).replace(np.nan, '', regex=True)
documents = data.text.count()
N = documents

def parse(text):
    words = []
    word_pattern = rpt(r'\w+')
    year_pattern = rpt(r'\d{4}')
    
    patterns = [word_pattern, year_pattern]
    
    for pattern in patterns:
        tokens = []
        for token in pattern.tokenize(text):
            if token not in stopwords and len(token) > 3:
                tokens.append(token)
        words.extend(tokens)
    return words


def build_index(dataset):
    document_index = 0
    index = {"doc_row": []}
    
    for entry in dataset.text:
        document_index = document_index + 1
        index["doc_row"].append(document_index)
            
        for ngram in parse(entry):
                if ngram in index:
                    if document_index in index[ngram]:
                        index[ngram][document_index] = index[ngram][document_index] + 1
                    else:
                        index[ngram][document_index] = 1 
                else:
                    index[ngram] = {document_index: 1}
    
    return index
                        
index = build_index(data)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/gabrielvinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gabrielvinha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
def dice(index, word1, word2):
    if word1 in index:
        inverted_list = index[word1]
        n_A = float(len(inverted_list))
    else:
        n_A = 0.0
        
    if word2 in index:
        inverted_list = index[word2]
        n_B = float(len(inverted_list))
    else:
        n_B = 0.0
    
    docs = 0.0
    
    if word1 not in index or word2 not in index:
        n_AB = 0.0
    else:
        word1_inverted_list = index[word1].keys()
        word2_inverted_list = index[word2].keys()
    
        for doc_id in word1_inverted_list:
            if doc_id in word2_inverted_list:
                docs += 1.0
    
        n_AB = docs
    
    dom = n_A + n_B
    
    if dom == 0.0:
        return 0
    else:
        return n_AB/dom

def emim(index, word1, word2):
    if word1 in index:
        inverted_list = index[word1]
        n_A = float(len(inverted_list))
    else:
        n_A = 0.0
        
    if word2 in index:
        inverted_list = index[word2]
        n_B = float(len(inverted_list))
    else:
        n_B = 0.0
    
    docs = 0.0
    
    if word1 not in index or word2 not in index:
        n_AB = 0.0
    else:
        word1_inverted_list = index[word1].keys()
        word2_inverted_list = index[word2].keys()
    
        for doc_id in word1_inverted_list:
            if doc_id in word2_inverted_list:
                docs += 1.0
    
        n_AB = docs
    
    
    dom = n_A * n_B
    factor = 0
    
    if (dom != 0):
        factor = N * (n_AB/dom)
        
    if factor == 0: return 0
       
    return n_AB * math.log10(factor)


def chi2(index, word1, word2):
    if word1 in index:
        inverted_list = index[word1]
        n_A = float(len(inverted_list))
    else:
        n_A = 0.0
        
    if word2 in index:
        inverted_list = index[word2]
        n_B = float(len(inverted_list))
    else:
        n_B = 0.0
    
    docs = 0.0
    
    if word1 not in index or word2 not in index:
        n_AB = 0.0
    else:
        word1_inverted_list = index[word1].keys()
        word2_inverted_list = index[word2].keys()
    
        for doc_id in word1_inverted_list:
            if doc_id in word2_inverted_list:
                docs += 1.0
    
        n_AB = docs
    
    dom = n_A * n_B    
    if (dom == 0): return 0
    
    num = math.pow((n_AB - (1/N)*n_A*n_B),2)
    
    return num/dom
    
def mim(index, word1, word2):
    if word1 in index:
        inverted_list = index[word1]
        n_A = float(len(inverted_list))
    else:
        n_A = 0.0
        
    if word2 in index:
        inverted_list = index[word2]
        n_B = float(len(inverted_list))
    else:
        n_B = 0.0
    
    docs = 0.0
    
    if word1 not in index or word2 not in index:
        n_AB = 0.0
    else:
        word1_inverted_list = index[word1].keys()
        word2_inverted_list = index[word2].keys()
    
        for doc_id in word1_inverted_list:
            if doc_id in word2_inverted_list:
                docs += 1.0
    
        n_AB = docs
    
    dom = n_A * n_B            
    
    if dom == 0.0:
        return 0
    else:
        return n_AB/dom

In [42]:
def get_query_top10rank(index, query, metric):
    lst=[]
    for word in index.keys():
        if word != 'doc_row' and word != query:
          lst.append([word, metric(index, query, word)])
    
    df_tmp = pd.DataFrame(lst, columns=["word", "metric"])
    df_tmp['r']= df_tmp.metric.rank(ascending=False, method="first")
    df_tmp.sort_values("r", inplace = True)
    
    y = []
    for word in df_tmp[:10].word:
        y.append(word)
        
    return y

In [43]:
queries = ["lula","bolsonaro","guedes","general","presidente"]
methods = [mim, emim, chi2, dice]
dataframes = []

i = 0
for query in queries:
    df = pd.DataFrame(columns=["MIM","EMIM","X2","DICE"])
    for i in range(4):
        i += 1
        if (i == 1):
            df["MIM"] = get_query_top10rank(index, query, mim)
        elif (i == 2):
            df["EMIM"] = get_query_top10rank(index, query, emim)
        elif (i == 3):
            df["X2"] = get_query_top10rank(index, query, chi2)
        else:
            df["DICE"] = get_query_top10rank(index, query, dice)
    
    dataframes.append(df)

### Query: lula

In [44]:
dataframes[0]

Unnamed: 0,MIM,EMIM,X2,DICE
0,gerais,gerais,gerais,gerais
1,detida,detida,detida,detida
2,Bahia,Bahia,Bahia,Bahia
3,vergonhas,vergonhas,vergonhas,vergonhas
4,advindos,advindos,advindos,advindos
5,israelenses,israelenses,israelenses,israelenses
6,calcular,calcular,calcular,calcular
7,compositor,compositor,compositor,compositor
8,nulidade,nulidade,nulidade,nulidade
9,buscamos,buscamos,buscamos,buscamos


### Query: bolsonaro

In [45]:
dataframes[1]

Unnamed: 0,MIM,EMIM,X2,DICE
0,gerais,gerais,gerais,gerais
1,detida,detida,detida,detida
2,Bahia,Bahia,Bahia,Bahia
3,vergonhas,vergonhas,vergonhas,vergonhas
4,advindos,advindos,advindos,advindos
5,israelenses,israelenses,israelenses,israelenses
6,calcular,calcular,calcular,calcular
7,compositor,compositor,compositor,compositor
8,nulidade,nulidade,nulidade,nulidade
9,buscamos,buscamos,buscamos,buscamos


### Query: guedes

In [46]:
dataframes[2]

Unnamed: 0,MIM,EMIM,X2,DICE
0,37uVTIj4FO,37uVTIj4FO,37uVTIj4FO,37uVTIj4FO
1,apelidos,apelidos,apelidos,apelidos
2,privilegiados,privilegiados,privilegiados,privilegiados
3,personagens,personagens,personagens,personagens
4,estribeiras,estribeiras,estribeiras,estribeiras
5,portadores,portadores,portadores,portadores
6,bolsominions,bolsominions,bolsominions,bolsominions
7,agredindo,agredindo,agredindo,agredindo
8,LulaLivreQuintaSDV,LulaLivreQuintaSDV,LulaLivreQuintaSDV,LulaLivreQuintaSDV
9,comentando,comentando,comentando,comentando


### Query: general

In [47]:
dataframes[3]

Unnamed: 0,MIM,EMIM,X2,DICE
0,insalubridade,respeito,insalubridade,insalubridade
1,rejeitos,Leia,rejeitos,rejeitos
2,Institucional,insalubridade,Institucional,Institucional
3,sensível,rejeitos,sensível,sensível
4,Heleno,Institucional,Heleno,Heleno
5,perfumado,sensível,perfumado,perfumado
6,aprovaram,Heleno,aprovaram,aprovaram
7,Clinton,perfumado,Clinton,Clinton
8,piscicultura,aprovaram,piscicultura,piscicultura
9,atalho,Clinton,atalho,atalho


### Query: presidente

In [48]:
dataframes[4]

Unnamed: 0,MIM,EMIM,X2,DICE
0,detida,Bolsonaro,podem,Bolsonaro
1,vergonhas,Jair,dessa,Jair
2,advindos,contra,informa,disse
3,israelenses,disse,primeiros,feira
4,calcular,domingo,funciona,contra
5,8h45,após,total,Brasil
6,contabilizados,prisão,desde,governo
7,seviciou,Lula,Jair,sobre
8,indagado,Justiça,Bolsonaro,ministro
9,boleto,deputado,domingo,afirmou
