<a href="https://colab.research.google.com/github/devthumos/Processamento_de_Linguagem_Natural/blob/main/Atividade_avaliativa_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Pegando o dataset

In [21]:
import nltk
import pandas as pd
import re
import math

nltk.download('reuters')
from nltk.corpus import reuters

cats = reuters.categories()
print("Reuters has %d categories:\n%s" % (len(cats), cats))

fileids = reuters.fileids()

Reuters has 90 categories:
['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [22]:
categories = []
text = []

for file in fileids:
    categories.append(reuters.categories(file))
    text.append(reuters.raw(file))

df = pd.DataFrame({'ids':fileids, 'categories':categories, 'text':text})

## Pré processamento

### Normalizando o texto dos documentos

In [23]:
def simple(sentence):
  # pattern1 = r"[!\$@#,\.'\"’\)\(\s&\*\-\?~\{\}]"
  pattern1 = r"[\(\)!\$\.,!\-'\"\{\}\s><]+"  # Substituir pontuções e etc
  # pattern2 = r"\\u[0-9]{2,4}"  # Substituir as falhas no encoding
  

  simple = re.sub("&amp", "&", sentence, flags=re.I)
  simple = re.sub("&lt;", "", simple, flags=re.I)
  simple = re.sub("&gt;", "", simple, flags=re.I)
  simple = re.sub(pattern1, " ", simple, flags=re.I)
  simple = simple.lower()

  return simple

In [24]:
df["simple_text"] = df.text.apply(simple)
df.head()

Unnamed: 0,ids,categories,text,simple_text
0,test/14826,[trade],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...,asian exporters fear damage from u s japan rif...
1,test/14828,[grain],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...,china daily says vermin eat 7 12 pct grain sto...
2,test/14829,"[crude, nat-gas]",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...,japan to revise long term energy demand downwa...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...,thai trade deficit widens in first quarter tha...
4,test/14833,"[palm-oil, veg-oil]",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...,indonesia sees cpo price rising sharply indone...


### Montando o Corpus

In [25]:
def create_corpus(dataframe):
  corpus = []
  df["simple_text"] = df.text.apply(simple)
  for doc in df.simple_text:
    corpus.append(doc)
  
  return corpus

In [26]:
corpus = create_corpus(df)

### Criando o Vocabulário

In [27]:
vocab = list(set((" ".join(corpus)).split()))

In [28]:
vocab[0:10]

['knorr',
 'londrina',
 'foray',
 'onpraise',
 'row',
 'breach',
 'plates',
 'saks',
 'meets',
 'implicit']

## Criando o **tf-idf**

Criando o **tf** e o **idf_dict**

In [29]:
idf_dict = {}  ## Vai guardar em quantos documentos o termo apareceu

def tf_vector(sentence):
  global idf, vocab

  tf_list = []
  
  for word in vocab:
    tf_list.append(sentence.count(word)/len(simple(sentence).split()))

  ## Aproveitando o apply para pegar o idf_dict
  for word in vocab:
    if word not in idf_dict:
      idf_dict[word] = 1
    else:
      if word in sentence:
        idf_dict[word] += 1

  return tf_list


Aplicando o tf_vector no used_df

In [30]:
teste_df = df[["ids", "categories", "simple_text"]].iloc[:100, :]
teste_df["tf_vector"] = teste_df.simple_text.apply(tf_vector)
teste_df.head()

Unnamed: 0,ids,categories,simple_text,tf_vector
0,test/14826,[trade],asian exporters fear damage from u s japan rif...,"[0.0, 0.0, 0.0, 0.0, 0.001314060446780552, 0.0..."
1,test/14828,[grain],china daily says vermin eat 7 12 pct grain sto...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,test/14829,"[crude, nat-gas]",japan to revise long term energy demand downwa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first quarter tha...,"[0.0, 0.0, 0.0, 0.0, 0.012658227848101266, 0.0..."
4,test/14833,"[palm-oil, veg-oil]",indonesia sees cpo price rising sharply indone...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Definindo qual DataFrame iremos usar, se é o de teste ou o DataFrame principal

In [31]:
used_df = teste_df
# used_df = df

Aplicando o tf_vector para todas as notícias

In [32]:
# df["tf_vector"] = df.text.apply(tf_vector)
# df.head()

criando o **tf-idf**

In [33]:
def tf_idf(tf_vector: list) -> list:
  global corpus, idf_dict
  tf_idf = []

  for k, tf_element in enumerate(tf_vector):
    _ = tf_idf.append(tf_element * math.log(len(corpus)/idf_dict[vocab[k]], 10)) if vocab[k] != 0 else tf_idf.append(0) ## vocab[k] é a word, TESTADO
    # print(vocab[k], "something")  ## Para testar se vocab[k] retorna o primeiro item de vocab, que no caso é path CORRETO
    # return None  TESTE

  return tf_idf

adicionando a coluna tf_df_vector no teste_df de 100 notícias

In [34]:
used_df["tf_idf_vector"] = used_df["tf_vector"].apply(tf_idf)
used_df.head()

Unnamed: 0,ids,categories,simple_text,tf_vector,tf_idf_vector
0,test/14826,[trade],asian exporters fear damage from u s japan rif...,"[0.0, 0.0, 0.0, 0.0, 0.001314060446780552, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0037540731652104756, 0...."
1,test/14828,[grain],china daily says vermin eat 7 12 pct grain sto...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,test/14829,"[crude, nat-gas]",japan to revise long term energy demand downwa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first quarter tha...,"[0.0, 0.0, 0.0, 0.0, 0.012658227848101266, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.036162654161078125, 0.0..."
4,test/14833,"[palm-oil, veg-oil]",indonesia sees cpo price rising sharply indone...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Criando o Bag of Words

In [35]:
def bag_of_words_binary(sentence: str) -> list:
  global vocab

  binary_list = []
  termos_list = sentence.split()

  for word in vocab:
    if word in sentence:
      binary_list.append(1)
    else:
      binary_list.append(0)

  return binary_list

In [36]:
used_df["binary_vector"] = used_df.simple_text.apply(bag_of_words_binary)
used_df.head()

Unnamed: 0,ids,categories,simple_text,tf_vector,tf_idf_vector,binary_vector
0,test/14826,[trade],asian exporters fear damage from u s japan rif...,"[0.0, 0.0, 0.0, 0.0, 0.001314060446780552, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0037540731652104756, 0....","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,test/14828,[grain],china daily says vermin eat 7 12 pct grain sto...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,test/14829,"[crude, nat-gas]",japan to revise long term energy demand downwa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first quarter tha...,"[0.0, 0.0, 0.0, 0.0, 0.012658227848101266, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.036162654161078125, 0.0...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,test/14833,"[palm-oil, veg-oil]",indonesia sees cpo price rising sharply indone...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Aplicando dissimilaridade de cosseno em ambas representações, Binary Bag of Words e TF-IDF

In [37]:
def cossine_dissimilarity(vetor1, vetor2):   ## A similaridade de cossenos tem que receber vetores numéricos
  prod_interno = 0
  norma1 = 0
  norma2 = 0

  for valor1, valor2 in zip(vetor1, vetor2):
    prod_interno += valor1*valor2
    norma1 += valor1*valor1
    norma2 += valor2*valor2

  norma1 **= 0.5
  norma2 **= 0.5



  return prod_interno/(norma1*norma2)

In [38]:
def maxim(top_dict: dict):
  # top_list = list(top_dict.items())
  top_list = list(map(lambda x: (x[0], abs(x[1])), top_dict.items()))
  maior = top_list[0]
  # print(top_list)  ## TESTE

  # print(top_list)
  for k, item in enumerate(top_list[:-1]):
    # print("{} <= {}  {}".format(item[1], top_list[k+1][1], item[1] <= top_list[k+1][1]))
    if maior[1] <= top_list[k+1][1]:
      maior = top_list[k+1]

  #   print(maior)
  # print("\n")
  return maior

# Teste minim
top_dict = {
    "row 1": 3,
    "row 2": 10,
    "row 3": 5,
    "row 4": 9,
}
maxim(top_dict)

count = 0
def top_10_b(binary_vector) -> list:
  global used_df, count
  plus = 10
  # print(count)
  # top_dict = {k: cossine_dissimilarity(binary_vector, doc_binary_vector) for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector) if k != count}  ## dict {ids_1: cossine_dissimilarity_1....}
  if count < 10:
    top_dict = {k: cossine_dissimilarity(binary_vector, doc_binary_vector) for k, doc_binary_vector in enumerate(used_df.iloc[0:11, :].binary_vector) if k != count}
    # print("Menor que 10")
    # for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector):
    #   if k == count:
    #     print(f"{k} != {count} {k != count}   {teste_df.ids[k]}")
    # top_dict = {print(f"{k} != {count} {k != count}") for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector) if k != count}
    plus = 11
  else:
    top_dict = {k: cossine_dissimilarity(binary_vector, doc_binary_vector) for k, doc_binary_vector in enumerate(used_df.iloc[0:10, :].binary_vector)}
    # for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector):
    #   print(f"{k} != {count} {k != count}")

  # return None

  # top_dict = [(teste_df.ids[k], value) for k, value in sorted(top_dict.items())]
  # print(top_dict)



  ## Aqui temos um dicionário top_dict com os top 10 de similaridade
  for k, doc_binary_vector in enumerate(used_df.iloc[10:, :].binary_vector):
    if k != count:
      top_dict.update({k + plus: cossine_dissimilarity(binary_vector, doc_binary_vector)}) 
      maximo = maxim(top_dict)
      del top_dict[maximo[0]]

  count += 1
  sorted_dict = sorted(top_dict.items(), key=lambda x: x[1])

  top_list = [(teste_df.ids[k], value) for k, value in sorted_dict]

  return top_list

count1 = 0
def top_10_tf_idf(tf_idf_vector) -> list:
  global used_df, count1
  plus = 10
  # print(count)
  # top_dict = {k: cossine_dissimilarity(binary_vector, doc_binary_vector) for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector) if k != count}  ## dict {ids_1: cossine_dissimilarity_1....}
  if count1 < 10:
    top_dict = {k: cossine_dissimilarity(tf_idf_vector, doc_tf_idf_vector) for k, doc_tf_idf_vector in enumerate(used_df.iloc[0:11, :].tf_idf_vector) if k != count1}
    # print("Menor que 10")
    # for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector):
    #   if k == count:
    #     print(f"{k} != {count} {k != count}   {teste_df.ids[k]}")
    # top_dict = {print(f"{k} != {count} {k != count}") for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector) if k != count}
    plus = 11
  else:
    top_dict = {k: cossine_dissimilarity(tf_idf_vector, doc_tf_idf_vector) for k, doc_tf_idf_vector in enumerate(used_df.iloc[0:10, :].tf_idf_vector)}
    # for k, doc_binary_vector in enumerate(df_aux.iloc[0:10, :].binary_vector):
    #   print(f"{k} != {count} {k != count}")

  # return None

  # top_dict = [(teste_df.ids[k], value) for k, value in sorted(top_dict.items())]
  # print(top_dict)



  ## Aqui temos um dicionário top_dict com os top 10 de similaridade
  for k, doc_tf_idf_vector in enumerate(used_df.iloc[10:, :].binary_vector):
    if k != count1:
      top_dict.update({k + plus: cossine_dissimilarity(tf_idf_vector, doc_tf_idf_vector)}) 
      maximo = maxim(top_dict)
      del top_dict[maximo[0]]

  count1 += 1
  sorted_dict = sorted(top_dict.items(), key=lambda x: x[1])

  top_list = [(teste_df.ids[k], value) for k, value in sorted_dict]

  return top_list

# test_top_10_bb = teste_df.iloc[:2,:]
# test_top_10_bb
# test_top_10_bb["top_10"] = test_top_10_bb.binary_vector.apply(top_10_bb)
# test_top_10_bb.top_10[0]
# type(test_top_10_bb)
# test_top_10_bb
# for k in range(test_top_10_bb.shape[0]):
#   # print(top_10_bb(df_row))
#   # print(test_top_10_bb[k])
#   break

# for k, row in enumerate(test_top_10_bb.binary_vector):
#   print(row)
#   break

# for k in ra



In [39]:
used_df["binary_top_10"] = used_df.binary_vector.apply(top_10_b)
used_df.head()

Unnamed: 0,ids,categories,simple_text,tf_vector,tf_idf_vector,binary_vector,binary_top_10
0,test/14826,[trade],asian exporters fear damage from u s japan rif...,"[0.0, 0.0, 0.0, 0.0, 0.001314060446780552, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0037540731652104756, 0....","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.14125975696169477), (test/1496..."
1,test/14828,[grain],china daily says vermin eat 7 12 pct grain sto...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14982, 0.1974975884213874), (test/14967..."
2,test/14829,"[crude, nat-gas]",japan to revise long term energy demand downwa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.16850509274845388), (test/1495..."
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first quarter tha...,"[0.0, 0.0, 0.0, 0.0, 0.012658227848101266, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.036162654161078125, 0.0...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.19518128953860608), (test/1496..."
4,test/14833,"[palm-oil, veg-oil]",indonesia sees cpo price rising sharply indone...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.1975808762619464), (test/14967..."


In [40]:
used_df["tf_idf_top_10"] = used_df.tf_idf_vector.apply(top_10_tf_idf)
used_df.head()

Unnamed: 0,ids,categories,simple_text,tf_vector,tf_idf_vector,binary_vector,binary_top_10,tf_idf_top_10
0,test/14826,[trade],asian exporters fear damage from u s japan rif...,"[0.0, 0.0, 0.0, 0.0, 0.001314060446780552, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0037540731652104756, 0....","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.14125975696169477), (test/1496...","[(test/14922, 0.22773539832360978), (test/1489..."
1,test/14828,[grain],china daily says vermin eat 7 12 pct grain sto...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14982, 0.1974975884213874), (test/14967...","[(test/14922, 0.2223250305922412), (test/14892..."
2,test/14829,"[crude, nat-gas]",japan to revise long term energy demand downwa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.16850509274845388), (test/1495...","[(test/14922, 0.22376552992616333), (test/1489..."
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first quarter tha...,"[0.0, 0.0, 0.0, 0.0, 0.012658227848101266, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.036162654161078125, 0.0...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.19518128953860608), (test/1496...","[(test/14891, 0.23630466926371615), (test/1492..."
4,test/14833,"[palm-oil, veg-oil]",indonesia sees cpo price rising sharply indone...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.1975808762619464), (test/14967...","[(test/14922, 0.22077968299832845), (test/1489..."


## Acurácia da nossa utilização do cosseno de dissimilaridade com as representações Binary Bag of Words e TF-IDF

In [41]:
index = 0
def binary_acuracy(list_top_10):
  global index, used_df
  acuracy = 0

  for doc in list_top_10:
    # print(doc)  ## TESTE
    # return None  ## TESTE
    # print(used_df[used_df.ids == doc[0]].categories.tolist()[0])
    for cat_used in used_df.iloc[index,:].categories:
      # print("{} in {} == {}".format(cat_used, used_df[used_df.ids == doc[0]].categories, cat_used in used_df[used_df.ids == doc[0]].categories.tolist()[0])) ## TESTE
      # print(categories)
      if cat_used in used_df[used_df.ids == doc[0]].categories.tolist()[0]:
        acuracy += 1
        break
  # print("\n")  ## TESTE
    
  index += 1

  return acuracy

index1 = 0
def tf_idf_acuracy(list_top_10):
  global index1, used_df
  acuracy = 0

  for doc in list_top_10:
    # print(doc)  ## TESTE
    # return None  ## TESTE
    # print(used_df[used_df.ids == doc[0]].categories.tolist()[0])
    for cat_used in used_df.iloc[index1,:].categories:
      # print("{} in {} == {}".format(cat_used, used_df[used_df.ids == doc[0]].categories, cat_used in used_df[used_df.ids == doc[0]].categories.tolist()[0])) ## TESTE
      # print(categories)
      if cat_used in used_df[used_df.ids == doc[0]].categories.tolist()[0]:
        acuracy += 1
        break
  # print("\n")  ## TESTE
    
  index1 += 1

  return acuracy


### Aplicando a função de acurácia no binary_top_10

In [42]:
used_df["binary_acuracy"] = used_df.binary_top_10.apply(binary_acuracy)
used_df.head()

Unnamed: 0,ids,categories,simple_text,tf_vector,tf_idf_vector,binary_vector,binary_top_10,tf_idf_top_10,binary_acuracy
0,test/14826,[trade],asian exporters fear damage from u s japan rif...,"[0.0, 0.0, 0.0, 0.0, 0.001314060446780552, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0037540731652104756, 0....","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.14125975696169477), (test/1496...","[(test/14922, 0.22773539832360978), (test/1489...",0
1,test/14828,[grain],china daily says vermin eat 7 12 pct grain sto...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14982, 0.1974975884213874), (test/14967...","[(test/14922, 0.2223250305922412), (test/14892...",0
2,test/14829,"[crude, nat-gas]",japan to revise long term energy demand downwa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.16850509274845388), (test/1495...","[(test/14922, 0.22376552992616333), (test/1489...",0
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first quarter tha...,"[0.0, 0.0, 0.0, 0.0, 0.012658227848101266, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.036162654161078125, 0.0...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.19518128953860608), (test/1496...","[(test/14891, 0.23630466926371615), (test/1492...",1
4,test/14833,"[palm-oil, veg-oil]",indonesia sees cpo price rising sharply indone...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.1975808762619464), (test/14967...","[(test/14922, 0.22077968299832845), (test/1489...",0


### Aplicando a função de acurácia no tf_idf_top_10

In [43]:
used_df["tf_idf_acuracy"] = used_df.tf_idf_top_10.apply(tf_idf_acuracy)
used_df.head()

Unnamed: 0,ids,categories,simple_text,tf_vector,tf_idf_vector,binary_vector,binary_top_10,tf_idf_top_10,binary_acuracy,tf_idf_acuracy
0,test/14826,[trade],asian exporters fear damage from u s japan rif...,"[0.0, 0.0, 0.0, 0.0, 0.001314060446780552, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0037540731652104756, 0....","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.14125975696169477), (test/1496...","[(test/14922, 0.22773539832360978), (test/1489...",0,1
1,test/14828,[grain],china daily says vermin eat 7 12 pct grain sto...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14982, 0.1974975884213874), (test/14967...","[(test/14922, 0.2223250305922412), (test/14892...",0,0
2,test/14829,"[crude, nat-gas]",japan to revise long term energy demand downwa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.16850509274845388), (test/1495...","[(test/14922, 0.22376552992616333), (test/1489...",0,0
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit widens in first quarter tha...,"[0.0, 0.0, 0.0, 0.0, 0.012658227848101266, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.036162654161078125, 0.0...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14967, 0.19518128953860608), (test/1496...","[(test/14891, 0.23630466926371615), (test/1492...",1,3
4,test/14833,"[palm-oil, veg-oil]",indonesia sees cpo price rising sharply indone...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(test/14959, 0.1975808762619464), (test/14967...","[(test/14922, 0.22077968299832845), (test/1489...",0,1


In [44]:
# pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth', 400)
# used_df

In [45]:
# used_df.binary_top_10

## Comparando Binary Bag of Words e TF-IDF

Acurácia média da dissimilaridade de cosseno utilizando Binary Bag of Words

In [46]:
ba_mean = (used_df["binary_acuracy"]/10).sum()/used_df.shape[0]
ba_mean

0.17600000000000002

### Aplicando a função de acurácia no tf_idf_top_10

In [47]:
bt_mean = (used_df["tf_idf_acuracy"]/10).sum()/used_df.shape[0]
bt_mean

0.078