<a href="https://colab.research.google.com/github/dudaholandah/NLP/blob/main/Trabalhos/T3/NLP_T3_100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import reuters
import math
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.corpus import wordnet as wn

In [2]:
# Fetch every NLTK resource the notebook reads so it runs on a fresh kernel:
# the Reuters corpus (reuters.fileids / reuters.raw), the English stop-word
# list, and WordNet (plus the omw-1.4 data it is paired with here).
for resource in ('reuters', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Pre-Processing

In [3]:
# Emoticons and the word that replaces each of them.
_EMOTICON_WORDS = {':)' : 'happy', '(:' : 'happy', ':(': 'sad', '):': 'sad', '(;' : 'wink', ';)' : 'wink', ':D' : 'happy', 'D:': 'sad'}

# Filled on the first pre_processing call and reused afterwards.
_PRE_PROCESSING_RESOURCES = {}


def _pre_processing_resources():
  """Build the stop-word set, WordNet lemma set, stemmer and emoticon patterns once."""
  if not _PRE_PROCESSING_RESOURCES:
    emoticon_patterns = []
    for emoticon, word in _EMOTICON_WORDS.items():
      pattern = re.escape(emoticon)
      # An emoticon that starts or ends with a letter (':D', 'D:') must not be
      # glued to another letter/digit on that side, otherwise ordinary text
      # such as "said:" would be rewritten.
      if emoticon[0].isalpha():
        pattern = r'(?<![A-Za-z0-9])' + pattern
      if emoticon[-1].isalpha():
        pattern = pattern + r'(?![A-Za-z0-9])'
      emoticon_patterns.append((re.compile(pattern, re.IGNORECASE), word))

    _PRE_PROCESSING_RESOURCES.update(
      stopwords=frozenset(stopwords.words('english')),
      lemmas=frozenset(wn.all_lemma_names()),
      stemmer=PorterStemmer(),
      emoticons=emoticon_patterns,
    )
  return _PRE_PROCESSING_RESOURCES


def pre_processing(text):
  """Normalise a raw document into a space-separated string of stems.

  Steps, in order:
    1. replace emoticons by their word (``:D`` and ``D:`` in either case);
    2. lowercase and blank out punctuation, digits, ``\\uXXXX`` escapes and
       http links;
    3. drop the standalone token ``rt`` and any remaining backslashes;
    4. keep only tokens that are not English stop words and that appear in
       the WordNet lemma names, and stem them with the Porter stemmer.

  Parameters
  ----------
  text : str
      Raw document text.

  Returns
  -------
  str
      The surviving stems joined by single spaces ("" when nothing survives).
  """
  resources = _pre_processing_resources()

  # Replace emoticons by words before punctuation is stripped.
  for pattern, word in resources['emoticons']:
    text = pattern.sub(word, text)

  # Regex that removes punctuation, digits, unicode escapes and http links.
  # NOTE(review): ',' and '>' are not in the character class, so a token such
  # as "japan," keeps its comma and presumably fails the WordNet lookup below
  # - confirm this is intended.
  words = re.sub(r'[\s!%*~\^´`=+<\[\]?&$:;@#.0-9()\/\"\'_-]+|[\s]*[\\]u[a-z0-9]{4}[\s]*|[\s]*http[^\s]+[\s]*', " ", text.lower())
  words = re.sub(r'\brt\b', "", words)
  words = re.sub(r'[\\]+', " ", words)

  stop_words = resources['stopwords']
  lemmas = resources['lemmas']
  stemmer = resources['stemmer']

  # Drop stop words, keep only WordNet vocabulary, then stem.
  stems = [
    stemmer.stem(token)
    for token in words.split()
    if token not in stop_words and token in lemmas
  ]
  return " ".join(stems)

In [4]:
# One entry per Reuters document, all three lists aligned by position:
# the file id, its category labels, and its pre-processed text.
fileids = reuters.fileids()
categories = [reuters.categories(fileid) for fileid in fileids]
corpus = [pre_processing(reuters.raw(fileid)) for fileid in fileids]

df = pd.DataFrame({'ids':fileids, 'categories':categories, 'text':corpus})

In [5]:
# Display the frame holding file ids, category lists and pre-processed text.
df

Unnamed: 0,ids,categories,text
0,test/14826,[trade],asian fear damag u japan rift mount trade fric...
1,test/14828,[grain],china daili vermin eat pct grain stock survey ...
2,test/14829,"[crude, nat-gas]",japan revis long term energi demand downward m...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit first quarter thailand trad...
4,test/14833,"[palm-oil, veg-oil]",indonesia price rise sharpli indonesia crude p...
...,...,...,...
10783,training/999,"[interest, money-fx]",u k money market shortag forecast revis bank e...
10784,training/9992,[earn],knight inc quarterli prior pay april record ap...
10785,training/9993,[earn],inc quarterli prior pay april record april seven
10786,training/9994,[earn],nationwid cellular servic inc th loss six loss...


In [6]:
# Vocabulary: every distinct stem in the pre-processed corpus, sorted.
phrase = " ".join(corpus)   # the whole corpus as one string
tokens = phrase.split()     # every token occurrence, in corpus order
vocab = list(set(tokens))
vocab.sort()

# Bag of Words

## Bag of Words Binário

In [7]:
# Binary bag of words: document position -> {term: 1} for each term present.
bow_binario = {
  doc_index: dict.fromkeys(text.split(), 1)
  for doc_index, text in enumerate(corpus)
}

In [8]:
# Display the binary bag of words.
# NOTE(review): this renders the nested dict of every document;
# `bow_binario[0]` would preview a single document instead.
bow_binario

{0: {'abl': 1,
  'account': 1,
  'action': 1,
  'advantag': 1,
  'alleg': 1,
  'allow': 1,
  'also': 1,
  'american': 1,
  'analyst': 1,
  'april': 1,
  'asia': 1,
  'asian': 1,
  'associ': 1,
  'australia': 1,
  'australian': 1,
  'avow': 1,
  'awar': 1,
  'beyond': 1,
  'billion': 1,
  'block': 1,
  'boost': 1,
  'broker': 1,
  'budget': 1,
  'businessmen': 1,
  'button': 1,
  'call': 1,
  'canberra': 1,
  'chairman': 1,
  'chief': 1,
  'co': 1,
  'coal': 1,
  'commerci': 1,
  'complet': 1,
  'conflict': 1,
  'continu': 1,
  'cost': 1,
  'countri': 1,
  'cut': 1,
  'damag': 1,
  'day': 1,
  'defus': 1,
  'democrat': 1,
  'deputi': 1,
  'despit': 1,
  'deterior': 1,
  'diplomat': 1,
  'director': 1,
  'disadvantag': 1,
  'disput': 1,
  'domest': 1,
  'due': 1,
  'econom': 1,
  'economi': 1,
  'effort': 1,
  'electr': 1,
  'electron': 1,
  'emerg': 1,
  'end': 1,
  'eros': 1,
  'exchang': 1,
  'expand': 1,
  'export': 1,
  'extend': 1,
  'failur': 1,
  'far': 1,
  'fear': 1,
  'feder':

## Bag of Words Contagem

In [9]:
# Count bag of words: document position -> {term: number of occurrences}.
bow_contagem = {}

for doc_index, text in enumerate(corpus):
  term_counts = {}
  for term in text.split():
    term_counts[term] = term_counts.get(term, 0) + 1
  bow_contagem[doc_index] = term_counts

In [10]:
# Display the count bag of words.
# NOTE(review): this renders the nested dict of every document;
# `bow_contagem[0]` would preview a single document instead.
bow_contagem

{0: {'abl': 1,
  'account': 1,
  'action': 1,
  'advantag': 1,
  'alleg': 2,
  'allow': 2,
  'also': 4,
  'american': 1,
  'analyst': 1,
  'april': 1,
  'asia': 1,
  'asian': 2,
  'associ': 1,
  'australia': 2,
  'australian': 1,
  'avow': 1,
  'awar': 1,
  'beyond': 1,
  'billion': 5,
  'block': 1,
  'boost': 2,
  'broker': 1,
  'budget': 1,
  'businessmen': 4,
  'button': 2,
  'call': 1,
  'canberra': 1,
  'chairman': 1,
  'chief': 1,
  'co': 1,
  'coal': 1,
  'commerci': 1,
  'complet': 1,
  'conflict': 1,
  'continu': 1,
  'cost': 2,
  'countri': 1,
  'cut': 1,
  'damag': 1,
  'day': 1,
  'defus': 1,
  'democrat': 1,
  'deputi': 2,
  'despit': 1,
  'deterior': 1,
  'diplomat': 1,
  'director': 1,
  'disadvantag': 1,
  'disput': 2,
  'domest': 1,
  'due': 1,
  'econom': 2,
  'economi': 2,
  'effort': 1,
  'electr': 1,
  'electron': 4,
  'emerg': 1,
  'end': 1,
  'eros': 1,
  'exchang': 1,
  'expand': 1,
  'export': 2,
  'extend': 1,
  'failur': 1,
  'far': 1,
  'fear': 1,
  'feder':

## Bag of Words Ponderado

In [11]:
# Weighted bag of words (term frequency): occurrences of a term in a document
# divided by the total number of terms in that document.
bow_ponderado = {}
N = {} # number of terms in each document

for doc_index, text in enumerate(corpus):
    terms = text.split()
    N[doc_index] = len(terms)

    occurrences = {}
    for term in terms:
        occurrences[term] = occurrences.get(term, 0) + 1

    bow_ponderado[doc_index] = {
        term: count / N[doc_index] for term, count in occurrences.items()
    }

In [12]:
# Display the weighted (term-frequency) bag of words.
# NOTE(review): this renders the nested dict of every document;
# `bow_ponderado[0]` would preview a single document instead.
bow_ponderado

{0: {'abl': 0.003003003003003003,
  'account': 0.003003003003003003,
  'action': 0.003003003003003003,
  'advantag': 0.003003003003003003,
  'alleg': 0.006006006006006006,
  'allow': 0.006006006006006006,
  'also': 0.012012012012012012,
  'american': 0.003003003003003003,
  'analyst': 0.003003003003003003,
  'april': 0.003003003003003003,
  'asia': 0.003003003003003003,
  'asian': 0.006006006006006006,
  'associ': 0.003003003003003003,
  'australia': 0.006006006006006006,
  'australian': 0.003003003003003003,
  'avow': 0.003003003003003003,
  'awar': 0.003003003003003003,
  'beyond': 0.003003003003003003,
  'billion': 0.015015015015015015,
  'block': 0.003003003003003003,
  'boost': 0.006006006006006006,
  'broker': 0.003003003003003003,
  'budget': 0.003003003003003003,
  'businessmen': 0.012012012012012012,
  'button': 0.006006006006006006,
  'call': 0.003003003003003003,
  'canberra': 0.003003003003003003,
  'chairman': 0.003003003003003003,
  'chief': 0.003003003003003003,
  'co': 

# Inverse Document Frequency

In [13]:
# Inverse document frequency: log(number of documents / documents containing
# the term), for every term of the vocabulary.
#
# Document frequencies are gathered in one pass over the per-document term
# dicts instead of scanning every document for every vocabulary word; the
# resulting values and key order (that of `vocab`) are the same.
idf = dict.fromkeys(vocab, 0)

for doc_terms in bow_contagem.values():
  for word in doc_terms:
    if word in idf:  # only vocabulary terms are counted
      idf[word] += 1

n_docs = len(corpus)
idf = {word: math.log(n_docs / doc_freq) for word, doc_freq in idf.items()}

In [14]:
# Display the term -> inverse-document-frequency mapping.
# NOTE(review): this renders one entry per vocabulary term.
idf

{'aa': 7.494430215031565,
 'aaa': 8.593042503699674,
 'aar': 9.28618968425962,
 'aaron': 8.18757739559151,
 'ab': 5.479527194489301,
 'abandon': 5.597310230145684,
 'abat': 8.18757739559151,
 'abbey': 8.18757739559151,
 'abbrevi': 9.28618968425962,
 'abc': 8.593042503699674,
 'aberr': 9.28618968425962,
 'abid': 6.452976340203405,
 'abidjan': 8.593042503699674,
 'abil': 4.96870157072331,
 'abilen': 9.28618968425962,
 'abl': 4.295757097480884,
 'ablaz': 8.18757739559151,
 'abnorm': 7.67675177182552,
 'aboard': 9.28618968425962,
 'abolish': 6.80128303447162,
 'abolit': 7.494430215031565,
 'abort': 9.28618968425962,
 'abraham': 8.593042503699674,
 'abroad': 5.334945965678193,
 'abrupt': 8.18757739559151,
 'abruptli': 8.593042503699674,
 'absenc': 6.150695468330471,
 'absent': 8.18757739559151,
 'absolut': 6.241667246536197,
 'absolv': 9.28618968425962,
 'absorb': 5.8522024797744745,
 'absorpt': 8.18757739559151,
 'abstain': 9.28618968425962,
 'absurd': 8.593042503699674,
 'abund': 7.088965

# TF-IDF

In [15]:
# TF-IDF: each document's term frequency multiplied by the term's IDF.
bow_tfidf = {
  doc: {word: tf * idf[word] for word, tf in term_freqs.items()}
  for doc, term_freqs in bow_ponderado.items()
}

In [16]:
# Tabulate the TF-IDF dicts: after the transpose there is one row per document
# and one column per term; terms absent from a document get 0.
tfidf = (
    pd.DataFrame
    .from_records(bow_tfidf)
    .fillna(0)
    .T
)

In [17]:
# Display the document x term TF-IDF matrix.
tfidf

Unnamed: 0,asian,fear,damag,u,japan,rift,mount,trade,friction,rais,...,egregi,misread,articul,weill,switchboard,showcas,triennial,ig,obstinaci,rejuven
0,0.032158,0.016389,0.012359,0.084695,0.112599,0.024587,0.01721,0.091437,0.01822,0.008697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.066305,0.000000,0.00000,0.023332,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.052049,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10783,0.000000,0.000000,0.000000,0.067472,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10784,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10785,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10786,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Similaridade Cosseno

In [18]:
def cosine_similarity(vetor1,vetor2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    The vectors are walked in parallel by position, so lists, tuples, NumPy
    arrays and pandas Series (whatever their index labels) are all accepted.

    Parameters
    ----------
    vetor1, vetor2 : sized iterables of numbers
        The two vectors; they must have the same length.

    Returns
    -------
    float
        dot(vetor1, vetor2) / (||vetor1|| * ||vetor2||), or 0.0 when either
        vector has zero norm (the cosine is undefined there, and a zero
        vector shares no terms with anything).

    Raises
    ------
    ValueError
        If the vectors differ in length.
    """
    if len(vetor1) != len(vetor2):
        raise ValueError(
            "cosine_similarity needs vectors of equal length, got "
            f"{len(vetor1)} and {len(vetor2)}"
        )

    prod_interno = 0
    soma_quadrados1 = 0
    soma_quadrados2 = 0

    # Single positional pass; avoids integer-key lookups such as vetor2[i],
    # which are label-based on a pandas Series.
    for valor1, valor2 in zip(vetor1, vetor2):
        prod_interno += valor1*valor2
        soma_quadrados1 += valor1*valor1
        soma_quadrados2 += valor2*valor2

    norma = (soma_quadrados1**0.5)*(soma_quadrados2**0.5)
    if norma == 0:
        return 0.0

    return prod_interno/norma

# Recuperação de Textos

In [23]:
# Retrieval check on the first N_DOCS documents: each one is used as a query,
# the others are ranked by cosine similarity of their TF-IDF rows, and for
# each of the TOP_K ranks we count the categories shared with the query.
N_DOCS = 100  # number of documents used as queries / candidates
TOP_K = 10    # ranks evaluated per query

top10 = [0] * TOP_K

categ = categories[:N_DOCS]

# Extract the rows once as plain NumPy arrays: avoids building a Series per
# pair and handing a label-indexed Series to cosine_similarity.
vectors = tfidf.iloc[:len(categ)].to_numpy()

for i, query_categories in enumerate(categ):
  best = []

  for j in range(len(categ)):
    if i == j: continue  # a document is never retrieved for itself
    similarity = cosine_similarity(vectors[i], vectors[j])
    best.append( (similarity,j) )

  # Highest similarity first; ties fall back to the larger document position.
  best.sort(reverse=True)

  # NOTE(review): a retrieved document sharing several categories with the
  # query adds one per shared category, so a rank can grow by more than one
  # per query - confirm this is the intended metric.
  for idx, (_sim, j) in enumerate(best[:TOP_K]):
    for cat in categ[j]:
      if cat in query_categories:
        top10[idx] += 1

print(top10)

[79, 72, 48, 52, 55, 56, 45, 45, 35, 37]
