<a href="https://colab.research.google.com/github/dudaholandah/NLP/blob/main/Trabalhos/T3/NLP_T3_100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import reuters
import math
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.corpus import wordnet as wn

In [None]:
# Download the NLTK data packages this notebook reads.
nltk.download('stopwords')  # English stopword list used by pre_processing
nltk.download('wordnet')  # WordNet lemma names used by pre_processing
nltk.download('omw-1.4')  # presumably needed alongside 'wordnet' by recent NLTK releases — confirm
nltk.download('reuters')  # Reuters corpus read when building `corpus`

# Pre-Processing

In [6]:
# Lazily built NLTK resources shared by every pre_processing call. Collecting
# all WordNet lemma names is expensive, so it must not be repeated per document.
_PRE_PROCESSING_RESOURCES = {}


def _pre_processing_resources():
  """Return (stopword set, stemmer, WordNet lemma set), building them on first use."""
  if not _PRE_PROCESSING_RESOURCES:
    # All three values are computed before the dict is touched, so a failure
    # (e.g. a missing NLTK download) never leaves a half-filled cache behind.
    _PRE_PROCESSING_RESOURCES.update(
      stopwords=frozenset(stopwords.words('english')),
      stemmer=PorterStemmer(),
      lemmas=frozenset(wn.all_lemma_names()),
    )
  return (
    _PRE_PROCESSING_RESOURCES['stopwords'],
    _PRE_PROCESSING_RESOURCES['stemmer'],
    _PRE_PROCESSING_RESOURCES['lemmas'],
  )


def pre_processing(text):
  """Turn a raw document into a space-separated string of stemmed terms.

  Steps, in order:
    1. replace a few text emoticons with words;
    2. lower-case and blank out punctuation, digits, ``\\uXXXX`` escapes and
       ``http`` links, then drop the token ``rt`` and any leftover backslashes;
    3. keep only tokens that are not English stopwords and that are WordNet
       lemma names;
    4. stem the surviving tokens with the Porter stemmer.

  Args:
    text: raw document text (str).

  Returns:
    str: the kept, stemmed tokens joined by single spaces ('' if none survive).
  """
  stopwords_nltk, stemmer, wn_lemmas = _pre_processing_resources()
  emojiDict = {':)' : 'happy', '(:' : 'happy', ':(': 'sad', '):': 'sad', '(;' : 'wink', ';)' : 'wink', ':D' : 'happy', 'D:': 'sad'}

  # Replace emoticons with words.
  # NOTE(review): the keys are lower-cased before being searched in text that
  # still has its original case, so ':D' is looked up as ':d' and 'D:' as 'd:'
  # (which can also match inside ordinary words). Kept unchanged so results
  # stay the same — confirm whether this is intended.
  for word, initial in emojiDict.items():
    text = text.replace(word.lower(), initial)

  # Regex that strips punctuation, digits, unicode escapes and http links.
  words = re.sub(r'[\s!%*~\^´`=+<\[\]?&$:;@#.0-9()\/\"\'_-]+|[\s]*[\\]u[a-z0-9]{4}[\s]*|[\s]*http[^\s]+[\s]*', " ", text.lower())
  words = re.sub(r'\brt\b', "", words)
  words = re.sub(r'[\\]+', " ", words)

  # Drop stopwords, keep only WordNet lemma names, then stem.
  # split() already ignores leading/trailing whitespace; the final strip()
  # mirrors the trimming of the assembled string.
  kept = [
    stemmer.stem(x)
    for x in words.split()
    if x not in stopwords_nltk and x in wn_lemmas
  ]
  return " ".join(kept).strip()

In [7]:
# Load every Reuters document: its category labels and its pre-processed text,
# both aligned with the order of `fileids`.
fileids = reuters.fileids()

categories = [reuters.categories(fid) for fid in fileids]
corpus = [pre_processing(reuters.raw(fid)) for fid in fileids]

df = pd.DataFrame({'ids':fileids, 'categories':categories, 'text':corpus})

In [8]:
# Display the frame built above: file id, category list and pre-processed text per document.
df

Unnamed: 0,ids,categories,text
0,test/14826,[trade],asian fear damag u japan rift mount trade fric...
1,test/14828,[grain],china daili vermin eat pct grain stock survey ...
2,test/14829,"[crude, nat-gas]",japan revis long term energi demand downward m...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",thai trade deficit first quarter thailand trad...
4,test/14833,"[palm-oil, veg-oil]",indonesia price rise sharpli indonesia crude p...
...,...,...,...
10783,training/999,"[interest, money-fx]",u k money market shortag forecast revis bank e...
10784,training/9992,[earn],knight inc quarterli prior pay april record ap...
10785,training/9993,[earn],inc quarterli prior pay april record april seven
10786,training/9994,[earn],nationwid cellular servic inc th loss six loss...


In [9]:
# Vocabulary: every distinct token of the pre-processed corpus, in sorted order.
phrase = " ".join(corpus)
tokens = phrase.split()
vocab = list(set(tokens))
vocab.sort()

# Bag of Words

## Bag of Words Binário

In [10]:
# Binary bag of words: document index -> {term: 1} for every term present
# in that document (terms keep their first-occurrence order).
bow_binario = {
  i: dict.fromkeys(doc.split(), 1)
  for i, doc in enumerate(corpus)
}

In [None]:
# Display the whole binary bag-of-words mapping (one inner dict per document index).
bow_binario

## Bag of Words Contagem

In [12]:
# Count bag of words: document index -> {term: number of occurrences in that document}.
bow_contagem = {}

for idx, text in enumerate(corpus):
  counts = {}
  for term in text.split():
    counts[term] = counts.get(term, 0) + 1
  bow_contagem[idx] = counts

In [None]:
# Display the whole count bag-of-words mapping (one inner dict per document index).
bow_contagem

## Bag of Words Ponderado

In [14]:
# Weighted bag of words (term frequency): each term count divided by the
# total number of terms in its document.
bow_ponderado = {}
N = {}  # number of terms in each document

for idx, text in enumerate(corpus):
    terms = text.split()
    N[idx] = len(terms)

    counts = {}
    for term in terms:
        counts[term] = counts.get(term, 0) + 1

    # An empty document has no terms, so the division below never runs for it.
    bow_ponderado[idx] = {term: freq / N[idx] for term, freq in counts.items()}

In [None]:
# Display the whole weighted (term-frequency) bag-of-words mapping.
bow_ponderado

# Inverse Document Frequency

In [16]:
# Document frequency: for each vocabulary term, the number of documents that
# contain it. Walking each document's term dict once yields the same counts as
# testing every vocabulary word against every document, without the
# vocabulary-size x corpus-size scan. Keys keep the order of `vocab`.
idf = dict.fromkeys(vocab, 0)

for doc in bow_contagem:
  for word in bow_contagem[doc]:
    if word in idf:  # only vocabulary terms are counted
      idf[word] += 1

# Inverse document frequency: idf(t) = ln(N / df(t)), N = number of documents.
for word in idf:
  idf[word] = math.log(len(corpus)/idf[word])

In [None]:
# Display the whole term -> inverse-document-frequency mapping.
idf

# TF-IDF

In [18]:
# TF-IDF: scale each document's term frequency by the term's inverse document frequency.
bow_tfidf = {
  doc: {word: tf * idf[word] for word, tf in weights.items()}
  for doc, weights in bow_ponderado.items()
}

In [19]:
# Dense TF-IDF table: terms missing from a document become 0, and the frame is
# transposed so that each row is a document and each column a term.
tfidf = (
    pd.DataFrame.from_records(bow_tfidf)
    .fillna(0)
    .T
)

In [20]:
# Display the TF-IDF table (rows: document indices, columns: terms).
tfidf

Unnamed: 0,asian,fear,damag,u,japan,rift,mount,trade,friction,rais,...,egregi,misread,articul,weill,switchboard,showcas,triennial,ig,obstinaci,rejuven
0,0.032158,0.016389,0.012359,0.084695,0.112599,0.024587,0.01721,0.091437,0.01822,0.008697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.066305,0.000000,0.00000,0.023332,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.052049,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10783,0.000000,0.000000,0.000000,0.067472,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10784,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10785,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10786,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.00000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Similaridade Cosseno

In [21]:
def cosine_similarity(vetor1,vetor2):
    """Cosine of the angle between two numeric vectors of equal length.

    Args:
        vetor1: iterable of numbers (e.g. a list or a pandas Series row).
        vetor2: iterable of numbers with the same number of elements.

    Returns:
        float: dot(vetor1, vetor2) / (|vetor1| * |vetor2|), or 0.0 when either
        vector has zero norm (the cosine is undefined there; returning 0.0
        avoids a division by zero / NaN that would corrupt later sorting).

    Raises:
        ValueError: if the vectors do not have the same number of elements.
    """
    # Materialise both inputs so values are paired strictly by position. This
    # avoids integer indexing into a label-indexed Series and lets any
    # iterable be passed.
    valores1 = list(vetor1)
    valores2 = list(vetor2)
    if len(valores1) != len(valores2):
        raise ValueError(
            f"vectors must have the same length, got {len(valores1)} and {len(valores2)}"
        )

    # Plain left-to-right accumulation keeps floating-point results identical
    # to the straightforward loop formulation.
    prod_interno = 0
    for a, b in zip(valores1, valores2):
        prod_interno += a*b

    norma_vetor1 = 0
    for x in valores1:
        norma_vetor1 += x*x
    norma_vetor1 = norma_vetor1**0.5

    norma_vetor2 = 0
    for x in valores2:
        norma_vetor2 += x*x
    norma_vetor2 = norma_vetor2**0.5

    if norma_vetor1 == 0 or norma_vetor2 == 0:
        return 0.0

    return prod_interno/(norma_vetor1*norma_vetor2)

# Recuperação de Textos

In [22]:
# For each of the first 100 documents, rank the other 99 by cosine similarity
# of their TF-IDF rows and, for each of the 10 best-ranked positions, count
# category matches between the query document and the retrieved one.
# NOTE(review): a retrieved document adds one per category it shares with the
# query, so multi-label pairs can add more than 1 to a position — confirm this
# is the intended scoring.
top10 = [0] * 10

categ = categories[:100]
n_docs = len(categ)
for i in range(n_docs):
  vec1 = tfidf.iloc[i]

  # (similarity, candidate index) for every document except the query itself.
  best = [
    (cosine_similarity(vec1, tfidf.iloc[j]), j)
    for j in range(n_docs)
    if j != i
  ]

  # Highest similarity first; ties fall back to the larger index.
  best.sort(reverse=True)
  for idx, (sim, j) in enumerate(best[:10]):
    top10[idx] += sum(1 for cat in categ[j] if cat in categ[i])

print(top10)

[79, 72, 48, 52, 55, 56, 45, 45, 35, 37]


In [31]:
# Average the per-position match counts: each count is divided by the 100
# query documents, the 10 positions are summed, and the final factor of 10
# turns that sum into a percentage of the mean over the 10 positions.
acuracia = 0
for hits in top10:
  acuracia = acuracia + hits/100

acuracia *= 10
print(f"Acurácia: {acuracia:.2f}%")

Acurácia: 52.40%
