# AS02: Representação Textual
---
**Aluno**: Gustavo Martins Lopes da Costa

**Matrícula**: 690773

Ao final da execução, todos os arquivos de texto com os resultados ficarão disponíveis na pasta `./content`. Para acessá-los, basta abrir a aba lateral de arquivos do Google Colab.

## Obtenção do Dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import numpy as np
import pandas as pd
import re
import sklearn
import nltk

In [2]:
# Fetch the training split of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
# NOTE(review): the corpus is built from the category names (target_names),
# not from the posts themselves -- confirm this is the intended input.
corpus = list(newsgroups_train.target_names)

## Pré-Processamento
---

In [3]:
def get_tokens(doc):
  """Split a document into lowercase tokens.

  Every "." is replaced by a space and the result is split on whitespace.

  Args:
    doc: Text to tokenize, e.g. ``"comp.sys.mac"``.

  Returns:
    List of lowercased tokens in order of appearance (duplicates kept).
  """
  # Dots act as separators in names such as "comp.sys.mac"; once they become
  # spaces, a plain whitespace split yields the individual words.
  return [token.lower() for token in doc.replace(".", " ").split()]

In [4]:
def tokenize(corpus):
  """Build the sorted vocabulary of a corpus.

  Args:
    corpus: Iterable of documents; each one is tokenized with ``get_tokens``.

  Returns:
    Alphabetically sorted list of the distinct tokens found in the corpus.
  """
  unique_words = set()
  for doc in corpus:
    unique_words.update(get_tokens(doc))
  # Sort once at the end instead of re-deduplicating and re-sorting the whole
  # vocabulary after every document.
  return sorted(unique_words)

## One-Hot Encoding
---

Obtendo o vocabulário

In [5]:
# Build the vocabulary from the corpus and list its words, one per line.
vocab = tokenize(corpus)
print(f'The vocabulary has the following {len(vocab)} words: ')
for word in vocab:
  print(word)
# The vocabulary size is printed a second time after the list.
print(len(vocab))

The vocabulary has the following 33 words: 
alt
atheism
autos
baseball
christian
comp
crypt
electronics
forsale
graphics
guns
hardware
hockey
ibm
mac
med
mideast
misc
motorcycles
ms-windows
os
pc
politics
rec
religion
sci
soc
space
sport
sys
talk
windows
x
33


Função que gera o one hot encoding

In [6]:
def generate_one_hot(corpus, vocab):
  """Encode each document as a binary presence vector over the vocabulary.

  Args:
    corpus: Iterable of documents; each one is tokenized with ``get_tokens``.
    vocab: Sequence of vocabulary words; position ``i`` of every vector
      corresponds to ``vocab[i]``.

  Returns:
    List with one dict per document, holding the original text under
    ``"document"`` and a float NumPy array under ``"bag_vector"`` whose entry
    is 1 when the vocabulary word occurs in the document and 0 otherwise.
  """
  one_hot_list = []
  for doc in corpus:
    # A set gives constant-time membership tests, so the vocabulary is walked
    # once per document instead of once per token.
    present = set(get_tokens(doc))
    bag_vector = np.zeros(len(vocab))
    for i, vocab_word in enumerate(vocab):
      if vocab_word in present:
        bag_vector[i] = 1
    one_hot_list.append({"document": doc, "bag_vector": bag_vector})
  return one_hot_list

Obtendo os embeddings do one hot encoding

In [7]:
# Encode every document and print it next to its one-hot vector.
one_hot_list = generate_one_hot(corpus, vocab)
for result in one_hot_list:
  print(f' - {result["document"]} = {result["bag_vector"]}')

 - alt.atheism = [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - comp.graphics = [0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - comp.os.ms-windows.misc = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - comp.sys.ibm.pc.hardware = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 - comp.sys.mac.hardware = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 - comp.windows.x = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 - misc.forsale = [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - rec.autos = [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - rec.mo

Salvando os resultados

In [8]:
# Save the one-hot vectors, one "<document> = [v1, v2, ...]" line per document.
with open('20News_01.txt', 'w') as one_hot_file:
  for result in one_hot_list:
    values = ', '.join(str(value) for value in result["bag_vector"])
    one_hot_file.write(f"{result['document']} = [{values}]\n")

## Count Vector
---

Função para computar o count vector

In [9]:
def generate_count_vector(corpus, vocab):
  """Encode each document as a vector of term counts over the vocabulary.

  Args:
    corpus: Iterable of documents; each one is tokenized with ``get_tokens``.
    vocab: Sequence of vocabulary words; position ``i`` of every vector
      corresponds to ``vocab[i]``.

  Returns:
    List with one dict per document, holding the original text under
    ``"document"`` and a float NumPy array under ``"bag_vector"`` whose entry
    is the number of times the vocabulary word occurs in the document.
  """
  from collections import Counter

  count_vector_list = []
  for doc in corpus:
    # Count occurrences rather than flagging presence: assigning 1 here would
    # make this encoder indistinguishable from the one-hot encoding.
    occurrences = Counter(get_tokens(doc))
    bag_vector = np.zeros(len(vocab))
    for i, vocab_word in enumerate(vocab):
      bag_vector[i] = occurrences[vocab_word]
    count_vector_list.append({"document": doc, "bag_vector": bag_vector})
  return count_vector_list

Gerando os embeddings do count vector

In [10]:
# Encode every document and print it next to its count vector.
count_vector_list = generate_count_vector(corpus, vocab)
for result in count_vector_list:
  print(f' - {result["document"]} = {result["bag_vector"]}')

 - alt.atheism = [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - comp.graphics = [0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - comp.os.ms-windows.misc = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - comp.sys.ibm.pc.hardware = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 - comp.sys.mac.hardware = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 - comp.windows.x = [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 - misc.forsale = [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - rec.autos = [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 - rec.mo

In [11]:
# Save the count vectors, one "<document> = [v1, v2, ...]" line per document.
with open('20News_02.txt', 'w') as count_vector_file:
  for cv_result in count_vector_list:
    values = ', '.join(str(value) for value in cv_result["bag_vector"])
    count_vector_file.write(f"{cv_result['document']} = [{values}]\n")

## TF-IDF
---

In [12]:
import sklearn
import nltk

from sklearn.feature_extraction.text import CountVectorizer

In [13]:
def preprocess(corpus):
  """Return a copy of the corpus with every "." replaced by a space.

  Args:
    corpus: Iterable of document strings.

  Returns:
    List with one cleaned string per input document, in the same order.
  """
  return [re.sub("[.]", " ", text) for text in corpus]

Pré-processamento do corpus

In [14]:
# Replace the dots in every document with spaces and display the result.
preprocessed_corpus = preprocess(corpus)
preprocessed_corpus

['alt atheism',
 'comp graphics',
 'comp os ms-windows misc',
 'comp sys ibm pc hardware',
 'comp sys mac hardware',
 'comp windows x',
 'misc forsale',
 'rec autos',
 'rec motorcycles',
 'rec sport baseball',
 'rec sport hockey',
 'sci crypt',
 'sci electronics',
 'sci med',
 'sci space',
 'soc religion christian',
 'talk politics guns',
 'talk politics mideast',
 'talk politics misc',
 'talk religion misc']

Montando o count vector

In [15]:
# Build the document-term count matrix with scikit-learn's default tokenizer.
vectorizer = CountVectorizer()
docTermMatrix = vectorizer.fit_transform(preprocessed_corpus)
terms = vectorizer.get_feature_names_out()
# .toarray() is the documented dense conversion; the `.A` shorthand is not
# available on every SciPy sparse type/version.
df_vectorized_corpus = pd.DataFrame(docTermMatrix.toarray(), columns=terms)
df_vectorized_corpus

Unnamed: 0,alt,atheism,autos,baseball,christian,comp,crypt,electronics,forsale,graphics,...,politics,rec,religion,sci,soc,space,sport,sys,talk,windows
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


Computando a matriz TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with TF-IDF using the transformer's default settings.
transformer = TfidfTransformer()
tfIdfMatrix = transformer.fit_transform(docTermMatrix)
# .toarray() is the documented dense conversion; the `.A` shorthand is not
# available on every SciPy sparse type/version.
df_tf_idf = pd.DataFrame(tfIdfMatrix.toarray(), columns=terms)
df_tf_idf

Unnamed: 0,alt,atheism,autos,baseball,christian,comp,crypt,electronics,forsale,graphics,...,politics,rec,religion,sci,soc,space,sport,sys,talk,windows
0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.55787,0.0,0.0,0.0,0.829928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.347005,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.453774
3,0.0,0.0,0.0,0.0,0.0,0.336214,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.439663,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.388272,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.507739,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.607451,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.794357
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.808998,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.808998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.587812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.587812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.659294,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.479038,0.0,0.0,0.0,0.0,0.579529,0.0,0.0,0.0


In [17]:
# Save the TF-IDF table as plain text.
with open('20News_03.txt', 'w') as tf_idf_file:
  tf_idf_file.write(df_tf_idf.to_string())

## N-grams (2-grams)
---

In [18]:
# Display the corpus that feeds the bigram vectorizer below.
preprocessed_corpus

['alt atheism',
 'comp graphics',
 'comp os ms-windows misc',
 'comp sys ibm pc hardware',
 'comp sys mac hardware',
 'comp windows x',
 'misc forsale',
 'rec autos',
 'rec motorcycles',
 'rec sport baseball',
 'rec sport hockey',
 'sci crypt',
 'sci electronics',
 'sci med',
 'sci space',
 'soc religion christian',
 'talk politics guns',
 'talk politics mideast',
 'talk politics misc',
 'talk religion misc']

In [19]:
# Count bigrams only: ngram_range=(2, 2) sets both the minimum and the maximum n to 2.
# Note that `vectorizer` and `docTermMatrix` from the earlier cells are rebound here.
vectorizer = CountVectorizer(ngram_range=(2, 2))
docTermMatrix = vectorizer.fit_transform(preprocessed_corpus)
vocabulary = vectorizer.get_feature_names_out()
# .toarray() is the documented dense conversion; the `.A` shorthand is not
# available on every SciPy sparse type/version.
df_vocab = pd.DataFrame(docTermMatrix.toarray(), columns=vocabulary)
df_vocab

Unnamed: 0,alt atheism,comp graphics,comp os,comp sys,comp windows,ibm pc,mac hardware,misc forsale,ms windows,os ms,...,sci med,sci space,soc religion,sport baseball,sport hockey,sys ibm,sys mac,talk politics,talk religion,windows misc
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [20]:
# Save the bigram count table as plain text.
with open("20News_04.txt", "w") as ngrams_file:
  ngrams_file.write(df_vocab.to_string())

## Co-occurrence Vectors
---

Montando o count vector

In [21]:
# Display the corpus that feeds the co-occurrence computation below.
preprocessed_corpus

['alt atheism',
 'comp graphics',
 'comp os ms-windows misc',
 'comp sys ibm pc hardware',
 'comp sys mac hardware',
 'comp windows x',
 'misc forsale',
 'rec autos',
 'rec motorcycles',
 'rec sport baseball',
 'rec sport hockey',
 'sci crypt',
 'sci electronics',
 'sci med',
 'sci space',
 'soc religion christian',
 'talk politics guns',
 'talk politics mideast',
 'talk politics misc',
 'talk religion misc']

In [22]:
# Rebuild the unigram document-term matrix; `vectorizer` and `docTermMatrix`
# were rebound to the bigram versions by an earlier cell.
vectorizer = CountVectorizer()
docTermMatrix = vectorizer.fit_transform(preprocessed_corpus)
vocabulary = vectorizer.get_feature_names_out()
# .toarray() is the documented dense conversion; the `.A` shorthand is not
# available on every SciPy sparse type/version.
df_co_vocab = pd.DataFrame(docTermMatrix.toarray(), columns=vocabulary)
df_co_vocab

Unnamed: 0,alt,atheism,autos,baseball,christian,comp,crypt,electronics,forsale,graphics,...,politics,rec,religion,sci,soc,space,sport,sys,talk,windows
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


Visualizando o vocabulário obtido

In [23]:
# Display the fitted vectorizer's term -> integer index mapping.
vectorizer.vocabulary_

{'alt': 0,
 'atheism': 1,
 'comp': 5,
 'graphics': 9,
 'os': 20,
 'ms': 19,
 'windows': 31,
 'misc': 17,
 'sys': 29,
 'ibm': 13,
 'pc': 21,
 'hardware': 11,
 'mac': 14,
 'forsale': 8,
 'rec': 23,
 'autos': 2,
 'motorcycles': 18,
 'sport': 28,
 'baseball': 3,
 'hockey': 12,
 'sci': 25,
 'crypt': 6,
 'electronics': 7,
 'med': 15,
 'space': 27,
 'soc': 26,
 'religion': 24,
 'christian': 4,
 'talk': 30,
 'politics': 22,
 'guns': 10,
 'mideast': 16}

In [24]:
# Term-by-term co-occurrence counts: (terms x docs) @ (docs x terms).
# `@` is always a matrix product, whereas `*` is only a matrix product on the
# legacy sparse-matrix classes and element-wise on sparse arrays.
co_occurence_matrix = docTermMatrix.T @ docTermMatrix
print(co_occurence_matrix)

  (1, 0)	1
  (0, 0)	1
  (1, 1)	1
  (0, 1)	1
  (2, 2)	1
  (23, 2)	1
  (3, 3)	1
  (28, 3)	1
  (23, 3)	1
  (4, 4)	1
  (24, 4)	1
  (26, 4)	1
  (14, 5)	1
  (11, 5)	2
  (21, 5)	1
  (13, 5)	1
  (29, 5)	2
  (17, 5)	1
  (31, 5)	2
  (19, 5)	1
  (20, 5)	1
  (9, 5)	1
  (5, 5)	5
  (6, 6)	1
  (25, 6)	1
  :	:
  (24, 26)	1
  (26, 26)	1
  (27, 27)	1
  (25, 27)	1
  (12, 28)	1
  (3, 28)	1
  (28, 28)	2
  (23, 28)	2
  (14, 29)	1
  (11, 29)	2
  (21, 29)	1
  (13, 29)	1
  (29, 29)	2
  (5, 29)	2
  (24, 30)	1
  (17, 30)	2
  (16, 30)	1
  (10, 30)	1
  (22, 30)	3
  (30, 30)	4
  (17, 31)	1
  (31, 31)	2
  (19, 31)	1
  (20, 31)	1
  (5, 31)	2


Normalizando pela diagonal

In [25]:
import scipy.sparse as sp
# Diagonal matrix holding the reciprocal of each diagonal entry; left-multiplying
# by it divides every row i of the co-occurrence matrix by its entry (i, i).
g = sp.diags(1./co_occurence_matrix.diagonal())
# `@` is always a matrix product, regardless of the sparse container type.
co_occurence_matrix_normalized = g @ co_occurence_matrix
# NOTE(review): this zeroes the diagonal of the UN-normalized matrix, after the
# normalized matrix was already computed, so the normalized result printed
# below keeps its diagonal -- confirm whether the normalized matrix was the
# intended target.
co_occurence_matrix.setdiag(0)
print(co_occurence_matrix_normalized.todense())

[[1. 1. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


Salvando os resultados

In [26]:
def matrix_2_str(m):
  """Render a 2-D collection as text, one row per line.

  Every element is followed by a single space and every row by a newline.

  Args:
    m: Iterable of rows, e.g. nested lists, a list of 1-D arrays, a 2-D
      array or an ``np.matrix``.

  Returns:
    The rendered text; an empty string when ``m`` has no rows.
  """
  rendered_rows = []
  for line in m:
    if isinstance(line, np.matrix):
      # Iterating an np.matrix yields 1xN matrices, and iterating one of those
      # yields the whole row again; flatten it to reach the scalar elements.
      line = np.asarray(line).ravel()
    rendered_rows.append("".join(str(el) + " " for el in line) + "\n")
  return "".join(rendered_rows)

In [27]:
# Convert the normalized co-occurrence matrix to text and save it.
str_matrix = matrix_2_str(co_occurence_matrix_normalized.todense())
with open('20News_05.txt', 'w') as co_file:
  co_file.write(str_matrix)

## Word2Vec
---

In [28]:
# Display the corpus that feeds the spaCy pipeline below.
preprocessed_corpus

['alt atheism',
 'comp graphics',
 'comp os ms-windows misc',
 'comp sys ibm pc hardware',
 'comp sys mac hardware',
 'comp windows x',
 'misc forsale',
 'rec autos',
 'rec motorcycles',
 'rec sport baseball',
 'rec sport hockey',
 'sci crypt',
 'sci electronics',
 'sci med',
 'sci space',
 'soc religion christian',
 'talk politics guns',
 'talk politics mideast',
 'talk politics misc',
 'talk religion misc']

In [29]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline.
nlp = spacy.load('en_core_web_sm')
# One vector per document, read from the `.vector` of each processed text.
# NOTE(review): the section is titled Word2Vec, but no Word2Vec model is trained
# or loaded in this cell -- confirm that these spaCy vectors are what is wanted.
bag_vector = [nlp(sentence).vector for sentence in preprocessed_corpus]
print(bag_vector)

[array([-0.7954582 , -0.98655176,  0.0515427 ,  0.13508093, -0.15959325,
        0.2586966 ,  1.200045  ,  0.76451427,  0.03854417,  0.14450514,
        0.04850918, -0.17817537, -0.25338086, -0.19959874, -0.30313468,
       -0.12096608, -0.5854509 , -0.79546654,  0.47014832,  0.57572126,
        0.10127136,  1.0037248 , -1.1172953 , -0.60414684,  0.37199104,
        0.3180685 ,  0.07613146,  0.8745186 , -0.35697716,  0.22044638,
        0.3478574 , -0.01097913,  0.04112068,  0.05879265, -0.07342099,
       -0.49622858, -0.52929765, -0.40501   ,  0.3220029 , -0.8892287 ,
       -0.8088497 ,  0.15526013, -0.44938636,  0.62076783, -0.5475764 ,
       -0.5797746 ,  0.9262902 , -0.07025288, -0.11892524,  0.18795982,
       -0.25364587,  0.86911726, -0.41536444, -0.0911693 , -0.09557682,
        0.63764817,  0.9634266 ,  0.14100564, -0.2101104 ,  0.06945258,
       -1.0214891 , -0.42115688, -0.4206717 ,  0.32856828,  0.44711667,
       -0.35674864, -0.01652592,  0.12531166, -0.31684184, -0.4

Salvando os resultados

In [30]:
# Convert the document vectors to text, one document per row, and save them.
str_matrix = matrix_2_str(bag_vector)
# The handle name `co_file` is the same one used when saving the co-occurrence
# matrix; here it refers to the file holding the spaCy vectors.
with open('20News_06.txt', 'w') as co_file:
  co_file.write(str_matrix)