# Notícias G1 - 3. TF-IDF, Separação Massas para Treino, Teste e Classificação, Matrizes Esparsas

## Imports:

Importação das bibliotecas usadas com frequência no código.

In [1]:
import pandas as pd
import numpy as np

## Recuperando Dados Gravados:

Recuperando dados já tratados com higienização, stop words e stemmer.

In [2]:
# Load the news data set that was already cleaned in the previous step and
# show its dimensions followed by a preview of the first rows.
noticias = pd.read_csv(filepath_or_buffer='noticias_g1_higienizadas.csv')
print(noticias.shape)
noticias.head(n=5)

(3834, 5)


Unnamed: 0,titulo,url,texto,categoria,dado_tratado
0,Entenda como uma ampla coalizão tirou Netanyah...,https://g1.globo.com/mundo/noticia/2021/06/13/...,O Parlamento de Israel confirmou neste doming...,FALTANDO,"['entend', 'ampl', 'coaliz', 'tir', 'netanyahu..."
1,"Milionário de direita: quem é Naftali Bennett,...",https://g1.globo.com/mundo/noticia/2021/06/13/...,"Milionário do setor de tecnologia, Naftali Be...",FALTANDO,"['milion', 'direit', 'naftal', 'bennett', 'nov..."
2,Governo de SP antecipa datas de vacinação; vej...,https://g1.globo.com/sp/sao-paulo/noticia/2021...,O governo de São Paulo antecipou em 30 dias o...,FALTANDO,"['govern', 'sp', 'antecip', 'dat', 'vacin', 'v..."
3,PMs que mataram homens dentro de carro em SP s...,https://g1.globo.com/sp/sao-paulo/noticia/2021...,A Polícia Militar instaurou um inquérito poli...,FALTANDO,"['pm', 'mat', 'hom', 'dentr', 'carr', 'sp', 'a..."
4,"Após Venezuela e Bolívia, Colômbia confirma 2 ...",https://ge.globo.com/futebol/copa-america/noti...,A Federação Colombiana de Futebol informou ne...,diversos,"['após', 'venezuel', 'bolív', 'colômb', 'confi..."


### Calculando TF-IDF:

Vetorização, cálculo de frequência e relevância das palavras.
O resultado será um array com o vocabulário (todas as palavras do problema) e uma matriz esparsa com o TF-IDF de todas essas palavras.

In [3]:
# Imports:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per news item. 'dado_tratado' stores the stringified token list
# (e.g. "['entend', 'ampl', ...]"); the vectorizer's default token pattern only
# keeps word characters, so brackets, quotes and commas are discarded.
# Reading the column directly avoids building one Series per row and does not
# depend on the index labels being 0..n-1.
texts = noticias['dado_tratado'].astype(str).tolist()

# TF-IDF:
min_df = 0.01  # drop terms that occur in fewer than 1% of the documents
max_df = 0.99  # drop terms that occur in more than 99% of the documents (leftover stop words)
vectorizer_tfidf = TfidfVectorizer(min_df=min_df, max_df=max_df)
noticias_tf_idf = vectorizer_tfidf.fit_transform(texts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only when the replacement is not available.
if hasattr(vectorizer_tfidf, 'get_feature_names_out'):
    vocabulario = np.asarray(vectorizer_tfidf.get_feature_names_out(), dtype=str)
else:
    vocabulario = np.array(vectorizer_tfidf.get_feature_names())

# Result: the vocabulary and the (documents x terms) TF-IDF matrix.
print(vocabulario)
pd.DataFrame.sparse.from_spmatrix(noticias_tf_idf)

['abaix' 'abal' 'abandon' ... 'ônibu' 'últ' 'únic']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2432,2433,2434,2435,2436,2437,2438,2439,2440,2441
0,0.0,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0461,0.000000,0.048245,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.066671,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.029485,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0000,0.062474,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3829,0.0,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.048493,0.0,0.089484,0.0
3830,0.0,0.0,0.052041,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.024037,0.0
3831,0.0,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3832,0.0,0.0,0.000000,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.234776,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


## Separando Massa de Treino e Classificação:

A massa de dados contendo todas as notícias será separada em duas matrizes esparsas, sendo:
* uma com as categorias já conhecidas, que será utilizada para treino;
* outra com as notícias cujas categorias desconhecemos, que será usada para classificação.

In [4]:
from scipy.sparse import coo_matrix, vstack

# Boolean mask over the rows of `noticias`: True where the category is still
# unknown (marked 'FALTANDO'), False where a label is available for training.
sem_categoria = (noticias['categoria'] == 'FALTANDO').to_numpy()

# Row selection with a boolean mask needs CSR; the results are converted back
# to COO so the variables keep the matrix format used by the following cells.
tfidf_csr = noticias_tf_idf.tocsr()

# Splitting with the mask replaces the former row-by-row vstack/append loop.
# That loop seeded each matrix with an empty row and then deleted row index 1
# instead of 0, so the empty row stayed and the first real row was lost
# (leaving the first training label paired with an all-zero vector).
sm_tfidf_noticias_classificar = coo_matrix(tfidf_csr[sem_categoria])
sm_tfidf_noticias_treino = coo_matrix(tfidf_csr[~sem_categoria])
sm_categorias_treino = noticias.loc[~sem_categoria, 'categoria'].to_numpy(dtype=str)

# Original rows (and index labels) of the news awaiting classification.
# Columns are sorted by name to keep the layout the append-based version
# produced, so the CSV written later has the same column order.
df_noticias_classificar = noticias.loc[sem_categoria].sort_index(axis=1).copy()

# Totals:
print(sm_tfidf_noticias_classificar.shape, sm_tfidf_noticias_treino.shape, sm_categorias_treino.shape)

(2111, 2442) (1723, 2442) (1723,)


## Separando Massa de Treino e Teste

Separando a massa com categorias conhecidas em um conjunto de treino (70%) e um conjunto de teste (30%).

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for testing; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    sm_tfidf_noticias_treino,
    sm_categorias_treino,
    random_state=42,
    test_size=0.30,
)
print(*(parte.shape for parte in (X_train, X_test, y_train, y_test)))

(1206, 2442) (517, 2442) (1206,) (517,)


## Tratando Dados:

Transformando as matrizes esparsas em DataFrames para tratar e gravar os dados em CSV.
O tratamento será a substituição de valores nulos por zero.

In [6]:
# Labels: each split's categories become a single-column ('cat') frame so they
# can be written to CSV next to the feature matrices.
df_categorias_teste = pd.DataFrame(y_test, columns=['cat'])
df_categorias_treino = pd.DataFrame(y_train, columns=['cat'])

# Features: the classification and training matrices stay sparse-backed,
# while the test matrix is expanded to a dense frame.
df_tfidf_noticias_teste = pd.DataFrame(X_test.toarray())
df_tfidf_noticias_treino = pd.DataFrame.sparse.from_spmatrix(X_train)
df_tfidf_noticias_classificar = pd.DataFrame.sparse.from_spmatrix(
    sm_tfidf_noticias_classificar
)

# Replace any missing value with zero, in place.
df_tfidf_noticias_classificar.fillna(value=0, inplace=True)
df_tfidf_noticias_treino.fillna(value=0, inplace=True)
df_tfidf_noticias_teste.fillna(value=0, inplace=True)

# Totals:
print(*(
    quadro.shape
    for quadro in (
        df_tfidf_noticias_classificar,
        df_tfidf_noticias_treino,
        df_categorias_treino,
        df_tfidf_noticias_teste,
        df_categorias_teste,
    )
))

(2111, 2442) (1206, 2442) (1206, 1) (517, 2442) (517, 1)


## Gravação dos Dados:

Gravando os dados em CSV para posterior utilização.

In [7]:
# Persist every artefact as CSV (without the index column) for the next steps.
# The pairs are written in the order listed.
for _quadro, _destino in (
    (df_noticias_classificar, "noticias_g1_noticias_classificar.csv"),
    (df_tfidf_noticias_classificar, "noticias_g1_tfidf_massa_classificar.csv"),
    (df_tfidf_noticias_treino, "noticias_g1_tfidf_massa_treino.csv"),
    (df_tfidf_noticias_teste, "noticias_g1_tfidf_massa_teste.csv"),
    (df_categorias_treino, "noticias_g1_categorias_treino.csv"),
    (df_categorias_teste, "noticias_g1_categorias_teste.csv"),
    (pd.DataFrame(vocabulario), "noticias_g1_vocabulario.csv"),
):
    _quadro.to_csv(_destino, index=False)
print("Dados gravados com sucesso!")

Dados gravados com sucesso!


In [8]:
# Display the news rows that were set aside for classification.
df_noticias_classificar

Unnamed: 0,categoria,dado_tratado,texto,titulo,url
0,FALTANDO,"['entend', 'ampl', 'coaliz', 'tir', 'netanyahu...",O Parlamento de Israel confirmou neste doming...,Entenda como uma ampla coalizão tirou Netanyah...,https://g1.globo.com/mundo/noticia/2021/06/13/...
1,FALTANDO,"['milion', 'direit', 'naftal', 'bennett', 'nov...","Milionário do setor de tecnologia, Naftali Be...","Milionário de direita: quem é Naftali Bennett,...",https://g1.globo.com/mundo/noticia/2021/06/13/...
2,FALTANDO,"['govern', 'sp', 'antecip', 'dat', 'vacin', 'v...",O governo de São Paulo antecipou em 30 dias o...,Governo de SP antecipa datas de vacinação; vej...,https://g1.globo.com/sp/sao-paulo/noticia/2021...
3,FALTANDO,"['pm', 'mat', 'hom', 'dentr', 'carr', 'sp', 'a...",A Polícia Militar instaurou um inquérito poli...,PMs que mataram homens dentro de carro em SP s...,https://g1.globo.com/sp/sao-paulo/noticia/2021...
5,FALTANDO,"['rainh', 'elizabeth', 'ii', 'receb', 'biden',...",A rainha Elizabeth II recebeu neste domingo (...,Rainha Elizabeth II recebe Biden e a mulher no...,https://g1.globo.com/mundo/noticia/2021/06/13/...
...,...,...,...,...,...
3825,FALTANDO,"['eua', 'enfrent', 'ond', 'cal', 'histór', 'pi...",Os Estados Unidos atravessam uma seca históri...,EUA enfrentam onda de calor histórica e pior s...,https://g1.globo.com/mundo/noticia/2021/06/29/...
3828,FALTANDO,"['desab', 'flór', 'ges', 'diss', 'abril', 'dan...",Uma carta escrita pelo presidente da associaç...,Desabamento na Flórida: gestor disse em abril ...,https://g1.globo.com/mundo/noticia/2021/06/29/...
3830,FALTANDO,"['psiquiatr', 'conden', 'ano', 'pris', 'abus',...",Um médico psiquiatra foi condenado a mais de ...,Psiquiatra é condenado a mais de 20 anos de pr...,https://g1.globo.com/rj/rio-de-janeiro/noticia...
3831,FALTANDO,"['alpin', 'escal', 'mont', 'everest', 'mãe', '...",Jess Wedel foi diagnosticada com câncer no ov...,Alpinista escala Monte Everest com a mãe de 61...,https://g1.globo.com/olha-que-legal/noticia/20...


In [9]:
# Display the TF-IDF frame built from sm_tfidf_noticias_classificar.
df_tfidf_noticias_classificar

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2432,2433,2434,2435,2436,2437,2438,2439,2440,2441
0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.000000,0.0,0.000000,0.0,0.0,0.046100,0.000000,0.048245,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066671,0.0
2,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029485,0.0
3,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.062474,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.000000,0.0,0.000000,0.0,0.0,0.057787,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2107,0.091835,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026349,0.0
2108,0.000000,0.0,0.052041,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024037,0.0
2109,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [10]:
# Display the TF-IDF frame built from the training split (X_train).
df_tfidf_noticias_treino

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2432,2433,2434,2435,2436,2437,2438,2439,2440,2441
0,0.000000,0.0,0.0,0.0,0.0,0.055672,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.040671,0.0
3,0.023266,0.0,0.0,0.0,0.0,0.055388,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.038133,0.0
1202,0.034634,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.048466,0.0,0.000000,0.0
1203,0.000000,0.0,0.0,0.0,0.0,0.041189,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1204,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


In [11]:
# Display the dense TF-IDF frame built from the test split (X_test).
df_tfidf_noticias_teste

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2432,2433,2434,2435,2436,2437,2438,2439,2440,2441
0,0.047540,0.0,0.000000,0.0,0.0,0.000000,0.0,0.029611,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.020460,0.029569
1,0.000000,0.0,0.000000,0.0,0.0,0.020134,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.043678,0.000000
2,0.000000,0.0,0.026758,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.024718,0.000000
3,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.038122,0.000000
4,0.019737,0.0,0.000000,0.0,0.0,0.117471,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.02762,0.0,0.016989,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.037584,0.030919,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.012985,0.000000
513,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.030343
514,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.051870,0.000000
515,0.027759,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.161149,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.047787,0.000000


In [12]:
# Display the training labels (single 'cat' column built from y_train).
df_categorias_treino

Unnamed: 0,cat
0,pop-arte
1,saude
2,pop-arte
3,politica
4,economia
...,...
1201,economia
1202,politica
1203,economia
1204,economia


In [13]:
# Display the test labels (single 'cat' column built from y_test).
df_categorias_teste

Unnamed: 0,cat
0,politica
1,economia
2,economia
3,economia
4,politica
...,...
512,educacao
513,saude
514,economia
515,politica
