In [4]:
import pandas as pd

# Load the train and test splits from CSV files in the sibling Data directory.
train_data = pd.read_csv('../Data/train.csv')
test_data = pd.read_csv('../Data/test.csv')

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,title,text,date,category,subcategory,link
0,"Após polêmica, Marine Le Pen diz que abomina n...",A candidata da direita nacionalista à Presidên...,2017-04-28,mundo,,http://www1.folha.uol.com.br/mundo/2017/04/187...
1,"Macron e Le Pen vão ao 2º turno na França, em ...",O centrista independente Emmanuel Macron e a d...,2017-04-23,mundo,,http://www1.folha.uol.com.br/mundo/2017/04/187...
2,"Apesar de larga vitória nas legislativas, Macr...",As eleições legislativas deste domingo (19) na...,2017-06-19,mundo,,http://www1.folha.uol.com.br/mundo/2017/06/189...
3,"Governo antecipa balanço, e Alckmin anuncia qu...",O número de ocorrências de homicídios dolosos ...,2015-07-24,cotidiano,,http://www1.folha.uol.com.br/cotidiano/2015/07...
4,"Após queda em maio, a atividade econômica sobe...","A economia cresceu 0,25% no segundo trimestre,...",2017-08-17,mercado,,http://www1.folha.uol.com.br/mercado/2017/08/1...


In [2]:
# Lower-cased training titles, fed to spaCy in the next cell.
# Kept as a list rather than a one-shot generator: a generator is exhausted after
# the first pass, so re-running the spaCy cell on its own would silently process
# zero titles. A list makes that cell idempotent.
text_to_process = [title.lower() for title in train_data['title']]

In [3]:
import spacy

# Load the spaCy pipeline whose docs are passed to process_text (via nlp.pipe below).
nlp = spacy.load("pt_core_news_sm")

def process_text(doc):
    """Reduce a tokenised document to a space-joined string of content words.

    A token is kept when it is not a stop word and is purely alphabetic.

    Args:
        doc: Iterable of tokens exposing ``is_stop``, ``is_alpha`` and ``text``.

    Returns:
        The kept token texts joined by single spaces when more than two tokens
        survive the filter; otherwise ``None``.
    """
    kept = [tok.text for tok in doc if not tok.is_stop and tok.is_alpha]
    # Titles with two or fewer surviving tokens yield None instead of a string.
    return " ".join(kept) if len(kept) > 2 else None

# Stream the titles through spaCy in batches of 1000 and clean each resulting doc.
# process_text returns None when two or fewer tokens survive, so this list can
# contain None entries.
text_processed = [process_text(doc) for doc in nlp.pipe(text_to_process, 
                                                            batch_size = 1000,
                                                            n_process = -1)]

# One-column frame of cleaned titles; print its shape and preview the first rows.
text_processed_df = pd.DataFrame({'title': text_processed})
print(text_processed_df.shape)
text_processed_df.head()

(90000, 1)


Unnamed: 0,title
0,polêmica marine le pen abomina negacionistas h...
1,macron le pen turno frança revés siglas tradic...
2,apesar larga vitória legislativas macron terá ...
3,governo antecipa balanço alckmin anuncia queda...
4,queda maio atividade econômica sobe junho bc


In [4]:
# Remove missing titles (the None values produced by process_text) and exact
# duplicate titles, then print the row counts before and after.
text_processed_df_v2 = text_processed_df.dropna().drop_duplicates()
print(len(text_processed_df))
print(len(text_processed_df_v2))

90000
84466


## CBOW Model

In [5]:
import logging
from gensim.models import Word2Vec

# Split each cleaned, space-joined title back into a list of tokens
# (one token list per title) for Word2Vec.
tokens_list_list = [title.split(" ") for title  in text_processed_df_v2.title]

# Route gensim's INFO-level progress messages to the cell output.
logging.basicConfig(format="%(asctime)s : - %(message)s", level=logging.INFO)

# sg = 0 selects the CBOW architecture (see the section heading).
# NOTE(review): no seed is passed here, so the trained vectors and the
# similarity scores shown below are presumably not reproducible run to run —
# consider setting one.
w2v_model = Word2Vec(sg = 0, 
                     window = 5,
                     vector_size = 300,
                     min_count = 5,
                     alpha = 0.03,
                     min_alpha = 0.007)


# Build the vocabulary from the token lists, logging progress every 5000 sentences.
w2v_model.build_vocab(tokens_list_list, progress_per = 5000)

2023-11-12 11:47:43,702 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-11-12T11:47:43.702448', 'gensim': '4.3.2', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-11-12 11:47:43,715 : - collecting all words and their counts
2023-11-12 11:47:43,717 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-11-12 11:47:43,731 : - PROGRESS: at sentence #5000, processed 31930 words, keeping 10193 word types
2023-11-12 11:47:43,746 : - PROGRESS: at sentence #10000, processed 63848 words, keeping 14989 word types
2023-11-12 11:47:43,756 : - PROGRESS: at sentence #15000, processed 95753 words, keeping 18279 word types
2023-11-12 11:47:43,766 : - PROGRESS: at sentence #20000, processed 127689 words, keeping 21033 word types
2023-11-12 11:47:43,788 : - PROGRESS: at sentence #25000, processed 159589 words, keep

In [6]:
# corpus_count is read from the model after build_vocab and passed to train()
# as total_examples.
samples = w2v_model.corpus_count

# Train the CBOW model for 30 epochs over the same token lists.
w2v_model.train(tokens_list_list, 
                total_examples = samples,
                epochs = 30)


2023-11-12 11:47:44,627 : - Word2Vec lifecycle event {'msg': 'training model with 3 workers on 12924 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-11-12T11:47:44.627669', 'gensim': '4.3.2', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-11-12 11:47:45,612 : - EPOCH 0: training on 540242 raw words (486148 effective words) took 0.9s, 511977 effective words/s
2023-11-12 11:47:46,627 : - EPOCH 1 - PROGRESS: at 98.14% examples, 474804 words/s, in_qsize 1, out_qsize 1
2023-11-12 11:47:46,641 : - EPOCH 1: training on 540242 raw words (486287 effective words) took 1.0s, 481498 effective words/s
2023-11-12 11:47:47,394 : - EPOCH 2: training on 540242 raw words (486137 effective words) took 0.7s, 651001 effective words/s
2023-11-12 11:47:48,281 : - EPOCH 3: training on 540242 raw words (486005 effective words) took 

(14584530, 16207260)

In [7]:
# Sanity check: list the words closest to 'apple' in the CBOW embedding.
w2v_model.wv.most_similar('apple')

[('iphone', 0.6156098246574402),
 ('google', 0.6147168278694153),
 ('samsung', 0.5877225995063782),
 ('amazon', 0.5694668889045715),
 ('tesla', 0.5635470747947693),
 ('microsoft', 0.5556092858314514),
 ('sony', 0.5464751124382019),
 ('renault', 0.5459845066070557),
 ('invepar', 0.5425215363502502),
 ('yahoo', 0.5413187146186829)]

In [8]:
# Export the CBOW word vectors in the text (non-binary) word2vec format.
# NOTE(review): presumably requires the models/ directory to exist already — confirm.
w2v_model.wv.save_word2vec_format('models/cbow_model.txt', binary = False)

2023-11-12 11:48:16,956 : - storing 12924x300 projection weights into models/cbow_model.txt


## Skip-gram Model

In [9]:
# Same logging setup as the CBOW section, repeated so this section can be read on its own.
logging.basicConfig(format="%(asctime)s : - %(message)s", level=logging.INFO)

# Rebuilt with the same expression as in the CBOW section: one token list per cleaned title.
tokens_list_list = [title.split(" ") for title  in text_processed_df_v2.title]

# sg = 1 selects the skip-gram architecture; every other hyper-parameter matches
# the CBOW model so the two can be compared.
# NOTE(review): no seed is passed here either — results presumably vary between runs.
w2v_model_sg = Word2Vec(sg = 1, 
                     window = 5,
                     vector_size = 300,
                     min_count = 5,
                     alpha = 0.03,
                     min_alpha = 0.007)

# Build the vocabulary from the token lists, logging progress every 5000 sentences.
w2v_model_sg.build_vocab(tokens_list_list, progress_per = 5000)

2023-11-12 11:48:24,426 : - Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-11-12T11:48:24.426624', 'gensim': '4.3.2', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2023-11-12 11:48:24,427 : - collecting all words and their counts
2023-11-12 11:48:24,427 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-11-12 11:48:24,443 : - PROGRESS: at sentence #5000, processed 31930 words, keeping 10193 word types
2023-11-12 11:48:24,467 : - PROGRESS: at sentence #10000, processed 63848 words, keeping 14989 word types
2023-11-12 11:48:24,483 : - PROGRESS: at sentence #15000, processed 95753 words, keeping 18279 word types
2023-11-12 11:48:24,493 : - PROGRESS: at sentence #20000, processed 127689 words, keeping 21033 word types
2023-11-12 11:48:24,508 : - PROGRESS: at sentence #25000, processed 159589 words, keep

In [10]:
# corpus_count is read from the skip-gram model after build_vocab and passed to
# train() as total_examples. Note this rebinds `samples` from the CBOW section.
samples = w2v_model_sg.corpus_count

# Train the skip-gram model for 30 epochs over the same token lists.
w2v_model_sg.train(tokens_list_list, 
                total_examples = samples,
                epochs = 30)

2023-11-12 11:48:25,404 : - Word2Vec lifecycle event {'msg': 'training model with 3 workers on 12924 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-11-12T11:48:25.404730', 'gensim': '4.3.2', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-11-12 11:48:26,442 : - EPOCH 0 - PROGRESS: at 31.50% examples, 150947 words/s, in_qsize 6, out_qsize 1
2023-11-12 11:48:27,510 : - EPOCH 0 - PROGRESS: at 74.06% examples, 173325 words/s, in_qsize 5, out_qsize 0
2023-11-12 11:48:28,262 : - EPOCH 0: training on 540242 raw words (486052 effective words) took 2.8s, 171902 effective words/s
2023-11-12 11:48:29,283 : - EPOCH 1 - PROGRESS: at 40.79% examples, 196715 words/s, in_qsize 5, out_qsize 0
2023-11-12 11:48:30,343 : - EPOCH 1 - PROGRESS: at 96.23% examples, 226891 words/s, in_qsize 3, out_qsize 0
2023-11-12 11:48:30,366 : 

(14584638, 16207260)

In [11]:
# Sanity check: list the words closest to 'google' in the skip-gram embedding.
w2v_model_sg.wv.most_similar('google')

[('reguladores', 0.4149758517742157),
 ('android', 0.4046587646007538),
 ('apple', 0.3919309675693512),
 ('waze', 0.3781903088092804),
 ('toshiba', 0.36099085211753845),
 ('concorda', 0.3560527265071869),
 ('buffett', 0.35300764441490173),
 ('verizon', 0.3490554690361023),
 ('patente', 0.3490547239780426),
 ('navais', 0.34103327989578247)]

In [12]:
# Export the skip-gram word vectors in the text (non-binary) word2vec format.
# NOTE(review): presumably requires the models/ directory to exist already — confirm.
w2v_model_sg.wv.save_word2vec_format('models/skip_gram_model.txt', binary = False)

2023-11-12 11:49:23,598 : - storing 12924x300 projection weights into models/skip_gram_model.txt


In [13]:
from gensim.models import KeyedVectors

# Reload both embeddings from the text files written above, as KeyedVectors.
# NOTE(review): the second line rebinds `w2v_model_sg`. Up to this point that name
# held the trained Word2Vec model (accessed through `.wv` in earlier cells); from
# here on it holds the loaded vectors. Re-running the earlier skip-gram cells
# after this one will see a different kind of object — consider a distinct name.
w2v_model_cbow = KeyedVectors.load_word2vec_format("models/cbow_model.txt")
w2v_model_sg = KeyedVectors.load_word2vec_format("models/skip_gram_model.txt")

2023-11-12 11:49:29,489 : - loading projection weights from models/cbow_model.txt
2023-11-12 11:49:35,472 : - KeyedVectors lifecycle event {'msg': 'loaded (12924, 300) matrix of type float32 from models/cbow_model.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-11-12T11:49:35.472629', 'gensim': '4.3.2', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'load_word2vec_format'}
2023-11-12 11:49:35,485 : - loading projection weights from models/skip_gram_model.txt
2023-11-12 11:49:40,648 : - KeyedVectors lifecycle event {'msg': 'loaded (12924, 300) matrix of type float32 from models/skip_gram_model.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-11-12T11:49:40.648600', 'gensim': '4.3.2', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'load_word2vec_format'}


Tokenization Function

In [14]:
# Reload the pipeline with components that `tokenizer` does not need switched off:
# it only reads the is_stop / is_alpha flags and the token text.
# Fix: "parser" was misspelled as "paser", so the dependency parser was not among
# the disabled names and presumably still ran on every title.
nlp = spacy.load("pt_core_news_sm", disable=["parser", "ner", "tagger", "textcat"])

def tokenizer(text):
    """Tokenise ``text`` and return its lower-cased content words.

    The text is run through the module-level ``nlp`` pipeline. Only tokens that
    are not stop words and are purely alphabetic are kept.

    Args:
        text: Raw string to tokenise.

    Returns:
        List of lower-cased token strings, in document order.
    """
    return [
        tok.text.lower()
        for tok in nlp(text)
        if not tok.is_stop and tok.is_alpha
    ]

Word Vector Combination Function

In [15]:
import numpy as np

def vector_combination_by_sum(words, model):
    """Sum the embedding vectors of ``words`` into a single row vector.

    Words for which ``model.get_vector`` raises ``KeyError`` are skipped and
    contribute nothing to the sum.

    Args:
        words: Iterable of token strings.
        model: Object exposing ``get_vector(word)``. If it also exposes
            ``vector_size``, that value sets the width of the result;
            otherwise the width falls back to 300.

    Returns:
        ``numpy.ndarray`` of shape ``(1, dim)``; all zeros when no word is found.
    """
    # Read the width from the model rather than assuming 300, so embeddings of
    # other sizes work; 300 stays the fallback for objects without the attribute.
    dim = getattr(model, "vector_size", 300)
    result_vector = np.zeros((1, dim))

    for word in words:
        try:
            result_vector += model.get_vector(word)
        except KeyError:
            # Word lookup failed in the model: skip it.
            continue

    return result_vector

Matrix of Word Vectors Function

In [16]:
def vector_matrix(texts, model):
    """Build one summed word-vector row per text.

    Each text is tokenised with ``tokenizer`` and its tokens are combined with
    ``vector_combination_by_sum``.

    Args:
        texts: Sized collection supporting positional ``.iloc`` access.
        model: Embedding lookup passed through to ``vector_combination_by_sum``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), 300)``.
    """
    feature_matrix = np.zeros((len(texts), 300))

    # Iterating the 2-D array yields row views, so writing through `row[:]`
    # fills the matrix in place.
    for position, row in enumerate(feature_matrix):
        tokens = tokenizer(texts.iloc[position])
        row[:] = vector_combination_by_sum(tokens, model)

    return feature_matrix

# Turn the raw train/test titles into summed CBOW word-vector features.
training_vector_matrix_cbow = vector_matrix(train_data.title, w2v_model_cbow)
testing_vector_matrix_cbow = vector_matrix(test_data.title, w2v_model_cbow)

# Print both shapes as a check: one row per title, 300 columns.
print(training_vector_matrix_cbow.shape)
print(testing_vector_matrix_cbow.shape) 

(90000, 300)
(20513, 300)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def classifier(X_train, y_train, X_test, y_test):
    """Fit a logistic regression and print its evaluation on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels, compared against the predictions.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    model = LogisticRegression(max_iter=800)
    model.fit(X_train, y_train)

    # Print the classification report for the held-out predictions.
    print(classification_report(y_test, model.predict(X_test)))

    return model


In [18]:
# Train and evaluate the category classifier on the CBOW-based title features.
rl_cbow = classifier(
                     training_vector_matrix_cbow,
                     train_data.category,
                     testing_vector_matrix_cbow,
                     test_data.category)


              precision    recall  f1-score   support

     colunas       0.80      0.71      0.76      6103
   cotidiano       0.65      0.80      0.72      1698
     esporte       0.93      0.87      0.90      4663
   ilustrada       0.14      0.85      0.23       131
     mercado       0.84      0.79      0.81      5867
       mundo       0.75      0.84      0.79      2051

    accuracy                           0.79     20513
   macro avg       0.68      0.81      0.70     20513
weighted avg       0.82      0.79      0.80     20513



In [19]:
# Turn the raw train/test titles into summed skip-gram word-vector features.
# Here w2v_model_sg is the KeyedVectors object reloaded from disk above.
training_vector_matrix_sg = vector_matrix(train_data.title, w2v_model_sg)
testing_vector_matrix_sg = vector_matrix(test_data.title, w2v_model_sg)

# Train and evaluate the category classifier on the skip-gram-based features.
rl_sg = classifier(
                     training_vector_matrix_sg,
                     train_data.category,
                     testing_vector_matrix_sg,
                     test_data.category)


              precision    recall  f1-score   support

     colunas       0.81      0.71      0.76      6103
   cotidiano       0.63      0.80      0.71      1698
     esporte       0.93      0.87      0.90      4663
   ilustrada       0.14      0.87      0.25       131
     mercado       0.84      0.79      0.82      5867
       mundo       0.76      0.85      0.80      2051

    accuracy                           0.80     20513
   macro avg       0.69      0.82      0.71     20513
weighted avg       0.82      0.80      0.81     20513



In [20]:
import pickle

# Persist both fitted classifiers, one pickle file per embedding variant.
for _pkl_path, _fitted_model in (
    ("models/rl_cbow.pkl", rl_cbow),
    ("models/rl_sg.pkl", rl_sg),
):
    with open(_pkl_path, "wb") as f:
        pickle.dump(_fitted_model, f)