In [None]:
import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt

from scipy.sparse import hstack, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score

from lightgbm import LGBMClassifier
from skopt import forest_minimize

In [2]:
# Load the prepared dataset (first CSV column is the index) and keep only
# the rows that already have a label.
caminho_dados = 'dados/silver/dados_tratados_com_features_novas_com_todas_labels.csv'
df = pd.read_csv(caminho_dados, index_col=0)
df = df.dropna(subset=['label'])
df.shape

(1314, 14)

In [3]:
# Preview the first two rows to check the loaded columns.
df.head(2)

Unnamed: 0,id,titulo,label,canal,data_upload,categoria,tags,duracao_segundos,link,visualizacoes,quantidade_likes,query,tempo_desde_pub,visualizacoes_por_dia
0,watch?v=kE9gYQDyVr0,Trabalhando com Bases de Dados (Datasets) Desb...,0.0,Hashtag Programação,2023-04-17,['Education'],"['Bases de dados desbalanceadas', 'Datasets de...",1326.0,https://www.youtube.com/watch?v=kE9gYQDyVr0,858,72.0,ciencia+de+dados,3.0,286.0
1,watch?v=GgEUPkAG1ho,Trabalho ciencia de dados-parte-1,0.0,Jonas Silva,2023-04-13,['People & Blogs'],,389.0,https://www.youtube.com/watch?v=GgEUPkAG1ho,2,0.0,ciencia+de+dados,7.0,0.285714


# Features

In [4]:
# Numeric inputs: total views and views per day, both cast to float.
colunas_numericas = ['visualizacoes', 'visualizacoes_por_dia']
features = df[colunas_numericas].astype(float)

# Target vector, taken as an independent copy of the label column.
y = df['label'].copy()
features

Unnamed: 0,visualizacoes,visualizacoes_por_dia
0,858.0,286.000000
1,2.0,0.285714
2,1.0,0.100000
3,653.0,59.363636
4,2.0,0.066667
...,...,...
1312,647.0,3.634831
1313,2.0,0.011236
1314,214.0,1.188889
1315,520.0,2.872928


In [5]:
# Time-based split: videos uploaded before the cutoff go to training,
# videos uploaded on or after it go to the test set.
data_corte = '2023-03-01'
data_treino = df['data_upload'].lt(data_corte)
data_teste = df['data_upload'].ge(data_corte)

X_train = features.loc[data_treino]
X_test = features.loc[data_teste]
y_train = y.loc[data_treino]
y_test = y.loc[data_teste]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((869, 2), (445, 2), (869,), (445,))

In [6]:
# TF-IDF weighting: a word that is frequent in one title but rare across the
# other titles gets a high weight; words that show up in many titles get a
# low weight.

from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df.loc[data_treino, 'titulo']
title_test = df.loc[data_teste, 'titulo']

# min_df: minimum number of titles a term must appear in before it becomes a
# column (e.g. "machine" has to occur in at least two videos).
# ngram_range=(1, 5): build n-grams from single words up to five-word sequences.
title_vec = TfidfVectorizer(min_df=2, ngram_range=(1, 5))

# The vocabulary is learned from the training titles only; the test titles
# are transformed with that same vocabulary.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

In [7]:
# Dimensions of the training TF-IDF matrix: (training titles, vocabulary terms).
title_bow_train.shape

(869, 2165)

In [8]:
# Place the two numeric columns side by side with the sparse TF-IDF title
# columns, for the training and the test sets alike.
X_train_wtitle = hstack((X_train, title_bow_train))
X_test_wtitle = hstack((X_test, title_bow_test))

In [9]:
# Shapes after stacking: the numeric columns plus the TF-IDF columns.
X_train_wtitle.shape, X_test_wtitle.shape

((869, 2167), (445, 2167))

# Random Forest

In [10]:
# Random Forest on views + title TF-IDF.
# The vectoriser is refitted here with min_df=1 (every term is kept) and
# n-grams of one to five words, then the feature matrices are rebuilt.
title_vec = TfidfVectorizer(min_df=1, ngram_range=(1, 5))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

X_train_wtitle = hstack((X_train, title_bow_train))
X_test_wtitle = hstack((X_test, title_bow_test))

# class_weight="balanced" reweights the classes; random_state fixes the seed.
parametros_rf = dict(
    n_estimators=993,
    max_depth=51,
    max_features='sqrt',
    min_samples_split=8,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=160745,
    n_jobs=6,
)
random_forest = RandomForestClassifier(**parametros_rf)
random_forest.fit(X_train_wtitle, y_train)

# Column 1 of predict_proba is used as the score for each test video.
probabilidade_rf = random_forest.predict_proba(X_test_wtitle)[:, 1]
ap_rf = average_precision_score(y_test, probabilidade_rf)
auc_rf = roc_auc_score(y_test, probabilidade_rf)
print(ap_rf, auc_rf)

0.4864659892691304 0.8986624348220358


|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|
|modelo 4 (tunado)| Random Forest| 0.47588711602407213 | 0.8925980503287236| [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:3]
|modelo 4 (normal)| LightGBM| 0.3900304192098481 | 0.8610292450691452|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]
|modelo 4 (tunado)| Logistic Regression| 0.45449846483463946 | 0.8893674903649966| [C=10]
|modelo 5 (tunado)| Random Forest| 0.4864659892691304  | 0.8986624348220358 | [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:5]



# LightGBM

In [16]:
# LightGBM on views + title TF-IDF.
# The vectoriser is refitted here with min_df=1 (every term is kept) and
# n-grams of one to five words, then the feature matrices are rebuilt.
title_vec = TfidfVectorizer(min_df=1, ngram_range=(1, 5))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

X_train_wtitle = hstack((X_train, title_bow_train))
X_test_wtitle = hstack((X_test, title_bow_test))

# bagging_freq=1 is set together with subsample; per the LightGBM parameter
# docs, row subsampling only takes effect when bagging_freq is non-zero.
parametros_lgbm = dict(
    n_estimators=979,
    learning_rate=0.0014099928811969545,
    num_leaves=2 ** 9,
    max_depth=9,
    min_child_samples=9,
    subsample=0.6502182010234373,
    colsample_bytree=0.6866210554187129,
    bagging_freq=1,
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
lgbm = LGBMClassifier(**parametros_lgbm)
lgbm.fit(X_train_wtitle, y_train)

# Column 1 of predict_proba is used as the score for each test video.
probabilidade_lgbm = lgbm.predict_proba(X_test_wtitle)[:, 1]
ap_lgbm = average_precision_score(y_test, probabilidade_lgbm)
auc_lgbm = roc_auc_score(y_test, probabilidade_lgbm)
print(ap_lgbm, auc_lgbm)

0.5064733173326582 0.8971321695760599




|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|
|modelo 4 (tunado)| Random Forest| 0.47588711602407213 | 0.8925980503287236| [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:3]
|modelo 4 (normal)| LightGBM| 0.3900304192098481 | 0.8610292450691452|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]
|modelo 4 (tunado)| Logistic Regression| 0.45449846483463946 | 0.8893674903649966| [C=10]
|modelo 5 (tunado)| Random Forest| 0.4864659892691304  | 0.8986624348220358 | [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:5]
|modelo 5 (tunado)| LightGBM| 0.4564099413174987 | 0.8795057810020404| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:3]



# Regressão Logística

In [11]:
# MaxAbsScaler: divides every value in a column by that column's largest absolute value.
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix
from sklearn.pipeline import make_pipeline

In [12]:
# Convert the stacked feature matrices to CSR format. The conversion builds
# a new matrix, so X_train_wtitle / X_test_wtitle are left untouched.
X_train_wtitle2 = csr_matrix(X_train_wtitle)
X_test_wtitle2 = csr_matrix(X_test_wtitle)

# Scaling lives inside the pipeline: MaxAbsScaler is fitted on the training
# matrix by .fit() and only applied (not refitted) when predicting on test.
# Earlier manual attempts (StandardScaler on the two numeric columns,
# MaxAbsScaler fitted separately on train and test) were dropped in favour
# of this.
regressao_logistica = LogisticRegression(C=10, penalty='l2', n_jobs=6, random_state=0)
lr_pipeline = make_pipeline(MaxAbsScaler(), regressao_logistica)
lr_pipeline.fit(X_train_wtitle2, y_train)

Pipeline(steps=[('maxabsscaler', MaxAbsScaler()),
                ('logisticregression',
                 LogisticRegression(C=10, n_jobs=6, random_state=0))])

In [13]:
# Score for each test row: column 1 of predict_proba (the positive label when the target is 0/1).
probabilidade_lr = lr_pipeline.predict_proba(X_test_wtitle2)[:, 1]

In [14]:
# Average precision and ROC AUC of the logistic-regression scores on the test set.
ap_lr = average_precision_score(y_test, probabilidade_lr)
auc_lr = roc_auc_score(y_test, probabilidade_lr)
print(ap_lr, auc_lr)

0.4276112689005667 0.8854568125141691


|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|
|modelo 4 (tunado)| Random Forest| 0.47588711602407213 | 0.8925980503287236| [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:3]
|modelo 4 (normal)| LightGBM| 0.3900304192098481 | 0.8610292450691452|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]
|modelo 4 (tunado)| Logistic Regression| 0.45449846483463946 | 0.8893674903649966| [C=10]
|modelo 5 (tunado)| Random Forest| 0.4864659892691304  | 0.8986624348220358 | [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:5]
|modelo 5 (tunado)| LightGBM| 0.4564099413174987 | 0.8795057810020404| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:3]
|modelo 5 (tunado)| Logistic Regression| 0.4191684005022833 | 0.8830763999093176 | Pipeline([C=10])

# Ensemble

Selecionando os melhores modelos de cada algoritmo:

|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]
|modelo 4 (tunado)| Logistic Regression| 0.45449846483463946 | 0.8893674903649966| [C=10]
|modelo 5 (tunado)| Random Forest| 0.4864659892691304  | 0.8986624348220358 | [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:5]


## Três modelos juntos

In [15]:
# Simple (unweighted) average of the three models' scores.
soma_probabilidades = probabilidade_rf + probabilidade_lgbm + probabilidade_lr
probabilidade_final = soma_probabilidades / 3

ap_final = average_precision_score(y_test, probabilidade_final)
auc_final = roc_auc_score(y_test, probabilidade_final)
print(ap_final, auc_final)

NameError: name 'probabilidade_lgbm' is not defined

|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]
|modelo 4 (tunado)| Logistic Regression| 0.45449846483463946 | 0.8893674903649966| [C=10]
|modelo 5 (tunado)| Random Forest| 0.4864659892691304  | 0.8986624348220358 | [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:5]
|Modelo 5 |ensemble (RF + LGBM + RL)| 0.5164652821315261 | 0.9024030832010882

In [24]:
# Pairwise correlation between the three models' test scores; less
# correlated models have more to add to a blend.
probabilidades_modelos = pd.DataFrame({
    'LR': probabilidade_lr,
    'RF': probabilidade_rf,
    'LGBM': probabilidade_lgbm,
})
probabilidades_modelos.corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.69669,0.456896
RF,0.69669,1.0,0.880958
LGBM,0.456896,0.880958,1.0


## RF + LGBM

In [25]:
# Weighted blend of the two tree models: 30% Random Forest, 70% LightGBM.
# The weights sum to 1, so this is already a weighted average on the [0, 1]
# probability scale and must not be divided again (an extra "/ 2" would
# halve every score). Average precision and ROC AUC depend only on the
# ordering of the scores, so they are the same with or without that rescaling.
peso_rf, peso_lgbm = 0.3, 0.7
probabilidade_final = peso_rf * probabilidade_rf + peso_lgbm * probabilidade_lgbm
print(average_precision_score(y_test, probabilidade_final), roc_auc_score(y_test, probabilidade_final))

0.5538789009146141 0.8981523464067105


|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros| Modelo escolhido
|------|---------------|-----------|--------|-----------|-----------------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]
|modelo 4 (tunado)| Logistic Regression| 0.45449846483463946 | 0.8893674903649966| [C=10]
|modelo 5 (tunado)| Random Forest| 0.4864659892691304  | 0.8986624348220358 | [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:5]
|Modelo 5 |ensemble 3 modelos(RF + LGBM + RL)| 0.5164652821315261 | 0.9024030832010882
|**Modelo 5** |**ensemble 2 modelos (RF + LGBM)**| **0.5538789009146141** | **0.8981523464067105**| | **Modelo final**

# Salvar modelos

In [26]:
import joblib as jb

In [28]:
import os

# jb.dump does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('modelos', exist_ok=True)

# Persist the two models of the final blend plus the fitted TF-IDF
# vectoriser, which is needed to turn new titles into the same columns.
jb.dump(lgbm, 'modelos/lgbm_2023_04_19.pkl.z')
jb.dump(random_forest, 'modelos/random_forest_2023_04_19.pkl.z')
jb.dump(title_vec, 'modelos/title_vectorize_2023_04_19.pkl.z')

['modelos/title_vectorize_2023_04_19.pkl.z']