In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt

from scipy.sparse import hstack, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score
from lightgbm import LGBMClassifier
from skopt import forest_minimize

In [2]:
df = pd.read_csv('dados/silver/dados_tratados_com_features_novas_com_todas_labels.csv', index_col=0).dropna(subset=['label'])
df.shape

(1314, 14)

In [3]:
df.head(2)

Unnamed: 0,id,titulo,label,canal,data_upload,categoria,tags,duracao_segundos,link,visualizacoes,quantidade_likes,query,tempo_desde_pub,visualizacoes_por_dia
0,watch?v=kE9gYQDyVr0,Trabalhando com Bases de Dados (Datasets) Desb...,0.0,Hashtag Programação,2023-04-17,['Education'],"['Bases de dados desbalanceadas', 'Datasets de...",1326.0,https://www.youtube.com/watch?v=kE9gYQDyVr0,858,72.0,ciencia+de+dados,3.0,286.0
1,watch?v=GgEUPkAG1ho,Trabalho ciencia de dados-parte-1,0.0,Jonas Silva,2023-04-13,['People & Blogs'],,389.0,https://www.youtube.com/watch?v=GgEUPkAG1ho,2,0.0,ciencia+de+dados,7.0,0.285714


# Features

In [4]:
features = df[['visualizacoes', 'visualizacoes_por_dia']]
features = features.astype(float)
y = df['label'].copy()
features

Unnamed: 0,visualizacoes,visualizacoes_por_dia
0,858.0,286.000000
1,2.0,0.285714
2,1.0,0.100000
3,653.0,59.363636
4,2.0,0.066667
...,...,...
1312,647.0,3.634831
1313,2.0,0.011236
1314,214.0,1.188889
1315,520.0,2.872928


In [5]:
data_treino = (df['data_upload'] < '2023-03-01')
data_teste = (df['data_upload'] >= '2023-03-01')

X_train, X_test = features[data_treino], features[data_teste] 
y_train, y_test = y[data_treino], y[data_teste] 
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((869, 2), (445, 2), (869,), (445,))

In [6]:
# TfidfVectorizer -> palavras que aparecerem pouco em muitos vídeos, mas aparecerem
# muito em um vídeo, vão ter um peso maior. Palavras que aparecem muito em muitos vídeos
# terão menos peso

from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df[data_treino]['titulo']
title_test = df[data_teste]['titulo']

#min_df: número mínimo para que uma palavra precisa aparecer nos dados para que essa palavra vire uma coluna. Ex.: machine tem que aparecer em no mínimo dois vídeos para poder virar uma coluna de palavra
title_vec = TfidfVectorizer(min_df=2, ngram_range=(1, 5)) 
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

In [7]:
title_bow_train.shape #linhas e colunas, logo achou 188 palavras que repetem 2 ou mais vezes

(869, 2165)

In [8]:
from scipy.sparse import hstack, vstack

X_train_wtitle = hstack([X_train, title_bow_train])
X_test_wtitle = hstack([X_test, title_bow_test])

In [9]:
X_train_wtitle.shape, X_test_wtitle.shape

((869, 2167), (445, 2167))

# Random Forest

In [10]:
# min_samples_leaf = quantos exemplos pra vc precisar ter para criar um novo nó
random_forest = RandomForestClassifier(n_estimators=1000, random_state=0, 
                                       min_samples_leaf=1, class_weight='balanced',
                                       n_jobs=6)
random_forest.fit(X_train_wtitle, y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [11]:
probabilidade = random_forest.predict_proba(X_test_wtitle)[:, 1]

In [12]:
average_precision_score(y_test, probabilidade)

0.42419901378303854

In [13]:
roc_auc_score(y_test, probabilidade)

0.877097030151893

|Modelo| Tipo de modelo| Precision | ROC AUC| 
|------|---------------|-----------|--------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|

## Bayesian Optimization

In [14]:
def tune_rf(params):
    print(params)
    max_depth = params[0]
    max_features = params[1]
    min_samples_leaf = params[2]
    min_samples_split = params[3]
    n_estimators = params[4]
    min_df = params[5]
    ngram_range = (1, params[6])
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    title_bow_train = title_vec.fit_transform(title_train)
    title_bow_test = title_vec.transform(title_test)
    
    X_train_wtitle = hstack([X_train, title_bow_train])
    X_test_wtitle = hstack([X_test, title_bow_test])
    
    random_forest = RandomForestClassifier(max_depth=max_depth,
                                           max_features=max_features, 
                                           min_samples_leaf=min_samples_leaf,
                                           min_samples_split=min_samples_split, 
                                           n_estimators=n_estimators,
                                           class_weight="balanced", n_jobs=6,
                                           random_state=160745)
    
    random_forest.fit(X_train_wtitle, y_train)
    probabilidade = random_forest.predict_proba(X_test_wtitle)[:, 1]
    print(roc_auc_score(y_test, probabilidade))
    
    return -average_precision_score(y_test, probabilidade)

In [15]:
space = [(10, 1000), # max_depth
          ('auto', 'sqrt', 'log2'), # max_features
          (1, 10), # min_samples_leaf
          (2, 20), # min_samples_split
          (100, 2000), # n_estimators
          (1,5), # min_df
          (1,5)] # ngram_range

resultado = forest_minimize(tune_rf, space, random_state=160745,
                            n_random_starts=10, n_calls=20, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[868, 'sqrt', 5, 2, 834, 3, 1]
0.8499206529131716
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 1.6217
Function value obtained: -0.3894
Current minimum: -0.3894
Iteration No: 2 started. Evaluating function at random point.
[182, 'auto', 2, 11, 804, 3, 4]
0.8747449557923372
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.5879
Function value obtained: -0.3851
Current minimum: -0.3894
Iteration No: 3 started. Evaluating function at random point.
[159, 'auto', 5, 7, 1695, 4, 4]
0.8622761278621629
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 3.2371
Function value obtained: -0.3856
Current minimum: -0.3894
Iteration No: 4 started. Evaluating function at random point.
[864, 'log2', 8, 13, 1486, 3, 1]
0.8492972115166628
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 2.7850
Function value obtained: -0.3841
Current minimum: -0.3894
Iteration No: 5 

In [16]:
resultado.x

[51, 'sqrt', 1, 8, 993, 1, 3]

In [17]:
title_vec = TfidfVectorizer(min_df=1, ngram_range=(1, 3))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

X_train_wtitle = hstack([X_train, title_bow_train])
X_test_wtitle = hstack([X_test, title_bow_test])

random_forest = RandomForestClassifier(max_depth=51, 
                                       max_features='sqrt', 
                                       min_samples_leaf=1,
                                       min_samples_split=8, 
                                       n_estimators=993,
                                       class_weight="balanced", n_jobs=6,
                                       random_state=160745)

random_forest.fit(X_train_wtitle, y_train)
probabilidade = random_forest.predict_proba(X_test_wtitle)[:, 1]
print(roc_auc_score(y_test, probabilidade), average_precision_score(y_test, probabilidade))

0.8925980503287236 0.47588711602407213


|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|
|modelo 4 (tunado)| Random Forest| 0.47588711602407213 | 0.8925980503287236| [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:3]

# LightGBM

In [19]:
lgbm = LGBMClassifier(random_state=0, class_weight='balanced', n_jobs=6)
lgbm.fit(X_train_wtitle, y_train)

LGBMClassifier(class_weight='balanced', n_jobs=6, random_state=0)

In [20]:
probabilidade = lgbm.predict_proba(X_test_wtitle)[:, 1]



In [21]:
average_precision_score(y_test, probabilidade), roc_auc_score(y_test, probabilidade)

(0.3900304192098481, 0.8610292450691452)

|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|
|modelo 4 (tunado)| Random Forest| 0.47588711602407213 | 0.8925980503287236| [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:3]
|modelo 4 (normal)| LightGBM| 0.3900304192098481 | 0.8610292450691452|

## Bayesian Optimization

In [22]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    min_df = params[6]
    ngram_range = (1, params[7])
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    title_bow_train = title_vec.fit_transform(title_train)
    title_bow_test = title_vec.transform(title_test)
    
    X_train_wtitle = hstack([X_train, title_bow_train])
    X_test_wtitle = hstack([X_test, title_bow_test])
    
    lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth,
                         max_depth=max_depth, min_child_samples=min_child_samples,
                         subsample=subsample, colsample_bytree=colsample_bytree, 
                         bagging_freq=1, n_estimators=n_estimators, random_state=0,
                         class_weight="balanced", n_jobs=6)
    lgbm.fit(X_train_wtitle, y_train)
    probabilidade = lgbm.predict_proba(X_test_wtitle)[:, 1]
    print(roc_auc_score(y_test, probabilidade))
    
    return -average_precision_score(y_test, probabilidade)

In [23]:
space = [(1e-3, 1e-1, 'log-uniform'), #lr
       (1, 10), #max_depth
       (1, 20), #min_child_samples
       (0.05, 1.), #subsample
       (0.05, 1.), # colsample_bytree  
       (10, 1000), #n_estimators
       (1, 5), # min_df
       (1, 5)] # ngram_range

resultado = forest_minimize(tune_lgbm, space, random_state=160745,
                            n_random_starts=10, n_calls=20, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 182, 3, 1]




0.8254930854681478
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.3187
Function value obtained: -0.3261
Current minimum: -0.3261
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 459, 3, 4]
0.8336828383586489
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1204
Function value obtained: -0.3436
Current minimum: -0.3436
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 321, 4, 3]




0.8660167762412152
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2140
Function value obtained: -0.3665
Current minimum: -0.3665
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 979, 1, 5]




0.8971321695760599
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.1606
Function value obtained: -0.5065
Current minimum: -0.5065
Iteration No: 5 started. Evaluating function at random point.
[0.0012872056698236922, 6, 8, 0.3448850990367951, 0.371803112561826, 703, 4, 1]




0.8530378598957152
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.5523
Function value obtained: -0.4159
Current minimum: -0.5065
Iteration No: 6 started. Evaluating function at random point.
[0.003567949451535685, 10, 19, 0.7232951768944309, 0.7298538828427115, 849, 4, 3]




0.8778054862842892
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.7628
Function value obtained: -0.4238
Current minimum: -0.5065
Iteration No: 7 started. Evaluating function at random point.
[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 184, 1, 2]




0.861085921559737
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.9328
Function value obtained: -0.4665
Current minimum: -0.5065
Iteration No: 8 started. Evaluating function at random point.
[0.0015212976972079912, 3, 12, 0.44234694306528044, 0.399351303640462, 182, 3, 5]
0.8739231466787577
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.2059
Function value obtained: -0.4110
Current minimum: -0.5065
Iteration No: 9 started. Evaluating function at random point.
[0.01946212855369041, 9, 18, 0.5235636153223084, 0.6728679300083596, 657, 4, 5]








0.8463500340058944
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.5021
Function value obtained: -0.3424
Current minimum: -0.5065
Iteration No: 10 started. Evaluating function at random point.
[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 587, 2, 5]




0.8577986851054182
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 1.5761
Function value obtained: -0.3864
Current minimum: -0.5065
Iteration No: 11 started. Searching for the next optimal point.
[0.05696329143679483, 10, 10, 0.7704339000317592, 0.5363769745994437, 818, 2, 2]




0.8034459306279755
Iteration No: 11 ended. Search finished for the next optimal point.
Time taken: 1.5818
Function value obtained: -0.3845
Current minimum: -0.5065
Iteration No: 12 started. Searching for the next optimal point.
[0.0011992657133966242, 8, 6, 0.9129013695364318, 0.6669627457969953, 982, 2, 5]




0.8354681478122875
Iteration No: 12 ended. Search finished for the next optimal point.
Time taken: 2.4070
Function value obtained: -0.3589
Current minimum: -0.5065
Iteration No: 13 started. Searching for the next optimal point.
[0.0011181590238459151, 9, 15, 0.5149880123416892, 0.645885726051927, 836, 2, 5]




0.8800725459079574
Iteration No: 13 ended. Search finished for the next optimal point.
Time taken: 0.8401
Function value obtained: -0.4507
Current minimum: -0.5065
Iteration No: 14 started. Searching for the next optimal point.
[0.0025076342055892287, 10, 11, 0.22866269936746775, 0.7488403059207158, 882, 2, 5]




0.8630695987304466
Iteration No: 14 ended. Search finished for the next optimal point.
Time taken: 0.9337
Function value obtained: -0.4002
Current minimum: -0.5065
Iteration No: 15 started. Searching for the next optimal point.
[0.0014331286968287877, 9, 9, 0.5805040445517531, 0.7546293525384392, 410, 2, 5]




0.8780321922466561
Iteration No: 15 ended. Search finished for the next optimal point.
Time taken: 0.6533
Function value obtained: -0.4477
Current minimum: -0.5065
Iteration No: 16 started. Searching for the next optimal point.
[0.00200534010861793, 9, 9, 0.6410348508642206, 0.9779588977174413, 922, 2, 4]




0.8774087508501474
Iteration No: 16 ended. Search finished for the next optimal point.
Time taken: 1.2833
Function value obtained: -0.4191
Current minimum: -0.5065
Iteration No: 17 started. Searching for the next optimal point.
[0.0010828743643737323, 9, 8, 0.7000271364885504, 0.8498309079688997, 933, 1, 5]




0.8881206075719792
Iteration No: 17 ended. Search finished for the next optimal point.
Time taken: 1.2988
Function value obtained: -0.4891
Current minimum: -0.5065
Iteration No: 18 started. Searching for the next optimal point.
[0.04701635352606865, 6, 14, 0.31996393654102856, 0.5045672957333033, 397, 1, 2]




0.8245862616186805
Iteration No: 18 ended. Search finished for the next optimal point.
Time taken: 0.5321
Function value obtained: -0.3554
Current minimum: -0.5065
Iteration No: 19 started. Searching for the next optimal point.
[0.008663929625020213, 9, 6, 0.9727910200295122, 0.8353148654941072, 909, 1, 5]




0.8314441169802766
Iteration No: 19 ended. Search finished for the next optimal point.
Time taken: 1.3587
Function value obtained: -0.4196
Current minimum: -0.5065
Iteration No: 20 started. Searching for the next optimal point.
[0.0012948667679063663, 8, 6, 0.9805865967329576, 0.7187568369865114, 870, 1, 5]




0.8630695987304466
Iteration No: 20 ended. Search finished for the next optimal point.
Time taken: 1.4412
Function value obtained: -0.4191
Current minimum: -0.5065


In [24]:
resultado.x

[0.0014099928811969545,
 9,
 9,
 0.6502182010234373,
 0.6866210554187129,
 979,
 1,
 5]

In [25]:
title_vec = TfidfVectorizer(min_df=1, ngram_range=(1, 5))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

X_train_wtitle = hstack([X_train, title_bow_train])
X_test_wtitle = hstack([X_test, title_bow_test])

lgbm = LGBMClassifier(learning_rate=0.0014099928811969545, num_leaves=2 ** 9,
                     max_depth=9, min_child_samples=9,
                     subsample=0.6502182010234373, colsample_bytree= 0.6866210554187129, 
                     bagging_freq=1, n_estimators=979, random_state=0,
                     class_weight="balanced", n_jobs=6)
lgbm.fit(X_train_wtitle, y_train)
probabilidade = lgbm.predict_proba(X_test_wtitle)[:, 1]
print(average_precision_score(y_test, probabilidade), roc_auc_score(y_test, probabilidade))

0.5064733173326582 0.8971321695760599




|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|
|modelo 4 (tunado)| Random Forest| 0.47588711602407213 | 0.8925980503287236| [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:3]
|modelo 4 (normal)| LightGBM| 0.3900304192098481 | 0.8610292450691452|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]

# Regressão Logistíca

In [26]:
# maxabsclares: pega o maior valor da coluna, pega seu valor absoluto e dividi todos os outros valores por ele
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [27]:
X_train_wtitle2 = csr_matrix(X_train_wtitle.copy())
X_test_wtitle2 = csr_matrix(X_test_wtitle.copy())

scaler = StandardScaler()
max_abs_scaler = MaxAbsScaler()

X_train_wtitle2[:, :2] = scaler.fit_transform(X_train_wtitle2[:, :2].todense())
X_test_wtitle2[:, :2] = scaler.fit_transform(X_test_wtitle2[:, :2].todense())

#X_train_wtitle2 = max_abs_scaler.fit_transform(X_train_wtitle2)
#X_test_wtitle2= max_abs_scaler.fit_transform(X_test_wtitle2)

  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)


In [28]:
logistic_regression = LogisticRegression(C=10, n_jobs=6, random_state=0)
logistic_regression.fit(X_train_wtitle2, y_train)

LogisticRegression(C=10, n_jobs=6, random_state=0)

In [29]:
probabilidade = logistic_regression.predict_proba(X_test_wtitle2)[:, 1]

In [30]:
print(average_precision_score(y_test, probabilidade), roc_auc_score(y_test, probabilidade))

0.45449846483463946 0.8893674903649966


|Modelo| Tipo de modelo| Precision | ROC AUC| Parâmetros
|------|---------------|-----------|--------|-----------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|
|modelo 4 (normal)| Random Forest| 0.42419901378303854 | 0.877097030151893|
|modelo 4 (tunado)| Random Forest| 0.47588711602407213 | 0.8925980503287236| [max_depth:51, max_features:'sqrt', min_samples_leaf:1, min_samples_split:8, n_estimators:993, min_df:1, ngram_range:3]
|modelo 4 (normal)| LightGBM| 0.3900304192098481 | 0.8610292450691452|
|modelo 4 (tunado)| LightGBM| 0.5064733173326582 | 0.8971321695760599| [lr:0.0014099928811969545, max_depth:9, min_child_samples:9, subsample:0.6502182010234373, colsample_bytree:0.6866210554187129, n_estimators:979,min_df:1, ngram_range:5]
|modelo 4 (tunado)| Logistic Regression| 0.45449846483463946 | 0.8893674903649966| [C=10]
