In [1]:
import pandas as pd
import numpy as np
import datetime

from scipy.sparse import hstack, vstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score

In [2]:
# Load the first labelled file, keep only the rows that have a label,
# and renumber the surviving rows from 0.
df1 = (
    pd.read_csv('dados/silver/dados_tratados_com_features_novas_com_labels.csv', index_col=0)
    .loc[lambda frame: frame['label'].notnull()]
    .reset_index()
    .drop(columns='index')
)
print(df1.shape)

(500, 14)


In [3]:
# Load the second labelled file, keep only labelled rows, and flag them with
# novo = 1 so they can be told apart from df1 after the two are concatenated.
df2 = (
    pd.read_csv('dados/silver/active_label1_done.csv', index_col=0)
    .loc[lambda frame: frame['label'].notnull()]
    .assign(novo=1)
)
df2.shape

(100, 16)

In [4]:
# Peek at the first two rows of the new batch; besides the shared columns it
# carries 'probabilidade' and the 'novo' flag.
df2.head(2)

Unnamed: 0,id,titulo,label,canal,data_upload,categoria,tags,duracao_segundos,link,visualizacoes,quantidade_likes,query,tempo_desde_pub,visualizacoes_por_dia,probabilidade,novo
503,watch?v=3Ie39phKdpk,Manual machine - learning BMX racing bike skills!,0,Rockstar Harley,2022-10-11,['Sports'],"['bmx racing', 'bike skills', 'dream', 'fun ki...",11.0,https://www.youtube.com/watch?v=3Ie39phKdpk,53,954870.0,machine+learning,191.0,0.277487,309.0,1
523,watch?v=wtolixa9XTg,How I would learn Machine Learning (if I could...,0,AssemblyAI,2022-09-03,['People & Blogs'],,463.0,https://www.youtube.com/watch?v=wtolixa9XTg,210,10423.0,machine+learning,229.0,0.917031,338.0,1


In [5]:
# (average precision, ROC AUC) of the stored 'probabilidade' column against the
# labels of the new batch.
# NOTE(review): presumably 'probabilidade' is the score an earlier model gave
# these videos - confirm against the notebook that produced the CSV.
tuple(
    metrica(df2['label'], df2['probabilidade'])
    for metrica in (average_precision_score, roc_auc_score)
)

(0.43295339460466015, 0.62625)

In [6]:
# Stack both labelled batches into one frame. 'probabilidade' exists only in
# df2, so it is dropped before concatenating. 'novo' exists only in df2 as
# well, so rows coming from df1 get NaN there and are filled with 0
# (0 = original batch, 1 = newly labelled batch).
# The original row labels are kept: df1 is numbered from 0, df2 keeps the index
# stored in its CSV.
df = (
    pd.concat([df1, df2.drop(columns='probabilidade')])
    .assign(novo=lambda frame: frame['novo'].fillna(0))
)
print(df.shape)

(600, 15)


# Features

In [7]:
# Target: an independent copy of the 'label' column.
y = df['label'].copy()
# Numeric predictors: the two view-count columns, cast to float.
features = df[['visualizacoes', 'visualizacoes_por_dia']].astype(float)
features

Unnamed: 0,visualizacoes,visualizacoes_por_dia
0,858.0,286.000000
1,2.0,0.285714
2,1.0,0.100000
3,653.0,59.363636
4,2.0,0.066667
...,...,...
875,10.0,1.250000
660,232.0,0.334294
537,595.0,2.195572
1040,2.0,0.045455


# Aumenta validação

In [8]:
# Time-based split in which the newly labelled batch only enlarges validation:
#   train -> uploads before 2023-03-01 from the original batch (novo == 0)
#   test  -> every upload on/after 2023-03-01, original and new batch alike
# New-batch rows uploaded before the cutoff end up in neither set.
# 'data_upload' is compared against a string literal - assumes the column holds
# ISO 'YYYY-MM-DD' text so string order equals date order; TODO confirm dtype.
data_treino = df['data_upload'].lt('2023-03-01') & df['novo'].eq(0)
data_teste = df['data_upload'].ge('2023-03-01')

X_train, y_train = features[data_treino], y[data_treino]
X_test, y_test = features[data_teste], y[data_teste]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((345, 2), (175, 2), (345,), (175,))

In [9]:
# TF-IDF weighting: a word that is frequent inside one title but rare across
# titles gets a high weight; a word that shows up in many titles gets a low one.
# (TfidfVectorizer is already imported in the first cell.)

# Select the title column with a single .loc call instead of chained indexing.
title_train = df.loc[data_treino, 'titulo']
title_test = df.loc[data_teste, 'titulo']

# min_df=2: a word must occur in at least two training titles to become a
# column (e.g. "machine" has to be present in two or more titles).
title_vec = TfidfVectorizer(min_df=2)
# The vocabulary and IDF weights are fitted on the training titles only and
# then reused, unchanged, to transform the test titles.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

In [10]:
# (rows, columns) = (training titles, words that appear in at least two
# training titles); the recorded output below shows 277 such words.
title_bow_train.shape

(345, 277)

In [11]:
title_bow_test
# sparse matrix -> only the non-zero entries are stored

<175x277 sparse matrix of type '<class 'numpy.float64'>'
	with 797 stored elements in Compressed Sparse Row format>

In [12]:
# Put the two numeric columns in front of the TF-IDF title columns; train and
# test are assembled the same way so their columns line up.
X_train_wtitle = hstack((X_train, title_bow_train))
X_test_wtitle = hstack((X_test, title_bow_test))

In [13]:
# Sanity check: both matrices must have the same number of columns
# (2 numeric features + the TF-IDF vocabulary).
X_train_wtitle.shape, X_test_wtitle.shape

((345, 279), (175, 279))

In [14]:
# 1000-tree random forest with a fixed seed; class_weight='balanced' reweights
# the classes to compensate for label imbalance.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=6,
)
random_forest.fit(X_train_wtitle, y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [15]:
# predict_proba returns one column per class; column 1 is used as the score of
# the positive class (assumes labels are 0/1, so classes_[1] == 1 - TODO confirm).
probabilidade = random_forest.predict_proba(X_test_wtitle)[:, 1]

In [16]:
# Average precision of the predicted scores on the test split.
average_precision_score(y_test, probabilidade)

0.3840157528914409

In [17]:
# ROC AUC of the same predicted scores on the test split.
roc_auc_score(y_test, probabilidade)

0.6327519379844962

|Modelo| Tipo de modelo| Average Precision | ROC AUC| 
|------|---------------|-----------|--------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|

# Aumenta o treino

In [18]:
# Time-based split in which the newly labelled batch only enlarges training:
#   train -> every upload before 2023-03-01 (original + new batch)
#   test  -> uploads on/after 2023-03-01 from the original batch (novo == 0)
# New-batch rows uploaded on/after the cutoff end up in neither set.
# 'data_upload' is compared against a string literal - assumes the column holds
# ISO 'YYYY-MM-DD' text so string order equals date order; TODO confirm dtype.
data_treino = df['data_upload'].lt('2023-03-01')
data_teste = df['data_upload'].ge('2023-03-01') & df['novo'].eq(0)

X_train, y_train = features[data_treino], y[data_treino]
X_test, y_test = features[data_teste], y[data_teste]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((425, 2), (155, 2), (425,), (155,))

In [19]:
# Titles belonging to the current train / test masks.
title_train = df.loc[data_treino, 'titulo']
title_test = df.loc[data_teste, 'titulo']

# Refit TF-IDF on the (now larger) training titles; a word needs to appear in
# at least two of them (min_df=2) to become a column. The test titles are only
# transformed with the fitted vocabulary.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

In [20]:
# Put the two numeric columns in front of the TF-IDF title columns; train and
# test are assembled the same way so their columns line up.
X_train_wtitle = hstack((X_train, title_bow_train))
X_test_wtitle = hstack((X_test, title_bow_test))

In [21]:
# Same forest configuration as the earlier run (1000 trees, seed 0, balanced
# class weights), refitted on the enlarged training set.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=6,
)
random_forest.fit(X_train_wtitle, y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [22]:
# predict_proba returns one column per class; column 1 is used as the score of
# the positive class (assumes labels are 0/1, so classes_[1] == 1 - TODO confirm).
probabilidade = random_forest.predict_proba(X_test_wtitle)[:, 1]

In [23]:
# Average precision of the predicted scores on the test split.
average_precision_score(y_test, probabilidade)

0.46846074116867686

In [24]:
# ROC AUC of the same predicted scores on the test split.
roc_auc_score(y_test, probabilidade)

0.6336915297092287

|Modelo| Tipo de modelo| Average Precision | ROC AUC| 
|------|---------------|-----------|--------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 
|modelo 3 (aumento teste)| Random Forest| 0.3840157528914409| 0.6327519379844962|
|modelo 3 (aumento treino)| Random Forest| 0.46846074116867686| 0.6336915297092287|