In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score

In [2]:
df = pd.read_csv('dados/silver/dados_tratados_com_features_novas_com_labels.csv', index_col=0)
df = df[df['label'].notnull()]
df = df.reset_index().drop('index', axis=1)
y = df['label'].astype('int')
print(df.shape, y.shape)

(500, 14) (500,)


# Features

In [3]:
features = df[['visualizacoes', 'visualizacoes_por_dia']]
features = features.astype(float)

In [4]:
features

Unnamed: 0,visualizacoes,visualizacoes_por_dia
0,858.0,286.000000
1,2.0,0.285714
2,1.0,0.100000
3,653.0,59.363636
4,2.0,0.066667
...,...,...
495,37.0,0.248322
496,260.0,1.721854
497,114.0,0.740260
498,241.0,1.554839


In [5]:
data_treino = df['data_upload'] < '2023-03-01'
data_teste = df['data_upload'] >= '2023-03-01'

X_train, X_test = features[data_treino], features[data_teste] 
y_train, y_test = y[data_treino], y[data_teste] 

In [6]:
print(X_train.shape, y_train.shape)

(345, 2) (345,)


In [7]:
print(X_test.shape, y_test.shape)

(155, 2) (155,)


In [8]:
# TfidfVectorizer -> palavras que aparecerem pouco em muitos vídeos, mas aparecerem
# muito em um vídeo, vão ter um peso maior. Palavras que aparecem muito em muitos vídeos
# terão menos peso

from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df[data_treino]['titulo']
title_test = df[data_teste]['titulo']

#min_df: número mínimo para que uma palavra precisa aparecer nos dados para que essa palavra vire uma coluna. Ex.: machine tem que aparecer em no mínimo dois vídeos para poder virar uma coluna de palavra
title_vec = TfidfVectorizer(min_df=2) 
title_bow_train = title_vec.fit_transform(title_train)
title_bow_test = title_vec.transform(title_test)

In [9]:
title_bow_train.shape #linhas e colunas, logo achou 188 palavras que repetem 2 ou mais vezes

(345, 277)

In [10]:
title_bow_test
# matriz esparsa -> matriz com elementos somente diferente de 0

<155x277 sparse matrix of type '<class 'numpy.float64'>'
	with 714 stored elements in Compressed Sparse Row format>

In [11]:
1 - 1049/(146*188) # mais de 96% da matriz é composta de elementos 0

0.9617822792188866

* hstack:
    * $$[1, 2], [3, 4] -> [1, 2, 3, 4]$$
* vstack:
    * $$[1, 2], [3, 4] -> \begin{bmatrix} 1 & 2 \\ 3 & 4\end{bmatrix}$$

In [12]:
from scipy.sparse import hstack, vstack

X_train_wtitle = hstack([X_train, title_bow_train])
X_test_wtitle = hstack([X_test, title_bow_test])

In [13]:
X_train_wtitle.shape, X_test_wtitle.shape

((345, 279), (155, 279))

In [14]:
random_forest = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced', n_jobs=6)
random_forest.fit(X_train_wtitle, y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [15]:
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2, class_weight="balanced")
decision_tree.fit(X_train_wtitle, y_train)

DecisionTreeClassifier(class_weight='balanced', max_depth=2, random_state=0)

In [16]:
probabilidade1 = random_forest.predict_proba(X_test_wtitle)[:, 1]
probabilidade2 = decision_tree.predict_proba(X_test_wtitle)[:, 1]


In [17]:
print(f'Average precision Random Forest:{average_precision_score(y_test, probabilidade1)}')
print(f'Average precision Decision Tree:{average_precision_score(y_test, probabilidade2)}') 

Average precision Random Forest:0.4190096457477138
Average precision Decision Tree:0.28247788548122255


In [18]:
print(f'ROC AUC Random Forest:{roc_auc_score(y_test, probabilidade1)}')
print(f'ROC AUC Decision Tree:{roc_auc_score(y_test, probabilidade2)}')

ROC AUC Random Forest:0.6517067003792667
ROC AUC Decision Tree:0.5279182469447956


|Modelo| Tipo de modelo| Precision | ROC AUC| 
|------|---------------|-----------|--------|
|modelo 1 (Baseline)| Decision Tree|0.27195362537631207| 0.4626000842815002|
|modelo 2 | Random Forest| 0.4190096457477138| 0.6517067003792667| 

# Active Learning

70 exemplos que o modelo tenha dificuldade

30 exemplos aleatoriamente

In [19]:
df_unlabeled = pd.read_csv('dados/silver/dados_tratados_com_features_novas_com_labels.csv', index_col=0)
df_unlabeled = df_unlabeled[df_unlabeled['label'].isnull()].dropna(how='all')
df_unlabeled.shape

(817, 14)

In [20]:
features_u = df_unlabeled[['visualizacoes', 'visualizacoes_por_dia']]
features_u = features_u.astype(float)

In [21]:
# TfidfVectorizer -> palavras que aparecerem pouco em muitos vídeos, mas aparecerem
# muito em um vídeo, vão ter um peso maior. Palavras que aparecem muito em muitos vídeos
# terão menos peso

from sklearn.feature_extraction.text import TfidfVectorizer

title_u = df_unlabeled['titulo']

#min_df: número mínimo para que uma palavra precisa aparecer nos dados para que essa palavra vire uma coluna. Ex.: machine tem que aparecer em no mínimo dois vídeos para poder virar uma coluna de palavra
#title_vec = TfidfVectorizer(min_df=2) 
title_bow_u = title_vec.transform(title_u)
# não criamos outro tfidVectorize pq já temos um treinado e queremos as previsões dele

In [22]:
title_bow_u

<817x277 sparse matrix of type '<class 'numpy.float64'>'
	with 3679 stored elements in Compressed Sparse Row format>

In [23]:
Xu_wtitle = hstack([features_u, title_bow_u])

In [24]:
Xu_wtitle

<817x279 sparse matrix of type '<class 'numpy.float64'>'
	with 5313 stored elements in COOrdinate format>

In [25]:
probabilidade_u = random_forest.predict_proba(Xu_wtitle)[:, 1]

In [26]:
df_unlabeled['probabilidade'] = probabilidade_u

In [27]:
df_unlabeled.head(30)

Unnamed: 0,id,titulo,label,canal,data_upload,categoria,tags,duracao_segundos,link,visualizacoes,quantidade_likes,query,tempo_desde_pub,visualizacoes_por_dia,probabilidade
500,watch?v=PVEXKbfxTuI,The Future of Machine Learning,,the data janitor,2022-11-14,['People & Blogs'],,103.0,https://www.youtube.com/watch?v=PVEXKbfxTuI,1,101.0,machine+learning,157.0,0.006369,0.063
501,watch?v=NqHKr9CGWJ0,Quantum Machine Learning Explained,,IBM Technology,2022-10-19,['Education'],"['IBM', 'Quantum', 'Quantum Computing', 'Quant...",358.0,https://www.youtube.com/watch?v=NqHKr9CGWJ0,16,592.0,machine+learning,183.0,0.087432,0.204
502,watch?v=-TjLX93w64A,How I Became A Self-Taught Machine Learning En...,,Smitha Kolan - Machine Learning Engineer,2022-10-14,['Science & Technology'],"['how to learn machine learning', 'machine lea...",403.0,https://www.youtube.com/watch?v=-TjLX93w64A,17,837.0,machine+learning,188.0,0.090426,0.187
503,watch?v=3Ie39phKdpk,Manual machine - learning BMX racing bike skills!,,Rockstar Harley,2022-10-11,['Sports'],"['bmx racing', 'bike skills', 'dream', 'fun ki...",11.0,https://www.youtube.com/watch?v=3Ie39phKdpk,53,954870.0,machine+learning,191.0,0.277487,0.309
504,watch?v=V_xro1bcAuA,PyTorch for Deep Learning & Machine Learning –...,,freeCodeCamp.org,2022-10-06,['Education'],,92246.0,https://www.youtube.com/watch?v=V_xro1bcAuA,505,16524.0,machine+learning,196.0,2.576531,0.138
505,watch?v=wIHW1rpKhoc,Curso de Introducción al Machine Learning 💻 (C...,,AfiEscuela,2022-10-04,['Education'],"['banca', 'finanzas', 'economía', 'educación']",8613.0,https://www.youtube.com/watch?v=wIHW1rpKhoc,2,65.0,machine+learning,198.0,0.010101,0.031
506,watch?v=B8LT-2JhrRE,Difference between Machine Learning and Deep L...,,Karthik's Show,2022-10-02,['Education'],"['machine learning', 'deep learning', 'artific...",519.0,https://www.youtube.com/watch?v=B8LT-2JhrRE,7,421.0,machine+learning,200.0,35.0,0.245
507,watch?v=o_EbGPzItik,"Método de Machine Learning ""Support-Vector Mac...",,FutPythonTrader,2022-09-29,['Science & Technology'],"['trader esportivo', 'punter', 'python tutoria...",676.0,https://www.youtube.com/watch?v=o_EbGPzItik,409,43.0,machine+learning,203.0,2.014778,0.136
508,watch?v=i_LwzRVP7bg,Machine Learning for Everybody – Full Course,,freeCodeCamp.org,2022-09-26,['Education'],,14033.0,https://www.youtube.com/watch?v=i_LwzRVP7bg,1,24012.0,machine+learning,206.0,0.004854,0.098
509,watch?v=KFtz_t1XNP8,Aplicaciones (más comunes) en Machine Learning,,Rafa Gonzalez Gouveia,2022-09-23,['Education'],"['gonzalezgouveia', 'data science español', 'r...",348.0,https://www.youtube.com/watch?v=KFtz_t1XNP8,4,320.0,machine+learning,209.0,0.019139,0.033


In [28]:
mask_u = (df_unlabeled['probabilidade'] >= 0.29) & (df_unlabeled['probabilidade'] <= 0.71)
mask_u.sum()

62

In [29]:
dificeis = df_unlabeled[mask_u]

In [30]:
aleatorios = df_unlabeled[~mask_u].sample(38)

In [31]:
pd.concat([dificeis, aleatorios]).to_csv('dados/silver/active_label1.csv')