In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json 

import glob
import tqdm

# Show up to 131 columns when displaying wide DataFrames.
# The fully-qualified option name is used on purpose: the short pattern
# "max.columns" is resolved by regex and becomes ambiguous on pandas versions
# that also define "styler.render.max_columns", which makes set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 131)

%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# First labelled batch: keep only the rows that actually carry a label in "y".
df1 = (
    pd.read_csv("raw_data_with_labels.csv", index_col=0)
    .dropna(subset=["y"])
)
df1.shape

(498, 16)

Pelo Google Sheets fizemos o processo de rotulação, colocando 0 e 1 no arquivo salvo anteriormente.

In [8]:
# Second (newly labelled) batch: drop unlabelled rows and flag every remaining
# row with novo = 1 so it can be told apart after the two batches are merged.
df2 = (
    pd.read_csv("active_labels1_done.csv", index_col=0)
    .dropna(subset=["y"])
    .assign(novo=1)
)
df2.shape

(100, 18)

In [9]:
# Display the first row of the new batch to inspect which columns it carries.
df2.head(1)

Unnamed: 0,watch-title,y,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p,novo
505,Platform Overview - Machine Learning,0,4.298 visualizações,Publicado em 21 de mai. de 2019,Ciência e tecnologia,Platform Overview - Machine Learning,Google Cloud Platform\n\n\n\n\n\n\n\n\n\n\n\n\...,4.298 visualizações\n\n\n\n\n\n\n\n141\n\nGost...,https://i.ytimg.com/vi/QR_LQQ-vvko/maxresdefau...,1280.0,720.0,"In this short GCP Essentials video, see how GC...",1280.0,720.0,Alexis Moussine Pouchkine,/channel/UCJS9pqu9BzkAMNTmzNMNhvg,0.502,1


Avaliando se as métricas do novo dataset estão perto ou longe do que era esperado. 
<br> Nos dados de validação que temos, a average precision era 0.19 e agora, como se vê a seguir, deu 0.20; então está perto o bastante e o modelo está generalizando.
<br> O nosso AUC era 0.58 e aqui está mais baixo (0.54), porém ele é sensível ao número de exemplos.

In [10]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Compare the labels of the new batch ("y") with the scores stored in column "p"
# (presumably the earlier model's predicted probability - confirm against the
# notebook that produced active_labels1_done.csv).
# Result is displayed as (average precision, ROC AUC).
tuple(
    metrica(df2["y"], df2["p"])
    for metrica in (average_precision_score, roc_auc_score)
)

(0.2037344613689981, 0.5386250885896527)

Juntando os dois dataframes

In [11]:
# Stack both labelled batches; the score column "p" exists only in the new
# batch, so it is removed before concatenating.
df2_sem_p = df2.drop(columns="p")
df = pd.concat([df1, df2_sem_p])

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

Parte de criação do DataFrame vazio, já realizada anteriormente

In [13]:
# Start the cleaned frame with the video title and the "novo" flag.
# Rows coming from the first batch have no "novo" value after the concat,
# so the missing flag is filled with 0.
df_limpo = (
    df[["watch-title", "novo"]]
    .rename(columns={"watch-title": "title"})
    .fillna({"novo": 0})
)

# 1. Limpeza de dados

In [14]:
# Turn the Portuguese publication text (e.g. "Publicado em 21 de mai. de 2019")
# into a datetime column: extract day / month abbreviation / year, translate the
# month to its English abbreviation and parse the result with "%d %b %Y".
date_parts = df["watch-time-text"].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")

# Portuguese month abbreviations -> English ones understood by "%b".
mapa_meses = {"jan": "Jan",
              "fev": "Feb",
              "mar": "Mar",
              "abr": "Apr",
              "mai": "May",
              "jun": "Jun",
              "jul": "Jul",
              "ago": "Aug",
              "set": "Sep",
              "out": "Oct",
              "nov": "Nov",
              "dez": "Dec"}

# Vectorised string operations propagate NaN, so a row whose text did not match
# the pattern (or whose month is not in the map) no longer blows up with an
# opaque TypeError inside a lambda; it is reported explicitly below instead.
clean_date = (
    date_parts[0].str.zfill(2)          # zero-pad the day: "5" -> "05"
    + " " + date_parts[1].map(mapa_meses)
    + " " + date_parts[2]
)

# Fail early and say which raw values could not be understood.
nao_reconhecidas = clean_date.isna()
if nao_reconhecidas.any():
    raise ValueError(
        "Could not parse the publication date of {} row(s); examples: {}".format(
            int(nao_reconhecidas.sum()),
            df["watch-time-text"][nao_reconhecidas].unique()[:5].tolist(),
        )
    )

# NOTE: "%b" matches month abbreviations of the active locale; the English
# names produced above assume an English/C locale.
df_limpo["date"] = pd.to_datetime(clean_date, format="%d %b %Y")

# 2. Limpeza de views

In [15]:
# Parse the view counter (e.g. "4.298 visualizações") into an integer.
#
# * The pattern accepts a run of digits followed by any number of ".ddd"
#   thousands groups, so single-digit counts ("5") and counts of a million or
#   more ("1.234.567") are captured in full. The previous pattern
#   r"(\d+\.?\d+)" needed at least two digits and stopped after the first
#   thousands group.
# * regex=False makes the "." removal a literal replacement on every pandas
#   version instead of depending on the version-specific default.
# * Rows without any number in the text are counted as 0 views.
views = (
    df["watch-view-count"]
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_limpo["views"] = views

# 3. Features

In [16]:
# Target vector, copied so later edits to it cannot touch df.
y = df["y"].copy()

# Age of every video, in days, measured from the fixed reference date 2019-12-03.
# It is only an intermediate value and is not kept as a feature.
dias_desde_pub = (pd.to_datetime("2019-12-3") - df_limpo["date"]) / np.timedelta64(1, "D")

# Numeric feature table: raw view count and average views per day since publication.
features = df_limpo[["views"]].copy()
features["views_por_dia"] = features["views"] / dias_desde_pub

In [17]:
# Display the first rows of the numeric features (views, views_por_dia).
features.head()

Unnamed: 0,views,views_por_dia
0,28028,61.464912
1,1131,2.960733
2,1816,8.446512
3,1171,10.455357
4,1228,3.336957


## Aumenta validação

In [18]:
# The AUC turned out to be very unstable, so in this experiment the newly
# labelled examples are only allowed into the validation split; we then look at
# the estimate and its error.
data_corte = "2019-04-01"
publicado_antes = df_limpo["date"] < data_corte

# Train: published before the cut-off AND not part of the new batch.
mask_train = publicado_antes & (df_limpo["novo"] == 0)
# Validation: everything published on or after the cut-off (old and new rows).
mask_val = df_limpo["date"] >= data_corte

Xtrain, ytrain = features[mask_train], y[mask_train]
Xval, yval = features[mask_val], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((228, 2), (316, 2), (228,), (316,))

Fizemos os mesmos passos já realizados anteriormente

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Titles of the rows selected by each split mask.
title_train = df_limpo.loc[mask_train, "title"]
title_val = df_limpo.loc[mask_val, "title"]

# TF-IDF bag of words over the titles; min_df=2 drops terms that appear in fewer
# than two training titles. The vocabulary is learned on the training titles
# only and then re-used, unchanged, to encode the validation titles.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [21]:
# (number of training titles, number of TF-IDF terms kept by the vectorizer)
title_bow_train.shape

(228, 193)

In [22]:
from scipy.sparse import hstack, vstack

# Append the TF-IDF title columns to the right of the numeric features,
# separately for the training and the validation split.
Xtrain_wtitle, Xval_wtitle = (
    hstack([numericas, titulos_bow])
    for numericas, titulos_bow in ((Xtrain, title_bow_train), (Xval, title_bow_val))
)

In [23]:
# Shapes of the combined (numeric + title) matrices for train and validation.
Xtrain_wtitle.shape, Xval_wtitle.shape

((228, 195), (316, 195))

In [24]:
# Random forest on numeric + title features. class_weight="balanced" re-weights
# the classes, random_state=0 keeps the run reproducible, n_jobs=4 trains the
# trees on four workers.
rf_params = {
    "n_estimators": 1000,
    "random_state": 0,
    "class_weight": "balanced",
    "n_jobs": 4,
}
mdl = RandomForestClassifier(**rf_params)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=4, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [25]:
# Keep column 1 of the predicted class probabilities for the validation rows
# (the second entry of mdl.classes_; presumably label 1 - confirm the labels are 0/1).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [26]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [27]:
# Validation metrics for this experiment, displayed as (average precision, ROC AUC).
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.18659811616991204, 0.5929353493222106)

Concluímos que a variação foi pequena e praticamente nada mudou. Com isso, iremos aumentar o treino.

## Aumentando treino
Usaremos os mesmos dados para validação e aumentaremos os dados de treino com os novos exemplos

In [30]:
# Training now takes every example published before the cut-off, including the
# newly labelled ones. Validation keeps every row published on or after the
# cut-off; the variant that restricts validation to novo == 0 is not used.
mask_train = df_limpo["date"] < "2019-04-01"
mask_val = df_limpo["date"] >= "2019-04-01"

Xtrain, ytrain = features[mask_train], y[mask_train]
Xval, yval = features[mask_val], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((282, 2), (316, 2), (282,), (316,))

In [31]:
# Same pipeline as the previous experiment, now fed with the enlarged training split.

# 1) Titles for each split.
title_train, title_val = (
    df_limpo.loc[mascara, "title"] for mascara in (mask_train, mask_val)
)

# 2) TF-IDF vocabulary learned on the training titles only, re-used for validation.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

# 3) Numeric features with the title columns appended on the right.
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

# 4) Fit the forest (fit returns the estimator itself) and score validation rows.
mdl = RandomForestClassifier(
    n_estimators=1000,
    random_state=0,
    class_weight="balanced",
    n_jobs=4,
).fit(Xtrain_wtitle, ytrain)

p = mdl.predict_proba(Xval_wtitle)[:, 1]

# 5) Displayed as (average precision, ROC AUC).
tuple(metrica(yval, p) for metrica in (average_precision_score, roc_auc_score))

(0.19316899908774002, 0.6100104275286757)