# 4. After active learning / Ensemble

In [0]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier

from sklearn.pipeline import make_pipeline

from scipy.sparse import hstack, vstack, csr_matrix

import joblib as jb

## Resultado do Active Learning

In [0]:
# Read raw_data_with_labels.csv and keep only the rows that carry a label in `y`.
df1 = pd.read_csv("raw_data_with_labels.csv", index_col=0).dropna(subset=["y"])
df1.shape

(522, 17)

In [0]:
# Read active_labeled.csv, keep only labelled rows, and flag every one of them
# with novo=1 so they can be told apart from the df1 rows after concatenation.
df2 = (
    pd.read_csv("active_labeled.csv", index_col=0)
    .dropna(subset=["y"])
    .assign(novo=1)
)
df2.shape

(235, 19)

In [0]:
# (average precision, ROC AUC) of the stored score column `p` against the labels `y`.
# NOTE(review): `p` is presumably the previous model's prediction for these rows -- confirm.
tuple(metric(df2['y'], df2['p']) for metric in (average_precision_score, roc_auc_score))

(0.24140788556230647, 0.5093140153755175)

In [0]:
# Stack both labelled sets; the score column `p` exists only in df2, so drop it first.
# Index labels are kept as read from each CSV (they are not reset here).
df = pd.concat([df1, df2.drop(columns="p")])

In [0]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,watch-title,y,og:url,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,novo
0,#DataScience #Pandas #python Python Pandas Tut...,0.0,https://www.youtube.com/watch?v=--EdOZqByHo,62 visualizações,Publicado em 11 de abr. de 2020,Educação,#DataScience #Pandas #python Python Pandas Tut...,Code Mania\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarregan...,62 visualizações\n\n\n\n\n\n\n\n4\n\nGostou de...,https://i.ytimg.com/vi/--EdOZqByHo/hqdefault.jpg,480,360,This pandas tutorial covers basics on datafram...,640.0,360.0,python data science tutorial,/channel/UCiO8B22LQBecxz9JjYrk7yA,
1,Machine Learning Course A To Z || Beginner to ...,0.0,https://www.youtube.com/watch?v=-58kO_zYUGE,174.642 visualizações,Publicado em 10 de ago. de 2018,Educação,Machine Learning Course A To Z || Beginner to ...,Geek's Lesson\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarre...,174.642 visualizações\n\n\n\n\n\n\n\n5.121\n\n...,https://i.ytimg.com/vi/-58kO_zYUGE/maxresdefau...,1280,720,Welcome to this free online class on machine l...,640.0,360.0,Ai and machine learning course,/channel/UCKXx22vOENUyHrVAADq7Z_g,
2,Python For Data Science Full Course - 9 Hours ...,0.0,https://www.youtube.com/watch?v=-6RqxhNO2yY,14.707 visualizações,Publicado em 15 de mar. de 2020,Educação,#edureka #PythonEdureka #pythonfordatasciencef...,edureka!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarregan...,14.707 visualizações\n\n\n\n\n\n\n\n630\n\nGos...,https://i.ytimg.com/vi/-6RqxhNO2yY/maxresdefau...,1280,720,🔥Edureka Python Certification Training: https:...,1280.0,720.0,edureka,/channel/UCkw4JCwteGrDHIsyIIKo4tQ,
3,Michael I. Jordan: Machine Learning: Dynamical...,0.0,https://www.youtube.com/watch?v=-8yYFdV5SOc,4.021 visualizações,Publicado em 2 de mai. de 2019,Licença de atribuição Creative Commons (reutil...,#purdue #michaelijordan #engineering\n\n\n\n ...,Purdue Engineering\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,4.021 visualizações\n\n\n\n\n\n\n\n93\n\nGosto...,https://i.ytimg.com/vi/-8yYFdV5SOc/maxresdefau...,1280,720,2019 Purdue Engineering Distinguished Lecture ...,1280.0,720.0,electrical engineer,/channel/UC8FZ6dzFVkCACLH9YoMNFog,
4,From UX Designer to Data Scientist | How to Ch...,0.0,https://www.youtube.com/watch?v=-9GaAw2DZh8,20 visualizações,Publicado em 13 de abr. de 2020,Ciência e tecnologia,#UXDesign #DataScience #DataScientist\n\n\n\n ...,Radu Fotolescu\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarr...,20 visualizações\n\n\n\n\n\n\n\n2\n\nGostou de...,https://i.ytimg.com/vi/-9GaAw2DZh8/maxresdefau...,1280,720,This is how I changed my career from UX Design...,1280.0,720.0,data science career,/channel/UCHz35rvIKf2CMqj7oiMv9WQ,


In [0]:
# Start the cleaned frame on the same index as `df`: the video title plus the
# `novo` flag, where rows without a flag (those that came from df1) become 0.
df_clean = pd.DataFrame(index=df.index).assign(
    title=df['watch-title'],
    novo=df['novo'].fillna(0),
)

## Limpeza de dados

### Pré-processamento da data

In [0]:
# Timestamp at execution time and its calendar date formatted as "YYYY-MM-DD".
# Everything derived from these values depends on the day the notebook is run.
r_today = pd.to_datetime('today')
today = r_today.date().isoformat()

In [0]:
# Relative publication texts ("há N hor...") are rewritten to the absolute
# "Publicado em <day> de <month> de <year>" form using the run date.
# strftime("%b") produces a month abbreviation that is not a key of `month_map`
# below, so for those rows the month is supplied by the fillna() fallback.
today_post = r_today.strftime("Publicado em %d de %b de %Y")
df = df.replace({'watch-time-text': r"\bhá \b\d+\b hor.+\b"}, {'watch-time-text': today_post}, regex=True)

# Columns 0, 1, 2 hold day, month abbreviation and year; all NaN when the text does not match.
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-zA-Z]+)\.? de (\d+)")

# Fail with a readable message instead of an opaque TypeError further down
# when some publication text matches neither supported pattern.
unparsed = clean_date[0].isna()
if unparsed.any():
    examples = df['watch-time-text'][unparsed].unique()[:5].tolist()
    raise ValueError(
        f"{int(unparsed.sum())} 'watch-time-text' value(s) could not be parsed, e.g. {examples}"
    )

# Left-pad single-digit days to two characters.
clean_date[0] = clean_date[0].str.zfill(2)

# Portuguese month abbreviations -> the English ones expected by "%b".
month_map = {"jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr", "mai": "May", "jun": "Jun",
             "jul": "Jul", "ago": "Aug", "set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec"}

# Any abbreviation missing from the map falls back to the current month.
clean_date[1] = clean_date[1].map(month_map).fillna(r_today.strftime("%b"))

# Assemble "DD Mon YYYY" strings and parse them into datetimes.
clean_date = clean_date[0] + " " + clean_date[1] + " " + clean_date[2]
df_clean['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

### Pré-processamento das views

In [0]:
# Turn texts such as "174.642 visualizações" into integers.
# The pattern accepts any number of "."-separated groups, so counts of a million
# or more ("1.234.567") are captured whole instead of being cut after the second
# group. regex=False makes the "." removal a literal replacement regardless of the
# pandas version's default. Texts without any digits become 0.
views = (
    df['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_clean['views'] = views
df_clean.head()

Unnamed: 0,title,novo,date,views
0,#DataScience #Pandas #python Python Pandas Tut...,0.0,2020-04-11,62
1,Machine Learning Course A To Z || Beginner to ...,0.0,2018-08-10,174642
2,Python For Data Science Full Course - 9 Hours ...,0.0,2020-03-15,14707
3,Michael I. Jordan: Machine Learning: Dynamical...,0.0,2019-05-02,4021
4,From UX Designer to Data Scientist | How to Ch...,0.0,2020-04-13,20


### Definição das features

In [0]:
# Labels are copied out of `df`; the feature frame starts empty on df_clean's index.
y = df['y'].copy()
features = pd.DataFrame(index=df_clean.index)

In [0]:
# Age of each video in days, measured from the run date.
days_since_pub = (pd.to_datetime(today) - df_clean['date']) / np.timedelta64(1, 'D')

features['views'] = df_clean['views']

# Videos dated today have an age of 0 days, which would turn the ratio into
# inf (or NaN for 0 views); count them as one day old instead. Rows that are
# at least one day old are unaffected.
features['daily_views'] = features['views'] / days_since_pub.clip(lower=1)

In [0]:
# Time-based split: videos published before the cutoff are used for training,
# videos published on or after it for validation.
#
# Alternative kept for reference -- restrict both sides to the novo == 0 rows:
# mask_train = (df_clean['date'] < "2019-03-01") & (df_clean['novo'] == 0)
# mask_val = (df_clean['date'] >= "2019-03-01") & (df_clean['novo'] == 0)

# The literal previously carried a stray trailing apostrophe ("2019-03-01'"),
# which is not a valid date string.
split_date = "2019-03-01"
mask_train = df_clean['date'] < split_date
mask_val = df_clean['date'] >= split_date

Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((361, 2), (396, 2), (361,), (396,))

In [0]:
# Titles for each side of the split.
title_train = df_clean.loc[mask_train, 'title']
title_val = df_clean.loc[mask_val, 'title']

# TF-IDF over 1- to 3-grams that appear in at least two training titles.
# The vocabulary is fitted on the training titles only and reused for validation.
title_vec = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 3),
)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [0]:
# Append the title TF-IDF columns to the right of the numeric feature columns.
Xtrain_wtitle = hstack((Xtrain, title_bow_train))
Xval_wtitle = hstack((Xval, title_bow_val))

## Random Forest

In [0]:
# Random forest of 1000 trees with class weights balanced, seeded for repeatability.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl_rf.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [0]:
# Random-forest scores on the validation set: second column of predict_proba
# (the larger class label, 1.0 given the 0.0/1.0 labels in `y`).
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]

In [0]:
# Class balance of the validation labels.
yval.value_counts()

0.0    261
1.0    135
Name: y, dtype: int64

In [0]:
# (average precision, ROC AUC) of the random forest on the validation set.
tuple(metric(yval, p_rf) for metric in (average_precision_score, roc_auc_score))

(0.543742228397418, 0.6651340996168582)

## LightGBM

In [0]:
# Fixed hyper-parameters, in positional order:
# learning rate, max depth, min child samples, subsample, colsample_bytree,
# n_estimators, TF-IDF min_df, TF-IDF maximum n-gram length.
# NOTE(review): presumably the result of an earlier hyper-parameter search -- confirm.
params = [0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 3]
(
    lr,
    max_depth,
    min_child_samples,
    subsample,
    colsample_bytree,
    n_estimators,
    min_df,
    max_ngram,
) = params
ngram_range = (1, max_ngram)

# Re-fit the title vectorizer with the parameters above and rebuild both matrices.
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack((Xtrain, title_bow_train))
Xval_wtitle = hstack((Xval, title_bow_val))

# num_leaves is tied to the depth limit: 2 ** max_depth.
mdl_lgbm = LGBMClassifier(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)

# LightGBM scores on the validation set (second predict_proba column).
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]



In [0]:
# (average precision, ROC AUC) of LightGBM on the validation set.
tuple(metric(yval, p_lgbm) for metric in (average_precision_score, roc_auc_score))

(0.5598577831354639, 0.6870725131261529)

## Logistic Regression

In [0]:
# CSR copies of the stacked feature matrices for the linear model.
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

# Scaling is done inside the pipeline (MaxAbsScaler over all columns), so the
# fitted pipeline object carries its own scaler. Earlier experiments that scaled
# the matrices by hand with StandardScaler / MaxAbsScaler were left disabled.
lr_pipeline = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(C=0.5, penalty='l2', n_jobs=6, random_state=0),
)
lr_pipeline.fit(Xtrain_wtitle2, ytrain)

Pipeline(memory=None,
         steps=[('maxabsscaler', MaxAbsScaler(copy=True)),
                ('logisticregression',
                 LogisticRegression(C=0.5, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=6, penalty='l2',
                                    random_state=0, solver='lbfgs', tol=0.0001,
                                    verbose=0, warm_start=False))],
         verbose=False)

In [0]:
# Logistic-regression scores on the validation set (second predict_proba column).
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:, 1]

In [0]:
# (average precision, ROC AUC) of the logistic regression on the validation set.
tuple(metric(yval, p_lr) for metric in (average_precision_score, roc_auc_score))

(0.5470119603597027, 0.705349794238683)

## Ensemble

In [0]:
# Equal-weight blend of the two tree models, scored as (average precision, ROC AUC).
p = 0.5 * p_rf + 0.5 * p_lgbm
tuple(metric(yval, p) for metric in (average_precision_score, roc_auc_score))

(0.5794819983268916, 0.6996736199801333)

In [0]:
# Simple average of all three models, scored as (average precision, ROC AUC).
p = (p_lr + p_rf + p_lgbm) / 3
tuple(metric(yval, p) for metric in (average_precision_score, roc_auc_score))

(0.5873575133266696, 0.712813963388676)

In [0]:
# Pairwise correlation between the three models' validation scores.
pd.DataFrame(dict(LR=p_lr, RF=p_rf, LGBM=p_lgbm)).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.682666,0.613553
RF,0.682666,1.0,0.675156
LGBM,0.613553,0.675156,1.0


In [0]:
# Persist the three fitted models and, last, the title vectorizer they share.
for model, path in (
    (mdl_lgbm, "./5-web-app/mllgbm_20200420.pkl.z"),
    (mdl_rf, "./5-web-app/mlrandom_forest_20200420.pkl.z"),
    (lr_pipeline, "./5-web-app/mllogistic_reg_20200420.pkl.z"),
):
    jb.dump(model, path)
jb.dump(title_vec, "./5-web-app/title_vectorizer_20200420.pkl.z")

['./5-web-app/title_vectorizer_20200420.pkl.z']