In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json 

import glob
import tqdm

# Raise the pandas display limit to 131 columns.
# NOTE(review): "max.columns" is presumably resolved by pandas' option
# pattern matching to "display.max_columns" - confirm on the pandas version in use.
pd.set_option("max.columns", 131)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): %pylab injects numpy and matplotlib names into the interactive
# namespace (see the "Populating the interactive namespace" message it prints).
# None of the cells visible here appear to rely on those names - consider removing.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the labelled export (first CSV column becomes the index) and keep only
# the rows that actually have a label in "y".
df = (
    pd.read_csv("labels_curso - to_label_2.csv", index_col=0)
    .dropna(subset=["y"])
)

In [3]:
# Fraction of rows that duplicate an earlier row across all columns.
df.duplicated().mean()

0.0

In [4]:
# Fraction of rows whose 'watch-title' repeats the title of an earlier row.
df.duplicated(['watch-title']).mean()

0.0

In [5]:
# (rows, columns) of the labelled frame, i.e. after rows without "y" were dropped.
df.shape

(1164, 16)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Start the cleaned frame from the raw title column, renamed to 'title' and
# sharing the index of `df`.
df_limpo = df['watch-title'].to_frame(name='title')

# 1. Limpeza de dados

In [8]:
# Portuguese month abbreviations -> English ones, so that "%b" can parse them.
mapa_meses = dict(zip(
    ("jan", "fev", "mar", "abr", "mai", "jun",
     "jul", "ago", "set", "out", "nov", "dez"),
    ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
))

# Pull (day, month abbreviation, year) out of text shaped like "3 de dez. de 2019".
clean_date = df["watch-time-text"].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")

# Left-pad single-digit days with a zero and translate the month abbreviation.
clean_date[0] = clean_date[0].map(lambda dia: dia if len(dia) != 1 else "0" + dia[0])
clean_date[1] = clean_date[1].map(mapa_meses)

# Re-assemble "DD Mon YYYY" strings and parse them into timestamps.
clean_date = clean_date.apply(lambda partes: " ".join(partes), axis=1)
df_limpo['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

# 2. Limpeza de views

In [9]:
# Parse the view counter text into an integer number of views.
#
# The counter uses "." as a thousands separator (e.g. "141.646 ..."). The
# pattern accepts any number of ".ddd" groups: the previous r"(\d+\.?\d*)"
# stopped after the first group, so "1.234.567" was read as 1234.
# `regex=False` makes the "." removal an explicit literal replacement instead
# of relying on the pandas-version-dependent default of `str.replace`.
# Rows where no number is found are counted as 0 views.
views = (
    df['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_limpo['views'] = views

# 3. Features

In [10]:
# Target vector, copied so later edits cannot touch the raw frame.
y = df['y'].copy()

# Numeric features: total views and average views per day, where the age of
# each video is measured in days relative to the fixed date 2019-12-03.
features = df_limpo[['views']].copy()
features['views_por_dia'] = features['views'] / (
    (pd.to_datetime("2019-12-3") - df_limpo["date"]) / np.timedelta64(1, "D")
)

In [11]:
# Preview the first rows of the numeric feature table.
features.head()

Unnamed: 0,views,views_por_dia
0,28028,61.464912
394,1161,21.109091
393,141646,809.405714
392,325,21.666667
391,61,7.625


In [12]:
# Temporal hold-out: videos published before 2019-04-01 are used for training,
# those published on or after that date for validation.
mask_train = df_limpo['date'] < "2019-04-01"
mask_val = df_limpo['date'] >= "2019-04-01"

Xtrain, ytrain = features.loc[mask_train], y.loc[mask_train]
Xval, yval = features.loc[mask_val], y.loc[mask_val]

# Show the resulting shapes as a quick sanity check.
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((555, 2), (609, 2), (555,), (609,))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Titles of each split, selected with the same masks as the numeric features.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

# ngram_range=(1, 4): every run of 1 to 4 consecutive words becomes one column
# of the bag of words; min_df=2 keeps only terms seen in at least two titles.
# The vocabulary is learned on the training titles only and then reused,
# unchanged, to encode the validation titles.
title_vec = TfidfVectorizer(min_df=2, ngram_range=(1, 4))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [14]:
# (number of training titles, number of TF-IDF terms kept).
title_bow_train.shape

(555, 1333)

In [15]:
from scipy.sparse import hstack, vstack

# Append the TF-IDF title columns to the right of the numeric features,
# separately for the training and the validation split.
Xtrain_wtitle, Xval_wtitle = (
    hstack([numeric_part, title_part])
    for numeric_part, title_part in (
        (Xtrain, title_bow_train),
        (Xval, title_bow_val),
    )
)

In [16]:
# Both validation blocks must have the same number of rows to be stacked side by side.
title_bow_val.shape, Xval.shape

((609, 1333), (609, 2))

In [17]:
# Shapes after stacking: column count = numeric features + TF-IDF terms.
Xtrain_wtitle.shape, Xval_wtitle.shape

((555, 1335), (609, 1335))

In [18]:
# Random-forest baseline on numeric + title features; "balanced" class weights
# re-weight the classes inversely to their frequency in the training labels.
mdl = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [19]:
# Score the validation rows, keeping column 1 of predict_proba
# (presumably the positive class - assumes mdl.classes_ is [0, 1]; TODO confirm).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [20]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [21]:
# Validation metrics for the random forest: (average precision, ROC AUC).
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.20564709615419985, 0.6848794008374124)

## 5. LightGBM

In [22]:
from lightgbm import LGBMClassifier

In [23]:
# LightGBM baseline with library defaults apart from the seed, balanced class
# weights and the number of threads. Note that this rebinds `mdl`.
mdl = LGBMClassifier(
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=6, num_leaves=31,
               objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [24]:
# Shape of the stacked validation matrix that is scored next.
Xval_wtitle.shape

(609, 1335)

In [25]:
# Score the validation rows with the LightGBM model, keeping column 1 of
# predict_proba (presumably the positive class - assumes classes_ is [0, 1]; TODO confirm).
p = mdl.predict_proba(Xval_wtitle)[:, 1]



In [26]:
# Validation metrics for the default LightGBM model: (average precision, ROC AUC).
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.19802541702844073, 0.6238131745002063)

## 6. Bayesian Optimization

In [27]:
# Display the repr of the stacked validation matrix (sparse format, dtype, stored elements).
Xval_wtitle

<609x1335 sparse matrix of type '<class 'numpy.float64'>'
	with 6132 stored elements in COOrdinate format>

In [28]:
from skopt import forest_minimize

In [29]:
def tune_lgbm(params):
    """Objective for the hyper-parameter search: negative validation average precision.

    `params` is read by position:
        0: learning rate
        1: maximum tree depth (num_leaves is set to 2 ** max_depth)
        2: minimum number of samples per leaf (min_child_samples)
        3: row subsample ratio per tree, to limit overfitting
        4: column subsample ratio per tree, to limit overfitting
        5: number of trees
        6: min_df - minimum number of training titles a term must appear in
        7: largest n-gram size; the vectorizer uses ngram_range=(1, this)

    The TF-IDF vocabulary is refitted on `title_train` for every call, stacked
    next to the numeric features, and a LightGBM classifier is trained on the
    training split. The parameters and the validation ROC AUC are printed.

    Returns the negated average precision on the validation split, because the
    optimizer minimizes the objective.
    """
    print(params)

    lgbm_kwargs = dict(
        learning_rate=params[0],
        max_depth=params[1],
        num_leaves=2 ** params[1],
        min_child_samples=params[2],
        subsample=params[3],
        colsample_bytree=params[4],
        n_estimators=params[5],
        bagging_freq=1,
        random_state=0,
        class_weight="balanced",
        n_jobs=6,
    )

    # Text features depend on the sampled min_df / n-gram size, so rebuild them.
    vectorizer = TfidfVectorizer(min_df=params[6], ngram_range=(1, params[7]))
    bow_train = vectorizer.fit_transform(title_train)
    bow_val = vectorizer.transform(title_val)

    train_matrix = hstack([Xtrain, bow_train])
    val_matrix = hstack([Xval, bow_val])

    booster = LGBMClassifier(**lgbm_kwargs)
    booster.fit(train_matrix, ytrain)

    val_scores = booster.predict_proba(val_matrix)[:, 1]

    print(roc_auc_score(yval, val_scores))

    return -average_precision_score(yval, val_scores)


# Search space, in the positional order expected by tune_lgbm.
space = [
    (1e-3, 1e-1, 'log-uniform'),  # learning rate, sampled on a log scale
    (1, 10),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.),                   # subsample
    (0.05, 1.),                   # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # min_df
    (1, 5),                       # upper bound of ngram_range
]

# 20 random evaluations first, then model-guided ones, 50 calls in total.
res = forest_minimize(
    tune_lgbm,
    space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]
0.6016099545910243
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1559
Function value obtained: -0.1404
Current minimum: -0.1404
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]




0.5507312614259597
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1802
Function value obtained: -0.1203
Current minimum: -0.1404
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]




0.587884059680368
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2377
Function value obtained: -0.1487
Current minimum: -0.1487
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 828, 5, 2]




0.6245355900218199
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.6481
Function value obtained: -0.1555
Current minimum: -0.1555
Iteration No: 5 started. Evaluating function at random point.
[0.08530558241838007, 8, 19, 0.2137736299768322, 0.1313765544201984, 961, 4, 1]
0.5
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1558
Function value obtained: -0.1018
Current minimum: -0.1555
Iteration No: 6 started. Evaluating function at random point.
[0.003567949451535685, 10, 19, 0.7232951768944309, 0.7298538828427115, 939, 4, 3]




0.5755882526390281
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.6383
Function value obtained: -0.1388
Current minimum: -0.1555
Iteration No: 7 started. Evaluating function at random point.
[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 274, 1, 2]




0.585421949637318
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.4732
Function value obtained: -0.1444
Current minimum: -0.1555
Iteration No: 8 started. Evaluating function at random point.
[0.0015212976972079912, 3, 12, 0.44234694306528044, 0.399351303640462, 272, 3, 5]
0.5672288730317863
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.1586
Function value obtained: -0.1394
Current minimum: -0.1555
Iteration No: 9 started. Evaluating function at random point.
[0.01946212855369041, 9, 18, 0.5235636153223084, 0.6728679300083596, 747, 4, 5]




0.5670372117709501
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.4198
Function value obtained: -0.1260
Current minimum: -0.1555
Iteration No: 10 started. Evaluating function at random point.
[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 677, 2, 5]




0.5789644394645279
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.2870
Function value obtained: -0.1356
Current minimum: -0.1555
Iteration No: 11 started. Evaluating function at random point.
[0.0053139776214487944, 6, 9, 0.14251441334450304, 0.8175761405215897, 297, 1, 5]




0.5532081146429204
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.2087
Function value obtained: -0.1155
Current minimum: -0.1555
Iteration No: 12 started. Evaluating function at random point.
[0.0068572961982704935, 10, 5, 0.2390386584472456, 0.49053406102209746, 176, 2, 4]




0.5947986082443828
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.2542
Function value obtained: -0.1593
Current minimum: -0.1593
Iteration No: 13 started. Evaluating function at random point.
[0.00781968225875022, 3, 4, 0.7078936710077383, 0.31818755505678337, 275, 4, 4]
0.6085392463289496
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.1466
Function value obtained: -0.1478
Current minimum: -0.1593
Iteration No: 14 started. Evaluating function at random point.
[0.017293945600511968, 2, 15, 0.9007557574888567, 0.41026441194439994, 316, 5, 1]




0.5622014507283127
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.0778
Function value obtained: -0.1563
Current minimum: -0.1593
Iteration No: 15 started. Evaluating function at random point.
[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]




0.6094828094592204
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.3616
Function value obtained: -0.1570
Current minimum: -0.1593
Iteration No: 16 started. Evaluating function at random point.
[0.018353598126553926, 4, 3, 0.47305622526323254, 0.1404164811277527, 133, 4, 1]
0.6056495842424958
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.0708
Function value obtained: -0.1442
Current minimum: -0.1593
Iteration No: 17 started. Evaluating function at random point.
[0.0010383234748454694, 9, 19, 0.9256771571832196, 0.9321438677645206, 312, 4, 3]




0.5769741109866131
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 0.2715
Function value obtained: -0.1442
Current minimum: -0.1593
Iteration No: 18 started. Evaluating function at random point.
[0.004955229758078229, 5, 5, 0.06939551310802591, 0.4193273080472823, 725, 4, 1]
0.5457775549920386
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.1865
Function value obtained: -0.1091
Current minimum: -0.1593
Iteration No: 19 started. Evaluating function at random point.
[0.0699516121742407, 9, 10, 0.6477856515609233, 0.8594430701440198, 616, 1, 1]




0.5662558235536947
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.4415
Function value obtained: -0.1169
Current minimum: -0.1593
Iteration No: 20 started. Evaluating function at random point.
[0.0014752743467850462, 5, 4, 0.9747950537021096, 0.982207187458162, 909, 2, 4]




0.5634841068585246
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 0.9525
Function value obtained: -0.1357
Current minimum: -0.1593
Iteration No: 21 started. Searching for the next optimal point.
[0.010039791164582725, 9, 11, 0.6596823900439108, 0.18671284566187135, 467, 4, 1]




0.5734210060741877
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.4768
Function value obtained: -0.1295
Current minimum: -0.1593
Iteration No: 22 started. Searching for the next optimal point.
[0.0027864871304710045, 10, 2, 0.07791079181091931, 0.43600928176626413, 321, 2, 4]




0.5840066049419118
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.6293
Function value obtained: -0.1501
Current minimum: -0.1593
Iteration No: 23 started. Searching for the next optimal point.
[0.06619381737473454, 10, 6, 0.44652512546757567, 0.41805106564623606, 182, 3, 4]




0.5749248098130565
Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.4936
Function value obtained: -0.1360
Current minimum: -0.1593
Iteration No: 24 started. Searching for the next optimal point.
[0.005553759621868936, 8, 5, 0.28639722115326405, 0.5253328438063181, 545, 5, 4]




0.5883263548976824
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.5972
Function value obtained: -0.1373
Current minimum: -0.1593
Iteration No: 25 started. Searching for the next optimal point.
[0.03915771474743115, 10, 4, 0.8745284933940919, 0.609455271676343, 317, 2, 4]




0.6329539423247036
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.7847
Function value obtained: -0.1732
Current minimum: -0.1732
Iteration No: 26 started. Searching for the next optimal point.
[0.051594007805739604, 10, 7, 0.8482581530088819, 0.546993868790632, 271, 1, 5]




0.5723742407265435
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.5868
Function value obtained: -0.1226
Current minimum: -0.1732
Iteration No: 27 started. Searching for the next optimal point.
[0.06301493913876255, 9, 12, 0.8793393161309506, 0.5415154595335046, 354, 2, 4]




0.6143185705018577
Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.4949
Function value obtained: -0.1822
Current minimum: -0.1822
Iteration No: 28 started. Searching for the next optimal point.
[0.025670207809790957, 9, 12, 0.9590961508588095, 0.08714971326599637, 345, 2, 4]
0.6340449371940791




Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.4375
Function value obtained: -0.1694
Current minimum: -0.1822
Iteration No: 29 started. Searching for the next optimal point.
[0.08947810585132653, 10, 17, 0.8597011370253026, 0.19795847958523116, 418, 1, 4]




0.5857462994633484
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.5489
Function value obtained: -0.1341
Current minimum: -0.1822
Iteration No: 30 started. Searching for the next optimal point.
[0.057178386105103396, 10, 14, 0.8772743749758142, 0.5707545808135022, 213, 2, 4]




0.6109276405024473
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.4722
Function value obtained: -0.1685
Current minimum: -0.1822
Iteration No: 31 started. Searching for the next optimal point.
[0.08935049424620435, 10, 11, 0.9228414973696509, 0.4992991894823078, 115, 2, 5]
0.6397947750191662




Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.4085
Function value obtained: -0.1793
Current minimum: -0.1822
Iteration No: 32 started. Searching for the next optimal point.
[0.09696964106847213, 9, 11, 0.8844839452157665, 0.15179031797507975, 522, 2, 4]




0.6117237718936133
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.5850
Function value obtained: -0.1693
Current minimum: -0.1822
Iteration No: 33 started. Searching for the next optimal point.
[0.06792396653884321, 9, 7, 0.9492958295647879, 0.26326143728995494, 548, 2, 2]




0.6359910361502625
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.6234
Function value obtained: -0.1841
Current minimum: -0.1841
Iteration No: 34 started. Searching for the next optimal point.
[0.06541663789826244, 9, 5, 0.9403090579268377, 0.2599634417141055, 649, 2, 4]




0.636197440585009
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.8004
Function value obtained: -0.1739
Current minimum: -0.1841
Iteration No: 35 started. Searching for the next optimal point.
[0.07739399592421241, 7, 4, 0.951939801117928, 0.7388973312601077, 459, 2, 1]




0.6567494250162175
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.5241
Function value obtained: -0.1666
Current minimum: -0.1841
Iteration No: 36 started. Searching for the next optimal point.
[0.03674104387174218, 9, 14, 0.9006417514600535, 0.1878246674936721, 937, 2, 1]




0.5986760629828389
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.6962
Function value obtained: -0.1308
Current minimum: -0.1841
Iteration No: 37 started. Searching for the next optimal point.
[0.06949429312990789, 8, 3, 0.9126835754696586, 0.28961086840957667, 747, 2, 5]




0.6657132747537889
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.8870
Function value obtained: -0.2206
Current minimum: -0.2206
Iteration No: 38 started. Searching for the next optimal point.
[0.06440926762537982, 6, 5, 0.9133600121086718, 0.46308936621869917, 851, 1, 5]




0.554490770773132
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.7221
Function value obtained: -0.1148
Current minimum: -0.2206
Iteration No: 39 started. Searching for the next optimal point.
[0.07638183016628648, 2, 3, 0.9956258846934342, 0.24672482296735465, 612, 2, 5]
0.6581647697116235




Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.4404
Function value obtained: -0.1876
Current minimum: -0.2206
Iteration No: 40 started. Searching for the next optimal point.
[0.0894811429274582, 7, 1, 0.8563476470190547, 0.07065078561194435, 947, 2, 5]




0.688683139706316
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.9496
Function value obtained: -0.1988
Current minimum: -0.2206
Iteration No: 41 started. Searching for the next optimal point.
[0.07720974927372037, 6, 3, 0.9197621539898518, 0.6449917451643387, 964, 2, 5]




0.6638261484932476
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.9479
Function value obtained: -0.1959
Current minimum: -0.2206
Iteration No: 42 started. Searching for the next optimal point.
[0.075306870015126, 3, 1, 0.7757620213157056, 0.9417420093037455, 948, 2, 5]




0.6387037801497907
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.7623
Function value obtained: -0.1834
Current minimum: -0.2206
Iteration No: 43 started. Searching for the next optimal point.
[0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 5]




0.6883293035324645
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.9270
Function value obtained: -0.2378
Current minimum: -0.2378
Iteration No: 44 started. Searching for the next optimal point.
[0.09833170318391853, 1, 1, 0.754182980619871, 0.08605352068337667, 870, 3, 5]
0.6422863714100371




Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.4549
Function value obtained: -0.1818
Current minimum: -0.2378
Iteration No: 45 started. Searching for the next optimal point.
[0.07989873876843957, 8, 1, 0.07404579732974367, 0.15481933223031255, 980, 2, 5]




0.6324821607595683
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.8471
Function value obtained: -0.1613
Current minimum: -0.2378
Iteration No: 46 started. Searching for the next optimal point.
[0.0410589708939533, 9, 2, 0.9405453517375768, 0.1321498356955883, 911, 2, 5]




0.6750014743173911
Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 1.3385
Function value obtained: -0.1792
Current minimum: -0.2378
Iteration No: 47 started. Searching for the next optimal point.
[0.075934313018497, 8, 2, 0.9363697345353915, 0.15129274897268502, 765, 1, 5]




0.59317685911423
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.9462
Function value obtained: -0.1513
Current minimum: -0.2378
Iteration No: 48 started. Searching for the next optimal point.
[0.08403274099312374, 10, 1, 0.7305976215123771, 0.18239803719666697, 589, 4, 5]




0.5950344990269505
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 1.1025
Function value obtained: -0.1604
Current minimum: -0.2378
Iteration No: 49 started. Searching for the next optimal point.
[0.06892466655245144, 7, 5, 0.8261588403741187, 0.07729616979076112, 853, 2, 5]




0.6603467594503744
Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.6705
Function value obtained: -0.1719
Current minimum: -0.2378
Iteration No: 50 started. Searching for the next optimal point.
[0.06602730762014003, 10, 2, 0.6724229168392452, 0.07445872598097698, 855, 2, 5]




0.6782449725776964
Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 1.3191
Function value obtained: -0.2132
Current minimum: -0.2378


Lista de parâmetros com os melhores resultados

In [30]:
# Parameter values of the best point found by the search, in the same order as `space`.
res.x

[0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 5]

## 7. Logistic Regression

In [27]:
# MaxAbsScaler: scales each column by the maximum absolute value found in it
# StandardScaler: subtracts the mean and divides by the standard deviation (sparsity is lost)
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [46]:
# transformando os dataset em sparse
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

#scaler = StandardScaler()
scaler = MaxAbsScaler()

Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
Xval_wtitle2 = scaler.transform(Xval_wtitle2)

#Xtrain_wtitle2[:, :2] = scaler.fit_transform(Xtrain_wtitle2[:, :2].todense())
#Xval_wtitle2[:, :2] = scaler.transform(Xval_wtitle2[:, :2].todense())

In [47]:
# Shape of the scaled validation matrix.
Xval_wtitle2.shape

(609, 1335)

In [54]:
# C is the inverse of the regularization strength: smaller values mean a
# stronger penalty on the coefficients.
mdl = LogisticRegression(
    C=0.5,
    random_state=8,
    n_jobs=6,
)
mdl.fit(Xtrain_wtitle2, ytrain)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=6, penalty='l2', random_state=8,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [55]:
# Score the scaled validation rows, keeping column 1 of predict_proba
# (presumably the positive class - assumes mdl.classes_ is [0, 1]; TODO confirm).
p = mdl.predict_proba(Xval_wtitle2)[:,1]

In [56]:
# Validation metrics for the logistic regression: (average precision, ROC AUC).
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.2162535946903888, 0.6861178274458927)