In [287]:
import pandas as pd
import numpy as np
import time

pd.set_option('max.columns', 131)
%matplotlib inline
%pylab inline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

from sklearn.metrics import roc_auc_score, average_precision_score
from skopt import forest_minimize

from sklearn.pipeline import make_pipeline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [168]:
# Load the labelled videos (first CSV column is the index) and keep only the
# rows that have a value in the target column 'y'.
df = (
    pd.read_csv('labels_curso - to_label_2.csv', index_col=0)
    .dropna(subset=['y'])
)

In [169]:
# Cleaned frame: shares the index of the raw frame and starts with the title column.
df_limpo = pd.DataFrame({'title': df['watch-title']})

## 1. Limpeza da data

In [170]:
# Month abbreviations captured from the scraped text, mapped to the abbreviations
# used with the "%b" directive of the date format below.
mapa_meses = {"jan": "Jan",
              "fev": "Feb",
              "mar": "Mar",
              "abr": "Apr",
              "mai": "May",
              "jun": "Jun",
              "jul": "Jul",
              "ago": "Aug",
              "set": "Sep",
              "out": "Oct",
              "nov": "Nov",
              "dez": "Dec"}

# Capture day, month abbreviation and year from text shaped like
# "<day> de <mon>. de <year>"; rows that do not match become NaN in every column.
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")

# Left-pad single-digit days with a zero. The vectorised .str accessor lets NaN
# (unmatched rows) pass through instead of raising inside a Python lambda.
clean_date[0] = clean_date[0].str.zfill(2)
clean_date[1] = clean_date[1].map(mapa_meses)

# Build "DD Mon YYYY" with element-wise concatenation so a missing piece yields
# NaN (parsed as NaT below) rather than a TypeError from str.join.
clean_date = clean_date[0] + " " + clean_date[1] + " " + clean_date[2]
df_limpo['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

## 2. Limpeza de Views

In [171]:
# Extract the first number in the text, including every "."-separated group
# (e.g. "1.234.567"): the previous pattern stopped after the first group and
# truncated larger counts. The dots are then removed as literal characters
# (regex=False makes that explicit regardless of the pandas default), rows
# without any number count as 0, and the result is cast to int.
views = (
    df['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_limpo['views'] = views

## 3. Features

In [172]:
# Target vector is an independent copy of the label column; the feature frame
# starts with no columns and uses the same index as the cleaned frame.
y = df['y'].copy()
features = pd.DataFrame(index=df_limpo.index)

In [173]:
# Days elapsed between each video's date and the fixed reference date 2019-12-03.
# It is only an intermediate value, so it never becomes a feature column.
# NOTE(review): a video dated exactly 2019-12-03 would give a zero divisor below.
dias_desde_pub = (pd.to_datetime("2019-12-03") - df_limpo['date']) / np.timedelta64(1, 'D')

features = features.assign(
    views=df_limpo['views'],
    views_por_dia=lambda f: f['views'] / dias_desde_pub,
)

In [174]:
# Time-based split: rows dated before the cutoff go to training, rows dated on
# or after it go to validation.
data_corte = "2019-04-01"
mask_train = df_limpo['date'] < data_corte
mask_val = df_limpo['date'] >= data_corte

Xtrain, ytrain = features[mask_train], y[mask_train]
Xval, yval = features[mask_val], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((555, 2), (609, 2), (555,), (609,))

In [175]:
# Titles for each side of the split, selected with the same boolean masks.
title_train = df_limpo.loc[mask_train, 'title']
title_val = df_limpo.loc[mask_val, 'title']

## Vectorizer

In [176]:
# TF-IDF over 1- to 3-grams of the title; min_df=2 drops terms found in fewer than 2 titles.
title_vec = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=2,
)

In [177]:
# Fit the vectorizer on the training titles only, then apply that same fitted
# vectorizer to the validation titles.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [178]:
# (rows, vocabulary size) of the training TF-IDF matrix.
title_bow_train.shape

(555, 1144)

### Hstack

In [179]:
# Column-wise concatenation: the numeric feature columns come first, followed
# by the TF-IDF title columns.
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [180]:
Xtrain_wtitle.shape  # TF-IDF columns plus the numeric feature columns from Xtrain

(555, 1146)

# Random Forest

In [283]:
# Random forest baseline on the stacked numeric + title features.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl_rf.fit(Xtrain_wtitle, ytrain)

# Keep the second probability column (index 1) as the score for each validation row.
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]

# Displayed as (average precision, ROC AUC) on the validation set.
average_precision_score(yval, p_rf), roc_auc_score(yval, p_rf)

(0.2284201947891743, 0.6926785398360559)

# LGBM

In [190]:
# LightGBM baseline: library defaults apart from class weighting, seed and threads.
mdl = LGBMClassifier(
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl.fit(Xtrain_wtitle, ytrain)

# Keep the second probability column (index 1) as the score for each validation row.
p_lg = mdl.predict_proba(Xval_wtitle)[:, 1]

# Displayed as (average precision, ROC AUC) on the validation set.
average_precision_score(yval, p_lg), roc_auc_score(yval, p_lg)



(0.17930936519845178, 0.6177389868490889)

# Bayesian Optimization
    LGBM

In [279]:
def tune_lgbm(params, tune=True):
    """Fit a TF-IDF + LightGBM model for one hyper-parameter point and score it.

    The function reads the notebook-level ``title_train``/``title_val``,
    ``Xtrain``/``Xval`` and ``ytrain``/``yval``; everything it builds
    (vectorizer, stacked matrices, model) is local to the call.

    Parameters
    ----------
    params : sequence
        The first eight entries are used, in this order: learning_rate,
        max_depth, min_child_samples, subsample, colsample_bytree,
        n_estimators, min_df (TF-IDF) and the upper bound of the TF-IDF
        n-gram range (the lower bound is always 1).
    tune : bool, default True
        When True, return only the objective value for the optimizer.
        When False, print the validation metrics and return the artifacts.

    Returns
    -------
    float
        If ``tune`` is True: the negated validation average precision, so that
        a minimizer maximizes average precision.
    tuple
        If ``tune`` is False: ``(p_lgbm, mdl, title_vec)`` - the validation
        scores, the fitted model and the fitted vectorizer.
    """
    # Parameters (only the first eight positions are read).
    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = params[:8]

    # Vectorization: fitted on the training titles, reused for validation.
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    title_bow_train = title_vec.fit_transform(title_train)
    title_bow_val = title_vec.transform(title_val)

    # Column-wise concatenation of numeric features and title features.
    Xtrain_wtitle = hstack([Xtrain, title_bow_train])
    Xval_wtitle = hstack([Xval, title_bow_val])

    # Model
    # NOTE(review): num_leaves is fixed at 2, which caps every tree at two
    # leaves, so the tuned max_depth probably has little or no effect. Confirm
    # whether something like 2 ** max_depth was intended before changing it,
    # since that would alter all recorded results.
    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2, max_depth=max_depth,
                         min_child_samples=min_child_samples, subsample=subsample,
                         colsample_bytree=colsample_bytree, bagging_freq=1,
                         n_estimators=n_estimators, random_state=0,
                         class_weight='balanced', n_jobs=6)
    mdl.fit(Xtrain_wtitle, ytrain)

    # Score for each validation row: second probability column (index 1).
    p_lgbm = mdl.predict_proba(Xval_wtitle)[:, 1]

    if tune:
        # Optimizer mode: negate so that lower is better.
        return -average_precision_score(yval, p_lgbm)
    else:
        # Evaluation mode: report both metrics and hand back the artifacts.
        print(average_precision_score(yval, p_lgbm), roc_auc_score(yval, p_lgbm))
        return p_lgbm, mdl, title_vec

## Search Space and Minimization for LightGBM

In [184]:
# space is a range or dimension that it's possible combine for get the best combination for best score
# Search space: one (low, high[, prior]) entry per hyper-parameter, listed in
# the same positional order in which tune_lgbm reads them.
space = [
    (1e-3, 1e-1, 'log-uniform'),  # learning rate
    (1, 11),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.),                   # subsample
    (0.05, 1.),                   # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # min_df
    (1, 5),                       # upper bound of ngram_range
]

# Run the search: 100 objective evaluations, the first 20 at random points.
# `res.x` then holds the best parameter list found.
res = forest_minimize(
    func=tune_lgbm,
    dimensions=space,
    n_calls=100,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1232
Function value obtained: -0.1185
Current minimum: -0.1185
Iteration No: 2 started. Evaluating function at random point.




Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.2740
Function value obtained: -0.1203
Current minimum: -0.1203
Iteration No: 3 started. Evaluating function at random point.




Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2065
Function value obtained: -0.1175
Current minimum: -0.1203
Iteration No: 4 started. Evaluating function at random point.




Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.2826
Function value obtained: -0.1222
Current minimum: -0.1222
Iteration No: 5 started. Evaluating function at random point.




Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.2335
Function value obtained: -0.1030
Current minimum: -0.1222
Iteration No: 6 started. Evaluating function at random point.




Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.3339
Function value obtained: -0.1218
Current minimum: -0.1222
Iteration No: 7 started. Evaluating function at random point.




Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.3059
Function value obtained: -0.1413
Current minimum: -0.1413
Iteration No: 8 started. Evaluating function at random point.
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.1863
Function value obtained: -0.1094
Current minimum: -0.1413
Iteration No: 9 started. Evaluating function at random point.




Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.3631
Function value obtained: -0.1261
Current minimum: -0.1413
Iteration No: 10 started. Evaluating function at random point.




Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.3444
Function value obtained: -0.1155
Current minimum: -0.1413
Iteration No: 11 started. Evaluating function at random point.




Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.2563
Function value obtained: -0.1344
Current minimum: -0.1413
Iteration No: 12 started. Evaluating function at random point.
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.1626
Function value obtained: -0.1273
Current minimum: -0.1413
Iteration No: 13 started. Evaluating function at random point.




Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.2015
Function value obtained: -0.1270
Current minimum: -0.1413
Iteration No: 14 started. Evaluating function at random point.
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.1277
Function value obtained: -0.1170
Current minimum: -0.1413
Iteration No: 15 started. Evaluating function at random point.




Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.2374
Function value obtained: -0.1327
Current minimum: -0.1413
Iteration No: 16 started. Evaluating function at random point.
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.0858
Function value obtained: -0.1192
Current minimum: -0.1413
Iteration No: 17 started. Evaluating function at random point.




Iteration No: 17 ended. Evaluation done at random point.
Time taken: 0.1795
Function value obtained: -0.1140
Current minimum: -0.1413
Iteration No: 18 started. Evaluating function at random point.




Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.1985
Function value obtained: -0.1073
Current minimum: -0.1413
Iteration No: 19 started. Evaluating function at random point.




Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.2577
Function value obtained: -0.1415
Current minimum: -0.1415
Iteration No: 20 started. Evaluating function at random point.




Iteration No: 20 ended. Evaluation done at random point.
Time taken: 0.6269
Function value obtained: -0.1111
Current minimum: -0.1415
Iteration No: 21 started. Searching for the next optimal point.




Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.4685
Function value obtained: -0.1151
Current minimum: -0.1415
Iteration No: 22 started. Searching for the next optimal point.




Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.5425
Function value obtained: -0.1342
Current minimum: -0.1415
Iteration No: 23 started. Searching for the next optimal point.




Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.6579
Function value obtained: -0.1165
Current minimum: -0.1415
Iteration No: 24 started. Searching for the next optimal point.




Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.4279
Function value obtained: -0.1124
Current minimum: -0.1415
Iteration No: 25 started. Searching for the next optimal point.




Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.4129
Function value obtained: -0.1427
Current minimum: -0.1427
Iteration No: 26 started. Searching for the next optimal point.




Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.3937
Function value obtained: -0.1123
Current minimum: -0.1427
Iteration No: 27 started. Searching for the next optimal point.




Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.4424
Function value obtained: -0.1545
Current minimum: -0.1545
Iteration No: 28 started. Searching for the next optimal point.




Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.4329
Function value obtained: -0.1528
Current minimum: -0.1545
Iteration No: 29 started. Searching for the next optimal point.




Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.4219
Function value obtained: -0.1202
Current minimum: -0.1545
Iteration No: 30 started. Searching for the next optimal point.




Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.5408
Function value obtained: -0.1194
Current minimum: -0.1545
Iteration No: 31 started. Searching for the next optimal point.




Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.4170
Function value obtained: -0.1491
Current minimum: -0.1545
Iteration No: 32 started. Searching for the next optimal point.




Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.4494
Function value obtained: -0.1312
Current minimum: -0.1545
Iteration No: 33 started. Searching for the next optimal point.




Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.4423
Function value obtained: -0.1338
Current minimum: -0.1545
Iteration No: 34 started. Searching for the next optimal point.




Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.4329
Function value obtained: -0.1107
Current minimum: -0.1545
Iteration No: 35 started. Searching for the next optimal point.




Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.4312
Function value obtained: -0.1192
Current minimum: -0.1545
Iteration No: 36 started. Searching for the next optimal point.




Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.4099
Function value obtained: -0.1227
Current minimum: -0.1545
Iteration No: 37 started. Searching for the next optimal point.




Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.4298
Function value obtained: -0.1268
Current minimum: -0.1545
Iteration No: 38 started. Searching for the next optimal point.




Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.4434
Function value obtained: -0.1252
Current minimum: -0.1545
Iteration No: 39 started. Searching for the next optimal point.




Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.5464
Function value obtained: -0.1086
Current minimum: -0.1545
Iteration No: 40 started. Searching for the next optimal point.




Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.4412
Function value obtained: -0.1317
Current minimum: -0.1545
Iteration No: 41 started. Searching for the next optimal point.




Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.4188
Function value obtained: -0.1498
Current minimum: -0.1545
Iteration No: 42 started. Searching for the next optimal point.




Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.4296
Function value obtained: -0.1368
Current minimum: -0.1545
Iteration No: 43 started. Searching for the next optimal point.




Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.4265
Function value obtained: -0.1374
Current minimum: -0.1545
Iteration No: 44 started. Searching for the next optimal point.




Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.3901
Function value obtained: -0.1547
Current minimum: -0.1547
Iteration No: 45 started. Searching for the next optimal point.




Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.4372
Function value obtained: -0.1311
Current minimum: -0.1547
Iteration No: 46 started. Searching for the next optimal point.




Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 0.4333
Function value obtained: -0.1519
Current minimum: -0.1547
Iteration No: 47 started. Searching for the next optimal point.




Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.5166
Function value obtained: -0.1376
Current minimum: -0.1547
Iteration No: 48 started. Searching for the next optimal point.




Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 0.4620
Function value obtained: -0.1261
Current minimum: -0.1547
Iteration No: 49 started. Searching for the next optimal point.




Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.4303
Function value obtained: -0.1389
Current minimum: -0.1547
Iteration No: 50 started. Searching for the next optimal point.




Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.4313
Function value obtained: -0.1715
Current minimum: -0.1715
Iteration No: 51 started. Searching for the next optimal point.




Iteration No: 51 ended. Search finished for the next optimal point.
Time taken: 0.4429
Function value obtained: -0.1525
Current minimum: -0.1715
Iteration No: 52 started. Searching for the next optimal point.




Iteration No: 52 ended. Search finished for the next optimal point.
Time taken: 0.4423
Function value obtained: -0.1508
Current minimum: -0.1715
Iteration No: 53 started. Searching for the next optimal point.




Iteration No: 53 ended. Search finished for the next optimal point.
Time taken: 0.4423
Function value obtained: -0.1166
Current minimum: -0.1715
Iteration No: 54 started. Searching for the next optimal point.




Iteration No: 54 ended. Search finished for the next optimal point.
Time taken: 0.4225
Function value obtained: -0.1659
Current minimum: -0.1715
Iteration No: 55 started. Searching for the next optimal point.




Iteration No: 55 ended. Search finished for the next optimal point.
Time taken: 0.4197
Function value obtained: -0.1578
Current minimum: -0.1715
Iteration No: 56 started. Searching for the next optimal point.




Iteration No: 56 ended. Search finished for the next optimal point.
Time taken: 0.5925
Function value obtained: -0.1278
Current minimum: -0.1715
Iteration No: 57 started. Searching for the next optimal point.




Iteration No: 57 ended. Search finished for the next optimal point.
Time taken: 0.4250
Function value obtained: -0.1592
Current minimum: -0.1715
Iteration No: 58 started. Searching for the next optimal point.




Iteration No: 58 ended. Search finished for the next optimal point.
Time taken: 0.4551
Function value obtained: -0.1644
Current minimum: -0.1715
Iteration No: 59 started. Searching for the next optimal point.




Iteration No: 59 ended. Search finished for the next optimal point.
Time taken: 0.4174
Function value obtained: -0.1285
Current minimum: -0.1715
Iteration No: 60 started. Searching for the next optimal point.




Iteration No: 60 ended. Search finished for the next optimal point.
Time taken: 0.4023
Function value obtained: -0.1442
Current minimum: -0.1715
Iteration No: 61 started. Searching for the next optimal point.




Iteration No: 61 ended. Search finished for the next optimal point.
Time taken: 0.4244
Function value obtained: -0.1143
Current minimum: -0.1715
Iteration No: 62 started. Searching for the next optimal point.




Iteration No: 62 ended. Search finished for the next optimal point.
Time taken: 0.4577
Function value obtained: -0.1523
Current minimum: -0.1715
Iteration No: 63 started. Searching for the next optimal point.




Iteration No: 63 ended. Search finished for the next optimal point.
Time taken: 0.4842
Function value obtained: -0.1185
Current minimum: -0.1715
Iteration No: 64 started. Searching for the next optimal point.




Iteration No: 64 ended. Search finished for the next optimal point.
Time taken: 0.4229
Function value obtained: -0.1213
Current minimum: -0.1715
Iteration No: 65 started. Searching for the next optimal point.




Iteration No: 65 ended. Search finished for the next optimal point.
Time taken: 0.4384
Function value obtained: -0.1316
Current minimum: -0.1715
Iteration No: 66 started. Searching for the next optimal point.




Iteration No: 66 ended. Search finished for the next optimal point.
Time taken: 0.4932
Function value obtained: -0.1094
Current minimum: -0.1715
Iteration No: 67 started. Searching for the next optimal point.




Iteration No: 67 ended. Search finished for the next optimal point.
Time taken: 0.4775
Function value obtained: -0.1092
Current minimum: -0.1715
Iteration No: 68 started. Searching for the next optimal point.




Iteration No: 68 ended. Search finished for the next optimal point.
Time taken: 0.5615
Function value obtained: -0.1521
Current minimum: -0.1715
Iteration No: 69 started. Searching for the next optimal point.




Iteration No: 69 ended. Search finished for the next optimal point.
Time taken: 0.5290
Function value obtained: -0.1807
Current minimum: -0.1807
Iteration No: 70 started. Searching for the next optimal point.




Iteration No: 70 ended. Search finished for the next optimal point.
Time taken: 0.4324
Function value obtained: -0.1344
Current minimum: -0.1807
Iteration No: 71 started. Searching for the next optimal point.




Iteration No: 71 ended. Search finished for the next optimal point.
Time taken: 0.5074
Function value obtained: -0.1717
Current minimum: -0.1807
Iteration No: 72 started. Searching for the next optimal point.




Iteration No: 72 ended. Search finished for the next optimal point.
Time taken: 0.7181
Function value obtained: -0.1486
Current minimum: -0.1807
Iteration No: 73 started. Searching for the next optimal point.




Iteration No: 73 ended. Search finished for the next optimal point.
Time taken: 0.5471
Function value obtained: -0.1390
Current minimum: -0.1807
Iteration No: 74 started. Searching for the next optimal point.




Iteration No: 74 ended. Search finished for the next optimal point.
Time taken: 0.5299
Function value obtained: -0.1506
Current minimum: -0.1807
Iteration No: 75 started. Searching for the next optimal point.




Iteration No: 75 ended. Search finished for the next optimal point.
Time taken: 0.5964
Function value obtained: -0.1532
Current minimum: -0.1807
Iteration No: 76 started. Searching for the next optimal point.




Iteration No: 76 ended. Search finished for the next optimal point.
Time taken: 0.5096
Function value obtained: -0.1351
Current minimum: -0.1807
Iteration No: 77 started. Searching for the next optimal point.




Iteration No: 77 ended. Search finished for the next optimal point.
Time taken: 0.5705
Function value obtained: -0.1922
Current minimum: -0.1922
Iteration No: 78 started. Searching for the next optimal point.




Iteration No: 78 ended. Search finished for the next optimal point.
Time taken: 0.5929
Function value obtained: -0.1459
Current minimum: -0.1922
Iteration No: 79 started. Searching for the next optimal point.




Iteration No: 79 ended. Search finished for the next optimal point.
Time taken: 0.5272
Function value obtained: -0.1562
Current minimum: -0.1922
Iteration No: 80 started. Searching for the next optimal point.




Iteration No: 80 ended. Search finished for the next optimal point.
Time taken: 0.5339
Function value obtained: -0.1576
Current minimum: -0.1922
Iteration No: 81 started. Searching for the next optimal point.




Iteration No: 81 ended. Search finished for the next optimal point.
Time taken: 0.5357
Function value obtained: -0.1535
Current minimum: -0.1922
Iteration No: 82 started. Searching for the next optimal point.




Iteration No: 82 ended. Search finished for the next optimal point.
Time taken: 0.5624
Function value obtained: -0.1484
Current minimum: -0.1922
Iteration No: 83 started. Searching for the next optimal point.




Iteration No: 83 ended. Search finished for the next optimal point.
Time taken: 0.5555
Function value obtained: -0.2120
Current minimum: -0.2120
Iteration No: 84 started. Searching for the next optimal point.




Iteration No: 84 ended. Search finished for the next optimal point.
Time taken: 0.5774
Function value obtained: -0.2181
Current minimum: -0.2181
Iteration No: 85 started. Searching for the next optimal point.




Iteration No: 85 ended. Search finished for the next optimal point.
Time taken: 0.6085
Function value obtained: -0.1785
Current minimum: -0.2181
Iteration No: 86 started. Searching for the next optimal point.




Iteration No: 86 ended. Search finished for the next optimal point.
Time taken: 0.6203
Function value obtained: -0.1962
Current minimum: -0.2181
Iteration No: 87 started. Searching for the next optimal point.




Iteration No: 87 ended. Search finished for the next optimal point.
Time taken: 0.7283
Function value obtained: -0.1797
Current minimum: -0.2181
Iteration No: 88 started. Searching for the next optimal point.




Iteration No: 88 ended. Search finished for the next optimal point.
Time taken: 0.5625
Function value obtained: -0.1962
Current minimum: -0.2181
Iteration No: 89 started. Searching for the next optimal point.




Iteration No: 89 ended. Search finished for the next optimal point.
Time taken: 0.6556
Function value obtained: -0.1983
Current minimum: -0.2181
Iteration No: 90 started. Searching for the next optimal point.




Iteration No: 90 ended. Search finished for the next optimal point.
Time taken: 0.5720
Function value obtained: -0.1842
Current minimum: -0.2181
Iteration No: 91 started. Searching for the next optimal point.




Iteration No: 91 ended. Search finished for the next optimal point.
Time taken: 0.6869
Function value obtained: -0.1964
Current minimum: -0.2181
Iteration No: 92 started. Searching for the next optimal point.




Iteration No: 92 ended. Search finished for the next optimal point.
Time taken: 0.6662
Function value obtained: -0.1182
Current minimum: -0.2181
Iteration No: 93 started. Searching for the next optimal point.




Iteration No: 93 ended. Search finished for the next optimal point.
Time taken: 0.6389
Function value obtained: -0.1763
Current minimum: -0.2181
Iteration No: 94 started. Searching for the next optimal point.




Iteration No: 94 ended. Search finished for the next optimal point.
Time taken: 0.6765
Function value obtained: -0.1284
Current minimum: -0.2181
Iteration No: 95 started. Searching for the next optimal point.




Iteration No: 95 ended. Search finished for the next optimal point.
Time taken: 0.6413
Function value obtained: -0.1940
Current minimum: -0.2181
Iteration No: 96 started. Searching for the next optimal point.




Iteration No: 96 ended. Search finished for the next optimal point.
Time taken: 0.5735
Function value obtained: -0.1917
Current minimum: -0.2181
Iteration No: 97 started. Searching for the next optimal point.




Iteration No: 97 ended. Search finished for the next optimal point.
Time taken: 0.6438
Function value obtained: -0.1581
Current minimum: -0.2181
Iteration No: 98 started. Searching for the next optimal point.




Iteration No: 98 ended. Search finished for the next optimal point.
Time taken: 0.6146
Function value obtained: -0.1698
Current minimum: -0.2181
Iteration No: 99 started. Searching for the next optimal point.




Iteration No: 99 ended. Search finished for the next optimal point.
Time taken: 0.5952
Function value obtained: -0.2282
Current minimum: -0.2282
Iteration No: 100 started. Searching for the next optimal point.




Iteration No: 100 ended. Search finished for the next optimal point.
Time taken: 0.5649
Function value obtained: -0.2024
Current minimum: -0.2282


In [280]:
# Refit with the best point found by the search and keep the validation scores,
# the model and its vectorizer. Note that this rebinds the notebook-level
# `title_vec` to the vectorizer built inside tune_lgbm.
p_lgbm, mdl_lgbm, title_vec = tune_lgbm(res.x, tune=False)

0.22817943310199457 0.6058854750250634




    0.19202825084877856 0.6055021525033909 - 1,2 n_gram
    0.22817943310199457 0.6058854750250634 - 1,3
    0.230251348614118   0.6037624579819543 - 1,4
    0.22778106565523465 0.6025830040691159 - 1,5

# Logistic Regression

In [194]:
# Work on CSR copies so the stacked matrices used by the other models stay untouched.
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

# StandardScaler on the first two columns was tried and scored worse than MaxAbsScaler:
#scaler = StandardScaler()
#Xtrain_wtitle2[:, :2] = scaler.fit_transform(Xtrain_wtitle2[:, :2].todense())
#Xval_wtitle2[:, :2] = scaler.transform(Xval_wtitle2[:, :2].todense())

# MaxAbsScaler can be applied to the whole sparse matrix at once, which is
# simpler than scaling selected columns.
scaler = MaxAbsScaler()
Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
Xval_wtitle2 = scaler.transform(Xval_wtitle2)

mdl = LogisticRegression(
    penalty='l2',
    C=0.3,
    random_state=0,
    n_jobs=6,
)
mdl.fit(Xtrain_wtitle2, ytrain)

# Score for each validation row: second probability column (index 1).
p_lr = mdl.predict_proba(Xval_wtitle2)[:, 1]

# Displayed as (average precision, ROC AUC) on the validation set.
average_precision_score(yval, p_lr), roc_auc_score(yval, p_lr)

(0.22125869093931344, 0.6827858701421242)

### Metrics Logistic Regression
    ap -> 0.20739038717036196 roc -> 0.662027481276169 -> StandardScaler()
    ap -> 0.2052824213278625  roc -> 0.686530636315386 -> MaxAbsScaler()
    ap -> 0.2150545436183453  roc -> 0.687031904228342 -> MaxAbsScaler() params(c=0.5)
    ap -> 0.2212586909393134  roc -> 0.682785870142124 -> MaxAbsScaler() params(c=0.3)

# Pipeline

In [195]:
# The pipeline owns the scaling step, so it is fitted on the *unscaled* stacked
# features. Fitting it on the pre-scaled Xtrain_wtitle2 would make its
# MaxAbsScaler learn scale factors of 1, and raw inputs would then reach the
# logistic regression unscaled when the pipeline is used on new data.
lr_pipeline = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(C=0.3, penalty='l2', n_jobs=6, random_state=0),
)
lr_pipeline.fit(csr_matrix(Xtrain_wtitle), ytrain)

# Score for each validation row: second probability column (index 1).
p_lr = lr_pipeline.predict_proba(csr_matrix(Xval_wtitle))[:, 1]

# Displayed as (average precision, ROC AUC) on the validation set.
average_precision_score(yval, p_lr), roc_auc_score(yval, p_lr)

(0.22125869093931344, 0.6827858701421242)

# Metrics Report

    Random Forest
        ap -> 0.19423876065453383 roc -> 0.677802677360382 -> min_df=2, ngram_range=(1,1)
        ap -> 0.20385285274265763 roc -> 0.680721825794657 -> min_df=1, ngram_range=(1,2)
        ap -> 0.21799797498952303 roc -> 0.683242908533349 -> min_df=2, ngram_range=(1,2)
        ap -> 0.2284201947891743  roc -> 0.692678539836055 -> min_df=2, ngram_range=(1,3)

    LightGBM
        ap -> 0.17930936519845178 roc -> 0.6177389868490889 -> no params
        ap -> 0.22817943310199457 roc -> 0.6058854750250634 -> [0.06584043384269299, 4, 20, 
                                                                0.8668305344607656, 0.07301246723740631,
                                                                822, 5, 3]

    LogisticRegression
        ap -> 0.21516167484835547 roc -> 0.6830807336203338 -> no params
        ap -> 0.2212586909393134  roc -> 0.682785870142124  -> MaxAbsScaler() params(c=0.3)

# Ensemble

    LR = (0.22125869093931344, 0.6827858701421242)
    lgbm = (0.22817943310199457, 0.6058854750250634)
    RF = (0.2284201947891743, 0.6926785398360559)

In [260]:
# Unweighted ensemble: element-wise sum of the three models' validation scores.
p = p_rf + p_lgbm + p_lr
# Displayed as (average precision, ROC AUC) on the validation set.
average_precision_score(yval, p), roc_auc_score(yval, p)

In [284]:
# Pairwise correlation between the three models' validation scores.
# Models with lower correlation are better ensemble candidates: they reach
# similar performance through different solutions. To keep deployment simple,
# the ensemble that follows uses only RF and LGBM.
preds = pd.DataFrame({'RF': p_rf, 'LR': p_lr, 'LGBM': p_lgbm})
preds.corr()

Unnamed: 0,RF,LR,LGBM
RF,1.0,0.812706,0.377971
LR,0.812706,1.0,0.481204
LGBM,0.377971,0.481204,1.0


In [273]:
# Weighted RF + LGBM blend; the weights are the values tried in this run.
p = 0.1 * p_rf + 0.9 * p_lgbm
# Displayed as (average precision, ROC AUC) on the validation set.
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.2398470587807427, 0.6143480568496785)

    ngram = 1,3
    lr + rf + lgbm -> (0.2583693577701163,  0.6576045291030252)
         rf + lgbm -> (0.24187724466569893, 0.6233708792828919) 0.2 + 0.8
         rf + lgbm -> (0.25879704689816807, 0.6321283245857168) 0.3 + 0.7
         rf + lgbm -> (0.25381192004628994, 0.6567199386683965) 0.4 + 0.6
         rf + lgbm -> (0.2687715019710023,  0.6479624933655718) 0.5 + 0.5  <--- best
         rf + lgbm -> (0.25381192004628994, 0.6567199386683965) 0.6 + 0.4
         rf + lgbm -> (0.24708998095614865, 0.6670106740579111) 0.7 + 0.3
         rf + lgbm -> (0.236449556145549,   0.6750899333608539) 0.8 + 0.2
         rf + lgbm -> (0.23366883200760735, 0.6845550510113818) 0.9 + 0.1

# Save Models

In [285]:
import joblib as jb

In [286]:
# Persist the fitted artifacts with joblib (file names carry the date stamp).
# NOTE(review): `title_vec` at this point is the vectorizer returned by
# tune_lgbm(res.x, tune=False), i.e. the one mdl_lgbm was trained with.
# mdl_rf appears to have been fitted on features from the earlier
# TfidfVectorizer(min_df=2, ngram_range=(1,3)) - confirm the RF model gets a
# matching vectorizer at inference time.
jb.dump(mdl_lgbm, "lgbm_20200611.pkl.z")
jb.dump(mdl_rf, "random_forest_20200611.pkl.z")
jb.dump(title_vec, "title_vectorizer_20200611.pkl.z")

['title_vectorizer_20200611.pkl.z']