In [371]:
# Import libraries and set desired options
import pickle
from pathlib2 import Path
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
%config InlineBackend.figure_format = 'retina'

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row. It is converted with ``np.asarray`` so that
        plain lists and pandas Series are accepted as well as numpy arrays.
    out_file : str, path or writable buffer
        Destination handed to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the index column.

    The rows are numbered 1..n in the index column.
    """
    # A plain list has no ``.shape``; normalise to an ndarray first.
    predicted_labels = np.asarray(predicted_labels)
    n_rows = predicted_labels.shape[0]
    predicted_df = pd.DataFrame(predicted_labels,
                                index=np.arange(1, n_rows + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
    """Hold-out ROC AUC of a logistic regression.

    The first ``ratio`` share of the rows (in their given order) is used for
    fitting, the remaining rows for validation.

    Parameters
    ----------
    X : 2-D indexable feature matrix (must support ``X[a:b, :]``)
    y : 1-D label array aligned with the rows of ``X``
    C : inverse regularisation strength passed to ``LogisticRegression``
    seed : ``random_state`` of the classifier
    ratio : fraction of rows used for training

    Returns
    -------
    The ROC AUC computed on the validation rows.
    """
    n_fit = int(round(X.shape[0] * ratio))

    # Leading rows train the model, trailing rows validate it
    X_fit, y_fit = X[:n_fit, :], y[:n_fit]
    X_val, y_val = X[n_fit:, :], y[n_fit:]

    model = LogisticRegression(C=C, random_state=seed, solver='liblinear')
    model.fit(X_fit, y_fit)

    # Probability of the positive class for every validation row
    val_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, val_proba)

In [52]:
def time_cross_val(X_train, y_train, estimator, n_splits=10):
    """Cross-validate ``estimator`` with ROC AUC on ``TimeSeriesSplit`` folds.

    Parameters
    ----------
    X_train, y_train : training features and labels
    estimator : the scikit-learn estimator to evaluate
    n_splits : number of ``TimeSeriesSplit`` folds

    Returns
    -------
    Array with one ROC AUC score per fold (folds are evaluated sequentially).
    """
    folds = TimeSeriesSplit(n_splits=n_splits)
    return cross_val_score(
        estimator,
        X_train,
        y_train,
        scoring='roc_auc',
        cv=folds,
        n_jobs=1,
    )

In [50]:
def new_features(df_in):
    """Build time-of-day and start-month features from the ``time1`` column.

    Returns a new DataFrame with columns ``morning``, ``day``, ``evening`` and
    ``night`` (1 when the start hour lies in the inclusive range, -1
    otherwise) and ``start_month`` (``100 * year + month``, standardised with
    a ``StandardScaler`` fitted on the rows passed in).
    """
    start = df_in['time1']

    # (column name, first hour, last hour) - inclusive bounds, in output order
    day_parts = (
        ('morning', 7, 11),
        ('day', 12, 18),
        ('evening', 19, 23),
        ('night', 0, 6),
    )

    features = pd.DataFrame()
    for name, first_hour, last_hour in day_parts:
        # Bounds are bound as defaults so each lambda keeps its own range
        features[name] = start.apply(
            lambda ts, lo=first_hour, hi=last_hour: 1 if lo <= ts.hour <= hi else -1)

    # Year and month packed into one number, e.g. 201402, then standardised
    year_month = start.apply(lambda ts: 100 * ts.year + ts.month)
    features['start_month'] = year_month.astype('float64')
    features['start_month'] = StandardScaler().fit_transform(features[['start_month']])

    return features

In [56]:
def add_feature_and_df_split(base_sparce_df, new_feature_df, split:int):
    """Append extra feature columns to a sparse matrix and split it by rows.

    Parameters
    ----------
    base_sparce_df : sparse matrix supporting ``[a:b, :]`` slicing
    new_feature_df : row-sliceable 2-D data (DataFrame or array) with the
        same number of rows as ``base_sparce_df``
    split : position of the first row that belongs to the second part

    Returns
    -------
    (X_train, X_test) : rows ``[0, split)`` and ``[split, end)`` of the
        column-wise concatenation, both in CSR format.
    """
    # hstack returns COO by default, which does not support row slicing;
    # ask for CSR so the results can be indexed like the input matrix.
    X_train = hstack([base_sparce_df[:split, :], new_feature_df[:split]], format='csr')
    X_test = hstack([base_sparce_df[split:, :], new_feature_df[split:]], format='csr')
    return X_train, X_test

In [480]:
def add_feature_and_df_split_NOT_SPARCE(base_sparce_df, new_feature_df, split:int):
    """Column-stack two row-sliceable tables and split the result by rows.

    Unlike ``add_feature_and_df_split`` the base table is sliced with
    ``[a:b]`` only, so it may be a DataFrame or a dense array.

    Parameters
    ----------
    base_sparce_df : row-sliceable 2-D data (dense or sparse)
    new_feature_df : row-sliceable 2-D data with the same number of rows
    split : position of the first row that belongs to the second part

    Returns
    -------
    (X_train, X_test) : rows ``[0, split)`` and ``[split, end)`` of the
        column-wise concatenation, both as CSR sparse matrices.
    """
    def _stack(rows):
        # scipy.sparse.hstack cannot reliably assemble a list made only of
        # dense blocks, so every block is converted to CSR first.
        blocks = [csr_matrix(base_sparce_df[rows]), csr_matrix(new_feature_df[rows])]
        return hstack(blocks, format='csr')

    X_train = _stack(slice(None, split))
    X_test = _stack(slice(split, None))
    return X_train, X_test

In [208]:
def delta_times(times_df):
    """Standardised gaps, in seconds, between consecutive visit timestamps.

    Parameters
    ----------
    times_df : DataFrame with datetime columns named ``time1`` .. ``timeN``,
        where N is the number of columns in the frame.

    Returns
    -------
    DataFrame with columns ``d_time1`` .. ``d_time{N-1}``; ``d_time{i}`` is
    ``time{i+1} - time{i}`` in seconds, scaled with a ``StandardScaler``
    fitted on that column. A gap with a missing timestamp on either side
    counts as 0 seconds before scaling.
    """
    delta_time_df = pd.DataFrame()
    count_of_colums = len(times_df.columns)
    for i in range(1, count_of_colums):
        column_name = f'd_time{i}'
        # Whole-column subtraction instead of a Python call per row
        gap_seconds = (times_df[f'time{i+1}'] - times_df[f'time{i}']).dt.total_seconds()
        # NaT on either side gives NaN; treat such gaps as 0 seconds
        delta_time_df[column_name] = gap_seconds.fillna(0)
        delta_time_df[column_name] = StandardScaler().fit_transform(delta_time_df[column_name].values.reshape(-1, 1))
    return delta_time_df

In [216]:
def count_of_top_alice_sites(sites, top_sites=(77, 80, 76, 29, 21, 81, 879, 22, 75, 82)):
    """Count how many entries of ``sites`` belong to ``top_sites``.

    Parameters
    ----------
    sites : iterable of site ids; repeated ids are counted every time
    top_sites : container of ids to look for. The default is the list that
        used to be hard-coded here (presumably Alice's most visited sites,
        judging by the function name).

    Returns
    -------
    int : number of matching entries (0 for an empty ``sites``).
    """
    return sum(1 for site in sites if site in top_sites)

In [6]:
# Folder holding the competition files - change this path for your machine
PATH_TO_DATA = Path(r'D:\Programming\DS\mlcourse\course\jupyter_english\assignments_fall2019\Alice')

# The ten visit-timestamp columns, parsed as datetimes while reading
times = [f'time{i}' for i in range(1, 11)]

# Training sessions, ordered by the time of their first visit
train_df = pd.read_csv(
    PATH_TO_DATA / 'train_sessions.csv',
    parse_dates=times,
    index_col='session_id',
).sort_values(by='time1')

# Test sessions, kept in file order
test_df = pd.read_csv(
    PATH_TO_DATA / 'test_sessions.csv',
    parse_dates=times,
    index_col='session_id',
)

In [18]:
# Number of training rows: rows of full_df from this position on are test rows
idx_split = len(train_df)

# Train (without the label column) stacked on top of test
full_df = pd.concat([train_df.drop(columns='target'), test_df])

In [19]:
# Site-id columns: missing visits become 0, ids are stored as unsigned 16-bit integers
sites = [f'site{i}' for i in range(1, 11)]
full_df[sites] = full_df[sites].fillna(0).astype(np.uint16)

# Load the pickled website dictionary (site name -> site id)
# NOTE(review): pickle.load can run arbitrary code - only open files you trust
with open(PATH_TO_DATA / 'site_dic.pkl', "rb") as input_file:
    site_dict = pickle.load(input_file)

# Lookup frame built from the dictionary: index = site id, column 'site' = site name
sites_dict = pd.DataFrame({'site': list(site_dict)},
                          index=list(site_dict.values()))

In [20]:
# Inverse mapping: site id -> site name
id2site = dict(zip(site_dict.values(), site_dict.keys()))
# Id 0 is the fill value for missing visits; treat it as an "unknown" site
id2site[0] = 'unknown'

In [25]:
# One whitespace-separated string of site names per session (train rows first, then test)
full_sites = [
    ' '.join(id2site[i] for i in session_ids)
    for session_ids in full_df[sites].fillna(0).astype('int').values
]

In [28]:
# Tokenise sessions on whitespace only, so that a site name such as
# 'mail.google.com' stays one token instead of being split at the dots
# into 'mail', 'google' and 'com'
vectorizer_params = dict(
    ngram_range=(1, 5),
    max_features=50000,
    tokenizer=lambda s: s.split(),
)

# Training labels as a plain integer array
y_train = train_df['target'].astype('int').values

# TF-IDF over the site-name strings of all sessions (train and test together)
vectorizer = TfidfVectorizer(**vectorizer_params)
full_sites_vect = vectorizer.fit_transform(full_sites)

In [29]:
# Keep only the time1..time10 timestamp columns; later cells derive
# time-based features (e.g. the session length) from them
full_times = full_df[times]

In [210]:
# Preview of the integer-encoded site columns (0 is the fill value for a missing visit)
full_df[sites]

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947
...,...,...,...,...,...,...,...,...,...,...
82793,812,1039,676,0,0,0,0,0,0,0
82794,300,302,302,300,300,1222,302,1218,1221,1216
82795,29,33,35,22,37,6779,30,21,23,6780
82796,5828,23,21,804,21,3350,23,894,21,961


In [354]:
# Scratch frame collecting intermediate time-based features before they are
# copied into the final feature table
temp_df = pd.DataFrame()

In [355]:
# Weekday of the session start, stored as float so that the indicator columns
# created by get_dummies are named 'day_of_week_0.0' ... 'day_of_week_6.0'
temp_df['day_of_week'] = full_df['time1'].apply(lambda start: start.dayofweek).astype('float64')
temp_df = pd.get_dummies(temp_df, columns=['day_of_week'])

# Names of the seven indicator columns, used to copy them into the feature table
week_days = ['day_of_week_{}.0'.format(day) for day in range(7)]

In [356]:
# get_dummies replaced 'day_of_week' with indicator columns, so add the raw
# weekday number back as a column of its own
temp_df['day_of_week'] = full_df['time1'].apply(lambda ts: ts.dayofweek).astype('float64')

In [357]:
# Session duration: latest minus earliest timestamp of each row, cast to an
# integer via timedelta64[ms]
# NOTE(review): what astype('timedelta64[ms]') returns differs between pandas
# versions - confirm the resulting unit; the scaling below removes it anyway
temp_df['session_lenth'] = (full_times.max(axis=1) - full_times.min(axis=1)).astype('timedelta64[ms]').astype(int)
# Standardise the duration (StandardScaler fitted on train and test rows together)
temp_df['session_lenth'] = StandardScaler().fit_transform(temp_df['session_lenth'].values.reshape(-1, 1))

In [358]:
#temp_time_df = delta_times(full_times)

In [380]:
# Start from the time-of-day / start-month features of every session
new_features_df = new_features(full_df)
# (disabled) raw weekday number; the indicator columns below are used instead
# new_features_df['day_of_week'] = temp_df['day_of_week']
# Copy the weekday indicators and the scaled session duration from temp_df
# NOTE(review): these assignments align on the session_id index, which both
# frames take from full_df - confirm it if either frame is ever re-ordered
new_features_df[week_days] = temp_df[week_days]
new_features_df['session_lenth'] = temp_df['session_lenth']

In [490]:
from sklearn.decomposition import TruncatedSVD
# 10-component truncated SVD with a fixed seed; fitted on the TF-IDF site matrix in a later cell
TSVD = TruncatedSVD(n_components=10, n_iter=7, random_state=17)

In [491]:
# Reduce the TF-IDF matrix to its 10 SVD components; the result gets full_df's
# index so it can be concatenated column-wise with the other features
new_full_matrix = pd.DataFrame(TSVD.fit_transform(full_sites_vect), index=full_df.index)
# Display the components (columns are labelled 0..9)
new_full_matrix

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,0.944698,-0.007223,-0.004987,-0.030575,-0.000096,-0.006571,-0.006559,0.003026,-0.009081,-0.002674
54843,0.767276,-0.003518,-0.002069,-0.004090,0.000670,-0.008406,-0.007788,0.010437,-0.046411,-0.009691
77292,0.001919,0.000050,0.000146,0.002087,0.000008,-0.000150,-0.000105,0.000463,-0.002150,-0.000697
114021,0.000336,0.000022,0.000125,0.001612,-0.000026,-0.000143,-0.000100,0.000239,-0.001578,-0.000519
146670,0.000221,0.000015,0.000084,0.001079,-0.000017,-0.000087,-0.000055,0.000164,-0.001102,-0.000364
...,...,...,...,...,...,...,...,...,...,...
82793,0.871045,-0.007434,-0.004210,-0.021953,-0.000778,-0.004538,-0.004277,0.003835,-0.025910,-0.016115
82794,0.003921,0.000275,0.001064,0.013208,-0.000112,0.002379,0.002504,0.002017,-0.066283,0.137178
82795,0.005503,0.003462,0.013496,0.162772,-0.003778,-0.011102,-0.008023,0.020745,-0.112064,-0.057624
82796,0.008590,0.006189,0.020552,0.234021,-0.005364,-0.027294,-0.024536,-0.031294,0.051672,0.020494


In [494]:
# Append the SVD components to the hand-made features. SVD columns left over
# from an earlier run of this cell are dropped first, so re-running the cell
# does not stack duplicate columns onto new_features_df.
new_features_df = pd.concat(
    [new_features_df.drop(columns=new_full_matrix.columns, errors='ignore'),
     new_full_matrix],
    axis=1)

In [495]:
# Stack the TF-IDF matrix with the extra features, then cut at idx_split:
# rows before it are training sessions, the rest are test sessions
X_train, X_test = add_feature_and_df_split(full_sites_vect, new_features_df, idx_split)

In [496]:
# Logistic-regression baseline; the same object is reused by the ensemble and
# the C grid search further down
logit = LogisticRegression(C=1.67, random_state=17, solver='liblinear')

In [497]:
%%time
# ROC AUC of the logistic regression on the time-ordered folds (10 by default)
cv_scores = time_cross_val(X_train, y_train, logit)
# Show the per-fold scores together with their mean
cv_scores, cv_scores.mean()

Wall time: 46.6 s


(array([0.81890012, 0.87214036, 0.87288527, 0.97610829, 0.94116934,
        0.97318594, 0.92599188, 0.9601468 , 0.78188639, 0.96845756]),
 0.9090871947539227)

In [484]:
%%time
from sklearn import svm
# Linear SVM evaluated on the same folds for comparison with the logistic regression
SVM = svm.LinearSVC(random_state=17)
cv_scores = time_cross_val(X_train, y_train, SVM)
# Show the per-fold scores together with their mean
cv_scores, cv_scores.mean()



(array([0.79364885, 0.86104848, 0.8019889 , 0.93895836, 0.91667376,
        0.95841077, 0.87097843, 0.95049107, 0.49201354, 0.9164637 ]),
 0.8500675869114229)

In [486]:
# from sklearn.ensemble import RandomForestClassifier
# RF = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=17)
# cv_scores = time_cross_val(X_train, y_train, RF)
# cv_scores, cv_scores.mean()

In [None]:
%%time
from sklearn import linear_model
# Logistic loss trained with SGD; loss='log' gives the model predict_proba,
# which the soft-voting ensemble below relies on
SGDC = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, loss='log', random_state=17, alpha=0.0001, penalty='l2')
cv_scores = time_cross_val(X_train, y_train, SGDC)
# Show the per-fold scores together with their mean
cv_scores, cv_scores.mean()

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# `RF` is not created by any active cell (its defining cell is commented out),
# so build it here with the settings from that cell; otherwise this cell
# raises NameError on a fresh kernel.
RF = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=17)

# Soft-voting ensemble: averages the class probabilities of the three models.
# The names 'lr', 'rf' and 'sgd' are the prefixes used in the grid-search parameters.
clf1 = logit
clf2 = RF
clf3 = SGDC
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('sgd', clf3)], voting='soft')

In [None]:
%%time
# ROC AUC of the voting ensemble on the time-ordered folds
cv_scores = time_cross_val(X_train, y_train, eclf)
# Show the per-fold scores together with their mean
cv_scores, cv_scores.mean()

In [445]:
# Candidate values for the logistic regression's C
c_values = [0.1, 1.17, 1.67, 3.35, 4.5]
time_split = TimeSeriesSplit(n_splits=5)
# Keys are '<estimator name>__<parameter>' for the members of the voting ensemble.
# 5 * 2 * 2 * 2 * 3 = 120 combinations, i.e. 600 fits with 5 folds.
params = {'lr__C': c_values, 'lr__class_weight': [None,'balanced'],
          'rf__max_depth': [5, 20],
          'sgd__penalty': ['l2', 'l1'], 'sgd__alpha': [0.0001, 0.001, 0.01]}

# Sequential search (n_jobs=1) scored by ROC AUC; verbose=10 logs every fit
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=time_split, scoring='roc_auc', n_jobs=1, verbose=10)

In [446]:
%%time
# Run the full search; the recorded run needed more than three hours for its 600 fits
grid.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.730, total=   2.8s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.904, total=   5.2s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.0s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.955, total=   7.4s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   15.3s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.958, total=  10.2s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   25.5s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.918, total=  12.4s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   37.9s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.674, total=   2.9s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   40.8s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.900, total=   5.3s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   46.1s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.955, total=   7.5s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   53.6s remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.959, total=  10.4s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.1min remaining:    0.0s


[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.914, total=  12.6s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.670, total=   2.8s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.897, total=   5.2s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.947, total=   7.3s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.955, total=  10.3s
[CV] lr__C=0.1, lr__class_weight=None, r

[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.960, total=  33.6s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.935, total=  41.5s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.660, total=   9.3s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.915, total=  17.9s
[CV] lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.937, total=  24.7s
[CV] lr__C=0.1, lr__class_weight=None,

[CV]  lr__C=0.1, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1, score=0.894, total=   5.6s
[CV] lr__C=0.1, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=0.1, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1, score=0.958, total=   8.1s
[CV] lr__C=0.1, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=0.1, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1, score=0.958, total=  11.5s
[CV] lr__C=0.1, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=0.1, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1, score=0.910, total=  13.8s
[CV] lr__C=0.1, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=0.1, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2, score=0.852, total=   9.5s
[CV] lr_

[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.931, total=  14.5s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.700, total=   3.2s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.911, total=   6.3s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.962, total=   9.0s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.961, total=  12.6s
[CV] lr__C=1.17, lr__cl

[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2, score=0.956, total=  37.6s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2, score=0.937, total=  44.7s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.674, total=   9.9s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.928, total=  19.5s
[CV] lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.961, total=  28.0s
[CV] lr__C=1.17, lr__cl

[CV]  lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2, score=0.928, total=   6.7s
[CV] lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2, score=0.952, total=   9.7s
[CV] lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2, score=0.950, total=  14.6s
[CV] lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2, score=0.925, total=  16.5s
[CV] lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=1.17, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1, score=0.743, total=   3.3s
[CV] 

[CV]  lr__C=1.17, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1, score=0.935, total=  45.8s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.771, total=   3.2s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.920, total=   6.0s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.961, total=   8.5s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l2, score=0.961, total=  12.4s
[CV] lr__C=1.67, lr_

[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1, score=0.960, total=  36.1s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1, score=0.938, total=  43.7s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2, score=0.711, total=   9.8s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2, score=0.935, total=  18.9s
[CV] lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l2, score=0.953, total=  26.3s
[CV] lr__C=1.67, lr_

[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1, score=0.922, total=   7.2s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1, score=0.958, total=  10.6s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1, score=0.958, total=  15.2s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1, score=0.931, total=  18.1s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l2, score=0.766, total=   3.4

[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.931, total=  47.3s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1, score=0.737, total=  10.0s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1, score=0.910, total=  19.9s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1, score=0.952, total=  28.3s
[CV] lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=1.67, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l1, score=0.955, total=  38

[CV]  lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2, score=0.960, total=  38.0s
[CV] lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2, score=0.940, total=  45.1s
[CV] lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1, score=0.699, total=  10.0s
[CV] lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1, score=0.904, total=  19.4s
[CV] lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=3.35, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l1, score=0.958, total=  26.6s
[CV] lr__C=3.3

[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.936, total=   7.4s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.953, total=  11.3s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.954, total=  15.1s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.926, total=  18.7s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l1, score=0.722, total=   3

[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.936, total=  49.7s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.748, total=  10.3s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.933, total=  21.1s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.946, total=  30.3s
[CV] lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2 
[CV]  lr__C=3.35, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.01, sgd__penalty=l2, score=0.944, total=  4

[CV]  lr__C=4.5, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1, score=0.958, total=  14.8s
[CV] lr__C=4.5, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1 
[CV]  lr__C=4.5, lr__class_weight=None, rf__max_depth=5, sgd__alpha=0.01, sgd__penalty=l1, score=0.944, total=  16.1s
[CV] lr__C=4.5, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=4.5, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2, score=0.771, total=  10.1s
[CV] lr__C=4.5, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=4.5, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2, score=0.914, total=  19.5s
[CV] lr__C=4.5, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2 
[CV]  lr__C=4.5, lr__class_weight=None, rf__max_depth=20, sgd__alpha=0.0001, sgd__penalty=l2, score=0.957, total=  28.2s
[CV] lr__C=4.5, lr__class_weight

[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.956, total=  12.3s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.957, total=  17.2s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.0001, sgd__penalty=l1, score=0.925, total=  20.2s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.755, total=   3.7s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=5, sgd__alpha=0.001, sgd__penalty=l2, score=0.938, total=   7.7s


[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.717, total=  10.7s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.923, total=  21.0s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.954, total=  30.7s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.956, total=  42.5s
[CV] lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1 
[CV]  lr__C=4.5, lr__class_weight=balanced, rf__max_depth=20, sgd__alpha=0.001, sgd__penalty=l1, score=0.936, total=  51

[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed: 190.6min finished


Wall time: 3h 11min 26s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
             error_score='raise-deprecating',
             estimator=VotingClassifier(estimators=[('lr',
                                                     LogisticRegression(C=1.668,
                                                                        class_weight=None,
                                                                        dual=False,
                                                                        fit_intercept=True,
                                                                        intercept_scaling=1,
                                                                        l1_ratio=None,
                                                                        max_iter=100,
                                                                        multi_class='warn',
                                                                        n_jobs=None,
                                      

In [448]:
# Best parameter combination and its mean cross-validated ROC AUC
grid.best_params_, grid.best_score_

({'lr__C': 0.1,
  'lr__class_weight': 'balanced',
  'rf__max_depth': 20,
  'sgd__alpha': 0.0001,
  'sgd__penalty': 'l2'},
 0.9168072110799572)

In [465]:
%%time
# Fit the ensemble on all training rows, using the parameters eclf currently
# holds (set_params is never called on it, so not the grid-search winners)
eclf.fit(X_train, y_train)

Wall time: 19min 15s


VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.67, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=17,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gi...
                                        

In [466]:
# Use the ensemble that GridSearchCV refit on all training rows with the best
# parameter combination (refit is left at its default of True). The previous
# `votting_best = eclf` submitted the untuned ensemble and ignored the search;
# this now mirrors how the logistic-regression submission picks its model.
votting_best = grid.best_estimator_
y_test = votting_best.predict_proba(X_test)[:, 1]

# Write the positive-class probabilities to a file that can be submitted
write_to_submission_file(y_test, 'baseline_8.csv')

In [365]:
# Fit the logistic regression on all training rows
logit.fit(X_train, y_train)

LogisticRegression(C=4.641, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [498]:
# Ten candidate C values, log-spaced between 0.01 and 100
c_values = np.logspace(-2, 2, 10)
time_split = TimeSeriesSplit(n_splits=10)

# Sequential search over C for the logistic regression, scored by ROC AUC on 10 folds
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1)

In [499]:
%%time
# Run the search: 10 candidates x 10 folds = 100 fits (about 10 minutes in the recorded run)
logit_grid_searcher.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 10.1min finished


Wall time: 10min 16s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.67, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_scor

In [500]:
# Best mean ROC AUC, the C that achieved it, and the refitted estimator
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_, logit_grid_searcher.best_estimator_

(0.9090834189957644,
 {'C': 1.6681005372000592},
 LogisticRegression(C=1.6681005372000592, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False))

In [501]:
# Model refitted by the grid search with the best C
logit_best = logit_grid_searcher.best_estimator_
# Probability of the positive class for every test session
y_test = logit_best.predict_proba(X_test)[:, 1]

# Write the predictions to a file that can be submitted
write_to_submission_file(y_test, 'baseline_9.csv')