**Возьмем `LinearSVC`, показавший лучшее качество на кросс-валидации в 1 части, и проверим его работу еще на 8 выборках для 10 пользователей (с разными сочетаниями параметров *session_length* и *window_size*). Поскольку тут уже вычислений побольше, мы не будем каждый раз заново подбирать параметр регуляризации `C`.**

**Определите функцию `model_assessment`, ее документация описана ниже. Обратите внимание на все детали. Например, на то, что разбиение выборки с `train_test_split` должно быть стратифицированным. Не теряйте нигде `random_state`.**

In [5]:
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
from time import time
import itertools
import os
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
import pickle
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer

In [33]:
def model_assessment(estimator, path_to_X_pickle, path_to_y_pickle, cv,
                     random_state=17, test_size=0.3, return_time=False):
    '''
    Estimate mean CV accuracy and hold-out score of an estimator on pickled data.

    (X, y) are loaded from path_to_X_pickle and path_to_y_pickle and split with a
    stratified train_test_split into a training part ((1 - test_size) share) and a
    validation part (test_size share). Cross-validated accuracy is computed on the
    training part; the hold-out score is computed on the validation part by a model
    fitted on the whole training part.

    The estimator passed in is left untouched: a clone of it is fitted for the
    hold-out score, so repeated calls on different datasets do not refit the
    caller's object.

    :param estimator: scikit-learn estimator; only its parameters are used
    :param path_to_X_pickle: path to pickled sparse X (instances and their features)
    :param path_to_y_pickle: path to pickled y (responses)
    :param cv: cross-validation as in cross_val_score (use StratifiedKFold here)
    :param random_state: random_state for train_test_split
    :param test_size: test_size for train_test_split
    :param return_time: if True, append the wall-clock duration of the call
        (in seconds) as a third element of the result

    :returns: (mean CV accuracy on (X_train, y_train), estimator.score on
        (X_valid, y_valid)); with return_time=True a third element holds the
        elapsed seconds.
    '''
    # Imported locally on purpose: the module-level name `time` is bound to a
    # function at the top of the file and rebound to the module further down,
    # so it cannot be relied on here.
    from time import perf_counter
    from sklearn.base import clone

    started = perf_counter()

    # Load data.
    # NOTE: pickle.load can execute arbitrary code - only use trusted files.
    with open(path_to_X_pickle, 'rb') as X_pkl:
        X = pickle.load(X_pkl)
    with open(path_to_y_pickle, 'rb') as y_pkl:
        y = pickle.load(y_pkl)

    # Stratified split so that class proportions match in both parts.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y)

    # Fit a fresh copy with the same hyper-parameters instead of mutating the
    # caller's estimator.
    holdout_model = clone(estimator)
    holdout_model.fit(X_train, y_train)

    # Cross-validated accuracy on the training part only.
    cv_scoring = cross_val_score(
        estimator,
        X_train,
        y_train,
        scoring='accuracy',
        cv=cv,
        n_jobs=-1)

    # Hold-out score on the part that was never used for fitting.
    test_score = holdout_model.score(X_valid, y_valid)

    if return_time:
        return np.mean(cv_scoring), test_score, perf_counter() - started
    return np.mean(cv_scoring), test_score

In [7]:
# 3-fold stratified cross-validation, shuffled with a fixed seed for reproducible folds.
skf = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=17,
)

In [8]:
# Change this to your own path to the data directory
PATH_TO_DATA = 'capstone_user_identification'

Test function

In [9]:
# Load the pickled sparse X for 10 users, then the matching pickled y.
with open(
    os.path.join(PATH_TO_DATA, 'X_sparse_10users.pkl'), 'rb'
) as X_sparse_10users_pkl:
    X_sparse_10users = pickle.load(X_sparse_10users_pkl)

with open(
    os.path.join(PATH_TO_DATA, 'y_10users.pkl'), 'rb'
) as y_10users_pkl:
    y_10users = pickle.load(y_10users_pkl)

In [10]:
# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_sparse_10users,
    y_10users,
    test_size=0.3,
    random_state=17,
    stratify=y_10users,
)

In [11]:
from sklearn.svm import LinearSVC

# Base linear SVM; C is tuned by the grid search, the seed keeps runs reproducible.
svm = LinearSVC(
    C=1,
    random_state=17,
)

In [12]:
%%time
# Candidate values of C: 30 evenly spaced points from 1e-3 to 1.
svm_params2 = {'C': np.linspace(1e-3, 1, 30)}

# Search over C with the shared stratified folds, keeping train scores as well.
svm_grid_searcher2 = GridSearchCV(
    svm,
    svm_params2,
    n_jobs=-1,
    cv=skf,
    return_train_score=True,
)
svm_grid_searcher2.fit(X_train, y_train)

CPU times: user 1.18 s, sys: 273 ms, total: 1.45 s
Wall time: 33 s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=17, shuffle=True),
             error_score=nan,
             estimator=LinearSVC(C=1, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=17, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([0.001     , 0.03544828...
       0.17324138, 0.20768966, 0.24213793, 0.27658621, 0.31103448,
       0.34548276, 0.37993103, 0.41437931, 0.44882759, 0.48327586,
       0.51772414, 0.55217241, 0.58662069, 0.62106897, 0.65551724,
       0.68996552, 0.72441379, 0.75886207, 0.79331034, 0.82775862,
       0.8622069 , 0.89665517, 0.93110345, 0.96555172, 1.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=Non

Test

In [13]:
# Sanity check of model_assessment on the default 10-user sample.
model_assessment(
    svm_grid_searcher2.best_estimator_,
    os.path.join(PATH_TO_DATA, 'X_sparse_10users.pkl'),
    os.path.join(PATH_TO_DATA, 'y_10users.pkl'),
    skf,
    random_state=17,
    test_size=0.3,
)

(0.7670206386611259, 0.7807537331121118)

Примените функцию model_assessment для лучшего алгоритма из предыдущей части (а именно, svm_grid_searcher2.best_estimator_) и 9 выборок с разными сочетаниями параметров session_length и window_size для 10 пользователей. Выведите в цикле параметры session_length и window_size, а также результат вывода функции model_assessment. Удобно сделать так, чтобы model_assessment возвращала третьим элементом время, за которое она выполнилась. На моем ноуте этот участок кода выполнился за 20 секунд. Но со 150 пользователями каждая итерация занимает уже несколько минут.

prepare

In [18]:
%%time
estimator = svm_grid_searcher2.best_estimator_

# Assess every (window_size, session_length) pair in which the window does not
# exceed the session length.
for window_size, session_length in itertools.product([10, 7, 5], [15, 10, 7, 5]):
    if window_size > session_length:
        continue

    path_to_X_pkl = os.path.join(
        PATH_TO_DATA, f"X_sparse_10users_s{session_length}_w{window_size}.pkl")
    path_to_y_pkl = os.path.join(
        PATH_TO_DATA, f"y_10users_s{session_length}_w{window_size}.pkl")

    r = model_assessment(
        estimator,
        path_to_X_pkl,
        path_to_y_pkl,
        skf,
        random_state=17,
        test_size=0.3,
    )
    print(f" {window_size}: {session_length} : {r} ")

 10: 15 : (0.8243252292702751, 0.8404835269021095) 
 10: 10 : (0.7670206386611259, 0.7807537331121118) 
 7: 15 : (0.8495024256089474, 0.8543222166915547) 
 7: 10 : (0.7983645917156946, 0.8073668491786958) 
 7: 7 : (0.754765400423003, 0.7617388418782147) 
 5: 15 : (0.8670355547005402, 0.8752963489805595) 
 5: 10 : (0.8177520250854086, 0.8245614035087719) 
 5: 7 : (0.772939529035208, 0.7853247984826932) 
 5: 5 : (0.7254849424351582, 0.7362494073020389) 
CPU times: user 10.3 s, sys: 88.8 ms, total: 10.4 s
Wall time: 18.9 s


In [20]:
def write_answer(i, answer):
    """Write ``str(answer)`` to the file ``task4_{i}_answer``, replacing any previous content."""
    target = f"task4_{i}_answer"
    with open(target, mode="w") as out:
        out.write(str(answer))

In [26]:
# Answer 5: the two scores printed for window_size=5, session_length=15, rounded to 3 decimals.
answer5 = " ".join(("0.867", "0.875"))
write_answer(5, answer5)

In [22]:
import time

In [24]:
%%time
estimator = svm_grid_searcher2.best_estimator_

# NOTE: `time` must be the module here (bound by `import time`), not the
# `time` function imported at the top of the file.
for window_size, session_length in ((5, 5), (7, 7), (10, 10)):
    start_time = time.time()

    path_to_X_pkl = os.path.join(
        PATH_TO_DATA, f"X_sparse_150users_s{session_length}_w{window_size}.pkl")
    path_to_y_pkl = os.path.join(
        PATH_TO_DATA, f"y_150users_s{session_length}_w{window_size}.pkl")

    r = model_assessment(
        estimator,
        path_to_X_pkl,
        path_to_y_pkl,
        skf,
        random_state=17,
        test_size=0.3,
    )
    print(f" {window_size}: {session_length} : {r} ")
    # Per-iteration wall-clock time, including loading the pickles.
    print("--- %s seconds ---" % (time.time() - start_time))

 5: 5 : (0.4083611011164474, 0.42171606560568453) 
--- 238.28687167167664 seconds ---
 7: 7 : (0.4366487102001489, 0.45295840855673264) 
--- 238.87375807762146 seconds ---
 10: 10 : (0.46307591254574465, 0.4836276942538802) 
--- 227.43375897407532 seconds ---
CPU times: user 6min 26s, sys: 336 ms, total: 6min 26s
Wall time: 11min 44s


In [29]:
# Rounded scores of the three 150-user runs printed above, in the order they were printed.
answer6 = " ".join(("0.408", "0.422", "0.437", "0.453", "0.463", "0.484"))
write_answer(6, answer6)

In [34]:
%%time

# Assess the tuned model on the default 150-user sample.
path_to_X_pkl = os.path.join(PATH_TO_DATA, "X_sparse_150users.pkl")
path_to_y_pkl = os.path.join(PATH_TO_DATA, "y_150users.pkl")

r = model_assessment(
    svm_grid_searcher2.best_estimator_,
    path_to_X_pkl,
    path_to_y_pkl,
    skf,
    random_state=17,
    test_size=0.3,
)
print(f"{r}")

(0.46307591254574465, 0.4836276942538802)
CPU times: user 1min 41s, sys: 144 ms, total: 1min 42s
Wall time: 3min 25s


In [35]:
# Overwrites the earlier answer 6 with the two rounded scores of the default 150-user run only.
answer6 = " ".join(("0.463", "0.484"))
write_answer(6, answer6)