# Libraries

In [1]:
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
from time import time
import itertools
import os
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
import pickle
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer


# Data

In [2]:
# Поменяйте на свой путь к данным
PATH_TO_DATA = 'capstone_user_identification'

In [3]:
with open(os.path.join(PATH_TO_DATA, 
         'X_sparse_10users.pkl'), 'rb') as X_sparse_10users_pkl:
    X_sparse_10users = pickle.load(X_sparse_10users_pkl)
with open(os.path.join(PATH_TO_DATA, 
                       'y_10users.pkl'), 'rb') as y_10users_pkl:
    y_10users = pickle.load(y_10users_pkl)

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X_sparse_10users, y_10users, 
                                                      test_size=0.3, 
                                                     random_state=17, stratify=y_10users)

stratification for cv

In [5]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)

# model #1

In [14]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [15]:
logit = LogisticRegression(random_state=17, n_jobs=-1)

logit.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [16]:
logit.score(X_train,y_train)

0.8753302174354806

In [17]:
cv_scoring = cross_val_score(logit, X_train, y_train, scoring = 'accuracy', cv = skf)

cv_scoring

array([0.76104846, 0.74824749, 0.77256098])

In [18]:
logit.score(X_valid,y_valid)

0.7767243422611994

# model #2

In [21]:
%%time
logit_c_values2 = np.linspace(0.1, 7, 20)

logit_grid_searcher2 = LogisticRegressionCV(
    Cs=logit_c_values2,
    random_state=17,
    n_jobs=-1,
    multi_class='multinomial',
    cv=skf)
logit_grid_searcher2.fit(X_train, y_train)

CPU times: user 7.4 s, sys: 432 ms, total: 7.83 s
Wall time: 40.4 s


LogisticRegressionCV(Cs=array([0.1       , 0.46315789, 0.82631579, 1.18947368, 1.55263158,
       1.91578947, 2.27894737, 2.64210526, 3.00526316, 3.36842105,
       3.73157895, 4.09473684, 4.45789474, 4.82105263, 5.18421053,
       5.54736842, 5.91052632, 6.27368421, 6.63684211, 7.        ]),
                     class_weight=None,
                     cv=StratifiedKFold(n_splits=3, random_state=17, shuffle=True),
                     dual=False, fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=None, max_iter=100, multi_class='multinomial',
                     n_jobs=-1, penalty='l2', random_state=17, refit=True,
                     scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [92]:
m = list()
for k, v  in logit_grid_searcher2.scores_.items():
    m.extend(v.tolist())
    
np.max(m)

0.774390243902439

In [94]:
logit_grid_searcher2.scores_

{31: array([[0.73483694, 0.75921975, 0.76165803, 0.76043889, 0.76135325,
         0.76074368, 0.75921975, 0.75891496, 0.75921975, 0.75952454,
         0.75830539, 0.75739104, 0.75708625, 0.75525754, 0.7561719 ,
         0.75525754, 0.75434319, 0.75464797, 0.75312405, 0.75251448],
        [0.72325511, 0.74702835, 0.74824749, 0.74885706, 0.74977141,
         0.74916184, 0.7479427 , 0.74763792, 0.74824749, 0.74641877,
         0.74733313, 0.74672356, 0.74459006, 0.74459006, 0.74367571,
         0.74215178, 0.74123743, 0.74062786, 0.73940872, 0.73849436],
        [0.74634146, 0.77134146, 0.77317073, 0.77256098, 0.77378049,
         0.77439024, 0.7722561 , 0.77164634, 0.77195122, 0.7722561 ,
         0.7722561 , 0.77164634, 0.77164634, 0.77164634, 0.77134146,
         0.77042683, 0.77073171, 0.77012195, 0.76981707, 0.76829268]]),
 33: array([[0.73483694, 0.75921975, 0.76165803, 0.76043889, 0.76135325,
         0.76074368, 0.75921975, 0.75891496, 0.75921975, 0.75952454,
         0.75830539, 

In [37]:
logit_grid_searcher2.C_

array([1.55263158, 1.55263158, 1.55263158, 1.55263158, 1.55263158,
       1.55263158, 1.55263158, 1.55263158, 1.55263158, 1.55263158])

In [62]:
cs = []
for i in range(len(logit_grid_searcher2.Cs_)):
    c = logit_grid_searcher2.Cs_[i]
    scores = list()
    for k,v in logit_grid_searcher2.scores_.items():
        scores.extend(v[:,i])
    cs.append((c, np.max(scores)))
cs = np.array(cs)
cs


array([[0.1       , 0.74634146],
       [0.46315789, 0.77134146],
       [0.82631579, 0.77317073],
       [1.18947368, 0.77256098],
       [1.55263158, 0.77378049],
       [1.91578947, 0.77439024],
       [2.27894737, 0.7722561 ],
       [2.64210526, 0.77164634],
       [3.00526316, 0.77195122],
       [3.36842105, 0.7722561 ],
       [3.73157895, 0.7722561 ],
       [4.09473684, 0.77164634],
       [4.45789474, 0.77164634],
       [4.82105263, 0.77164634],
       [5.18421053, 0.77134146],
       [5.54736842, 0.77042683],
       [5.91052632, 0.77073171],
       [6.27368421, 0.77012195],
       [6.63684211, 0.76981707],
       [7.        , 0.76829268]])

In [63]:
c = sorted(cs, key = lambda x : x[1])[-1][0]
c

1.9157894736842107

# model #3

In [30]:
with open(os.path.join(PATH_TO_DATA, 'X_sparse_150users.pkl'), 'rb') as X_sparse_150users_pkl:
     X_sparse_150users = pickle.load(X_sparse_150users_pkl)
with open(os.path.join(PATH_TO_DATA, 'y_150users.pkl'), 'rb') as y_150users_pkl:
    y_150users = pickle.load(y_150users_pkl)

In [31]:
X_train_150, X_valid_150, y_train_150, y_valid_150 = train_test_split(X_sparse_150users, 
                                                                      y_150users, test_size=0.3, 
                                                     random_state=17, stratify=y_150users)

In [32]:
%%time
logit_cv_150users = LogisticRegressionCV(
    Cs=logit_c_values2,
    multi_class='ovr',
    random_state=17,
    n_jobs=-1,
    cv=skf)
logit_cv_150users.fit(X_train_150, y_train_150)

CPU times: user 15min 48s, sys: 50.4 s, total: 16min 39s
Wall time: 56min 10s


LogisticRegressionCV(Cs=array([0.1       , 0.46315789, 0.82631579, 1.18947368, 1.55263158,
       1.91578947, 2.27894737, 2.64210526, 3.00526316, 3.36842105,
       3.73157895, 4.09473684, 4.45789474, 4.82105263, 5.18421053,
       5.54736842, 5.91052632, 6.27368421, 6.63684211, 7.        ]),
                     class_weight=None,
                     cv=StratifiedKFold(n_splits=3, random_state=17, shuffle=True),
                     dual=False, fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=None, max_iter=100, multi_class='ovr', n_jobs=-1,
                     penalty='l2', random_state=17, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [49]:
logit_cv_150users.scores_

{6: array([[0.99521441, 0.99568359, 0.9958087 , 0.99590254, 0.99596509,
         0.99596509, 0.99596509, 0.99596509, 0.99593382, 0.99596509,
         0.99596509, 0.99593382, 0.99596509, 0.99596509, 0.99590254,
         0.99590254, 0.99590254, 0.99590254, 0.99590254, 0.99590254],
        [0.99577742, 0.99640299, 0.99649683, 0.99643427, 0.99643427,
         0.99646555, 0.99646555, 0.99649683, 0.99649683, 0.99649683,
         0.99655938, 0.99649683, 0.9965281 , 0.99649683, 0.9965281 ,
         0.99649683, 0.99649683, 0.99649683, 0.99649683, 0.99646555],
        [0.99499546, 0.99568359, 0.99574614, 0.99577742, 0.99583998,
         0.9958087 , 0.99577742, 0.99583998, 0.99587126, 0.99583998,
         0.99583998, 0.99583998, 0.99583998, 0.99583998, 0.99587126,
         0.99583998, 0.99583998, 0.99583998, 0.99583998, 0.99583998]]),
 13: array([[0.99590254, 0.99627788, 0.99643427, 0.99637171, 0.99627788,
         0.99627788, 0.99630916, 0.99627788, 0.99634043, 0.99640299,
         0.99637171, 0

In [39]:
logit_cv_150users.Cs_

array([0.1       , 0.46315789, 0.82631579, 1.18947368, 1.55263158,
       1.91578947, 2.27894737, 2.64210526, 3.00526316, 3.36842105,
       3.73157895, 4.09473684, 4.45789474, 4.82105263, 5.18421053,
       5.54736842, 5.91052632, 6.27368421, 6.63684211, 7.        ])

In [70]:
logit_cv_150users.Cs_[10]

3.7315789473684213

In [64]:
c

1.9157894736842107

In [71]:
%%time
logit_cv_150users2 = LogisticRegressionCV(
    Cs=[logit_cv_150users.Cs_[10]],
    multi_class='ovr',
    random_state=17,
    n_jobs=-1,
    cv=skf)
logit_cv_150users2.fit(X_train_150, y_train_150)

CPU times: user 17min 27s, sys: 51.2 s, total: 18min 18s
Wall time: 7min 25s


LogisticRegressionCV(Cs=[3.7315789473684213], class_weight=None,
                     cv=StratifiedKFold(n_splits=3, random_state=17, shuffle=True),
                     dual=False, fit_intercept=True, intercept_scaling=1.0,
                     l1_ratios=None, max_iter=100, multi_class='ovr', n_jobs=-1,
                     penalty='l2', random_state=17, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [72]:
cv_scores_by_user = {}
for user_id in logit_cv_150users2.scores_:
#     print('User {}, CV score: {}'.format(user_id, np.mean(logit_cv_150users.scores_[user_id])))
    cv_scores_by_user[user_id]  = np.mean(logit_cv_150users2.scores_[user_id])

class_distr = np.bincount(y_train_150.astype('int'))

var = []
for user_id in np.unique(y_train_150):
    var.append(cv_scores_by_user[user_id] - ((sum(class_distr) - class_distr[user_id])/sum(class_distr)))

In [77]:
num_better_than_default = (np.array(var)>0).sum()

users_number = len(np.unique(y_train_150))


a = round(num_better_than_default/users_number,3)
a

0.813

In [74]:
num_better_than_default

122

In [78]:
def write_answer(i, answer):
    with open(f"task4_{i}_answer", mode = "w") as file:
        file.write(str(answer))

In [79]:
write_answer(7,a)