# Описание
Здесь представлена модель для [соревнования](https://stepik.org/lesson/226979/step/1?unit=199528), 
который дается в конце курса ["Введение в Data Science и машинное обучение"](https://stepik.org/course/4852/) на stepic.

ROC score на сабмите 0.8926

Особенность модели в том, что использованы признаки, которые **автоматически** сгенерированы с помощью библотек [featuretools](https://github.com/Featuretools/featuretools) и [tsfresh](https://tsfresh.readthedocs.io/en/latest/).

У меня было желание проверить какое качество можно получить с помощью подхода автоматического создания признаков. 

# Импорт

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.externals import joblib

sys.path.append(os.path.join(sys.path[0], '../'))

import libs.config as conf
import libs.data_helpers as dh
import libs.data_iter_auto as di
import libs.utils.model_utils as mu
import libs.submit_report as rep

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



# Загрузка и подготовка данных

In [2]:
# загрузка данных
events  = pd.read_csv(f"{conf.DATA_DIR}/event_data_train.zip")
submissions = pd.read_csv(f"{conf.DATA_DIR}/submissions_data_train.zip")

# генерация признаков
X_cv, y_cv = di.get_x_y(events, submissions)
print ('X_cv shape', X_cv.shape)
y_cv.value_counts(dropna=False)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


X_cv shape (19234, 354)


False    17266
True      1968
Name: is_gone, dtype: int64

In [3]:
# разделение данных для обучения на train и test
X_train, X_test, y_train, y_test = train_test_split(X_cv, y_cv, test_size=0.2, stratify=y_cv)
print ('X_train shape', X_train.shape)
y_train.value_counts(dropna=False)

X_train shape (15387, 354)


False    13813
True      1574
Name: is_gone, dtype: int64

# Обучение и проверка модели

## RandomForest

In [4]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2, 
                            min_samples_leaf=10, min_samples_split=10, 
                            class_weight='balanced')

rf.fit(X_train, y_train)
pred_proba = rf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)
# должны получить на тест roc 0.902  +- 0.02

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=10,
                       min_samples_split=10, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=2, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

roc на test 0.8995657421413881


In [5]:
# важность фич
fimp = mu.get_feature_importances_df(rf.feature_importances_, X_train.columns)
fimp.head(15)

Unnamed: 0,weight
correct__autocorrelation__lag_0,0.042068
correct__maximum,0.03271
correct__variance,0.030867
discovered__length,0.028925
viewed__mean,0.02373
viewed__minimum,0.021863
correct__mean,0.020449
passed__length,0.018527
started_attempt__length,0.016037
AVG_TIME_BETWEEN(events.date WHERE action = correct),0.015941


## кроссвалидация

In [6]:
# значение к метрике на кроссвалидации коррелирует к метрике на степике

rfcv = RandomForestClassifier(**rf.get_params())

cv_scores = cross_val_score(rfcv, X_cv, y_cv, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)
print ('mean score', mean_cv_scores)
# должны получить срдений score 0.900 +- 0.02

mean score 0.9019103605131816


## Сохранение модели

In [7]:
# сохранить модель
!mkdir $conf.BIN_MODELS_DIR
model_fname = f"{conf.BIN_MODELS_DIR}/final_auto.bin"
joblib.dump(rf, model_fname)

# загрузка модели
# rf = joblib.load(model_fname)

mkdir: cannot create directory ‘../bin_models’: File exists


['../bin_models/final_auto.bin']

# Сделать предсказание по сабмиту

In [8]:
SUBMIT_NUM = 5

events_pred  = pd.read_csv(f"{conf.DATA_DIR}/events_data_test.zip")
submissions_pred = pd.read_csv(f"{conf.DATA_DIR}/submission_data_test.zip")
X_pred , _ = di.get_x_y(events_pred, submissions_pred)

pred_proba = rf.predict_proba(X_pred)[:, 1]
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Прогноз сохранен в файл  ../reports/predict_2019-06-06_submit_5.csv
Распределение "вероятностей" модели


(0.0242, 0.117]    3443
(0.117, 0.209]      358
(0.209, 0.301]      547
(0.301, 0.392]      663
(0.392, 0.484]      364
(0.484, 0.576]      255
(0.576, 0.668]      185
(0.668, 0.76]       156
(0.76, 0.852]       132
(0.852, 0.944]       81
dtype: int64