# Описание
Здесь представлен baseline для [соревнования](https://stepik.org/lesson/226979/step/1?unit=199528), 
который дается в конце курса ["Введение в Data Science и машинное обучение"](https://stepik.org/course/4852/) на stepic.

За основу взять признаки которые создавали в течении курса. ROC_AUC = 0.8845

# Импорт

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm, tqdm_notebook
import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.externals import joblib

sys.path.append(os.path.join(sys.path[0], '../'))

import libs.config as conf
import libs.data_helpers as dh
import libs.data_iter1 as di
import libs.utils.model_utils as mu
import libs.submit_report as rep

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



# Загрузка и подготовка данных

In [2]:
# загрузка данных
events  = pd.read_csv(f"{conf.DATA_DIR}/event_data_train.zip")
submissions = pd.read_csv(f"{conf.DATA_DIR}/submissions_data_train.zip")

# генерация признаков
X_cv, y_cv = di.get_x_y(events, submissions)
print ('X_cv shape', X_cv.shape)
y_cv.value_counts(dropna=False)

X_cv shape (19234, 7)


False    17266
True      1968
Name: is_gone, dtype: int64

In [3]:
# разделение данных для обучения на train и test
X_train, X_test, y_train, y_test = train_test_split(X_cv, y_cv, test_size=0.1, stratify=y_cv)
print ('X_train shape', X_train.shape)
y_train.value_counts(dropna=False)

X_train shape (17310, 7)


False    15539
True      1771
Name: is_gone, dtype: int64

# Обучение и проверка модели

## RandomForest

In [4]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2, 
                            min_samples_leaf=10, min_samples_split=10, 
                            class_weight='balanced')

rf.fit(X_train, y_train)
pred_proba = rf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_proba[:, 1])
print('roc на test', roc_score)
# должны получить на тест roc 0.875  +- 0.02

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=10,
                       min_samples_split=10, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=2, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

roc на test 0.8755169464374417


In [5]:
# важность фич
fimp = mu.get_feature_importances_df(rf.feature_importances_, X_train.columns)
fimp.head(15)

Unnamed: 0,weight
correct,0.421497
passed,0.13983
started_attempt,0.133958
wrong,0.126391
viewed,0.083641
discovered,0.080576
day,0.014106


## кроссвалидация

In [6]:
# значение к метрике на кроссвалидации коррелирует к метрике на степике

rfcv = RandomForestClassifier(**rf.get_params())

cv_scores = cross_val_score(rfcv, X_cv, y_cv, scoring='roc_auc', cv=10, n_jobs=-1)
mean_cv_scores = np.mean(cv_scores)
print ('mean score', mean_cv_scores)
# должны получить срдений score 0.8834 +- 0.01

mean score 0.8834922054704091


## Сохранение модели

In [7]:
# сохранить модель
!mkdir $conf.BIN_MODELS_DIR
model_fname = f"{conf.BIN_MODELS_DIR}/baseline_random_forest.bin"
joblib.dump(rf, model_fname)

# загрузка модели
# rf = joblib.load(model_fname)

mkdir: cannot create directory ‘../bin_models’: File exists


['../bin_models/baseline_random_forest.bin']

# Сделать предсказание по сабмиту

In [8]:
SUBMIT_NUM = 1

events_pred  = pd.read_csv(f"{conf.DATA_DIR}/events_data_test.zip")
submissions_pred = pd.read_csv(f"{conf.DATA_DIR}/submission_data_test.zip")
X_pred , _ = di.get_x_y(events_pred, submissions_pred)

pred_proba = rf.predict_proba(X_pred)[:, 1]
rep_df = rep.create_report(X_pred.index, pred_proba)
assert rep_df.user_id.nunique() == X_pred.index.nunique()
print ('Прогноз сохранен в файл ', rep.save_report(rep_df, SUBMIT_NUM))

print ('Распределение "вероятностей" модели')
pd.cut(pred_proba, 10).value_counts()

Прогноз сохранен в файл  ../reports/predict_2019-06-06_submit_1.csv
Распределение "вероятностей" модели


(-0.000983, 0.0983]    3482
(0.0983, 0.197]         201
(0.197, 0.295]          325
(0.295, 0.393]          377
(0.393, 0.491]          362
(0.491, 0.59]           337
(0.59, 0.688]           439
(0.688, 0.786]          277
(0.786, 0.885]          264
(0.885, 0.983]          120
dtype: int64