In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training logs: per-step user events and task submissions.
event_data, submissions_data = (
    pd.read_csv('test/contest/event_data_train.csv'),
    pd.read_csv('test/contest/submissions_data_train.csv'),
)

In [3]:
event_data.head()

Unnamed: 0,step_id,timestamp,action,user_id
0,32815,1434340848,viewed,17632
1,32815,1434340848,passed,17632
2,32815,1434340848,discovered,17632
3,32811,1434340895,discovered,17632
4,32811,1434340895,viewed,17632


In [4]:
submissions_data.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id
0,31971,1434349275,correct,15853
1,31972,1434348300,correct,15853
2,31972,1478852149,wrong,15853
3,31972,1478852164,correct,15853
4,31976,1434348123,wrong,15853


In [5]:
event_data.user_id.nunique()

19234

In [6]:
submissions_data.user_id.nunique()

9940

---
Отберем первые 2 дня по event_data

In [7]:
# Timestamp of the first recorded event of every user.
user_min_time = (
    event_data
    .groupby('user_id', as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'min_timestamp'})
)

In [8]:
# Keep only the events that fall inside each user's first two days.
time_limit = 60 * 60 * 24 * 2  # window length in seconds
event_data_train = (
    event_data
    .merge(user_min_time, how='outer')
    .loc[lambda df: df['timestamp'] <= df['min_timestamp'] + time_limit]
    .drop(columns='min_timestamp')
)

In [9]:
print(event_data.user_id.nunique())
print(event_data.shape)
print(event_data_train.user_id.nunique())
print(event_data_train.shape)

19234
(3480703, 4)
19234
(906203, 4)


In [10]:
event_data_train.head()

Unnamed: 0,step_id,timestamp,action,user_id
0,32815,1434340848,viewed,17632
1,32815,1434340848,passed,17632
2,32815,1434340848,discovered,17632
3,32811,1434340895,discovered,17632
4,32811,1434340895,viewed,17632


---
Соберем статистику event_data

In [11]:
# One row per user with the number of events of every action type.
users_data = pd.crosstab(
    index=event_data_train['user_id'],
    columns=event_data_train['action'],
).reset_index()

In [12]:
users_data.head()

action,user_id,discovered,passed,started_attempt,viewed
0,1,1,0,0,1
1,2,9,9,2,9
2,3,15,15,4,20
3,5,1,1,0,1
4,7,1,1,0,1


In [13]:
users_data.shape

(19234, 5)

---
Отберем первые 2 дня по submissions_data

In [14]:
# Timestamp of the first submission of every user.
user_min_time = (
    submissions_data
    .groupby('user_id', as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'min_timestamp'})
)

In [15]:
# Keep only the submissions made within two days of each user's first one.
time_limit = 60 * 60 * 24 * 2  # window length in seconds
submissions_data_train = (
    submissions_data
    .merge(user_min_time, how='outer')
    .loc[lambda df: df['timestamp'] <= df['min_timestamp'] + time_limit]
    .drop(columns='min_timestamp')
)

In [16]:
print(submissions_data.user_id.nunique())
print(submissions_data.shape)
print(submissions_data_train.user_id.nunique())
print(submissions_data_train.shape)

9940
(509104, 4)
9940
(134612, 4)


---
Добавим статистику submissions_data

In [17]:
# One row per user with the number of submissions of every status.
users_data_add = pd.crosstab(
    index=submissions_data_train['user_id'],
    columns=submissions_data_train['submission_status'],
).reset_index()

In [18]:
users_data_add.shape

(9940, 3)

In [19]:
# Join the submission counts onto the event counts; users missing on either
# side get zeros in the columns they lack.
users_data = pd.merge(
    users_data, users_data_add, how='outer', on='user_id'
).fillna(value=0)

In [20]:
users_data.head()

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,correct,wrong
0,1,1,0,0,1,0.0,0.0
1,2,9,9,2,9,2.0,0.0
2,3,15,15,4,20,4.0,4.0
3,5,1,1,0,1,2.0,2.0
4,7,1,1,0,1,0.0,0.0


---
Прошел ли пользователь курс (по всем данным)

In [96]:
# Submission counts per status over the whole history (not only the first two days).
passed_by_users = pd.crosstab(
    index=submissions_data['user_id'],
    columns=submissions_data['submission_status'],
).reset_index()

In [22]:
# A user counts as having passed the course with more than 40 correct submissions.
passed_by_users['passed_course'] = passed_by_users['correct'].gt(40)

In [23]:
# Attach the target label to the per-user feature table.
# A left join keeps exactly the users that already have a feature row, and
# only the label column is filled: users without any submission did not pass.
# (An outer join plus a frame-wide fillna(False) would add feature-less users
# and silently turn any missing feature value into False.)
users_data = users_data.merge(
    passed_by_users[['user_id', 'passed_course']], on='user_id', how='left'
)
users_data['passed_course'] = users_data['passed_course'].fillna(False).astype(bool)

In [24]:
users_data.head()

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,correct,wrong,passed_course
0,1,1,0,0,1,0.0,0.0,False
1,2,9,9,2,9,2.0,0.0,False
2,3,15,15,4,20,4.0,4.0,False
3,5,1,1,0,1,2.0,2.0,False
4,7,1,1,0,1,0.0,0.0,False


In [25]:
users_data.passed_course.sum()

1968

---
Собираем те же данные для predict

In [26]:
# Load the event and submission logs of the users to predict for.
event_data_pred, submissions_data_pred = (
    pd.read_csv('test/contest/events_data_test.csv'),
    pd.read_csv('test/contest/submission_data_test.csv'),
)

In [27]:
print(event_data_pred.user_id.nunique())
print(submissions_data_pred.user_id.nunique())

6184
2803


In [28]:
# Per-user event counts by action type for the prediction set.
users_data_pred = pd.crosstab(
    index=event_data_pred['user_id'],
    columns=event_data_pred['action'],
).reset_index()

In [29]:
# Per-user submission counts by status for the prediction set.
users_data_pred_add = pd.crosstab(
    index=submissions_data_pred['user_id'],
    columns=submissions_data_pred['submission_status'],
).reset_index()

In [30]:
# Combine both count tables; users missing on either side get zeros.
users_data_pred = pd.merge(
    users_data_pred, users_data_pred_add, how='outer', on='user_id'
).fillna(value=0)

In [31]:
users_data_pred

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,correct,wrong
0,4,1,1,0,1,0.0,0.0
1,6,1,1,0,1,0.0,0.0
2,10,2,2,0,6,0.0,0.0
3,12,11,9,4,14,1.0,0.0
4,13,70,70,35,105,29.0,36.0
...,...,...,...,...,...,...,...
6179,26791,1,1,0,1,0.0,0.0
6180,26795,1,1,0,1,0.0,0.0
6181,26796,6,4,2,12,2.0,3.0
6182,26799,6,6,2,6,2.0,0.0


---
Готовим наборы

In [32]:
# Training feature matrix: indexed by user_id, label column removed.
X = users_data.set_index('user_id').drop(columns='passed_course')

In [33]:
# Binary training target: every `passed_course` value converted with int().
# Rows are in the same order as `users_data`, i.e. the same order as `X`.
y = users_data.passed_course.map(int)

In [34]:
# Prediction feature matrix, indexed by user_id.
X_pred = users_data_pred.set_index('user_id')

In [35]:
X_pred

Unnamed: 0_level_0,discovered,passed,started_attempt,viewed,correct,wrong
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,1,1,0,1,0.0,0.0
6,1,1,0,1,0.0,0.0
10,2,2,0,6,0.0,0.0
12,11,9,4,14,1.0,0.0
13,70,70,35,105,29.0,36.0
...,...,...,...,...,...,...
26791,1,1,0,1,0.0,0.0
26795,1,1,0,1,0.0,0.0
26796,6,4,2,12,2.0,3.0
26799,6,6,2,6,2.0,0.0


---
Готовим модель и предсказываем

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [213]:
rf = RandomForestClassifier(random_state=0)

In [214]:
params = {'n_estimators': range(10, 51, 10),
          'max_depth': range(1, 12, 2),
          'min_samples_leaf': range(1, 8),
          'min_samples_split': range(2, 9, 2)}

In [215]:
# Exhaustive search over `params` with 3-fold cross-validation, n_jobs=-1.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, while the submissions below are reported as a
# ROC score — consider scoring='roc_auc'.
search = GridSearchCV(rf, params, cv=3, n_jobs=-1)

In [216]:
search.fit(X, y)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(1, 12, 2),
                         'min_samples_leaf': range(1, 8),
                         'min_samples_split': range(2, 9, 2),
                         'n_estimators': range(10, 51, 10)})

In [217]:
best_rf = search.best_estimator_

In [218]:
search.best_params_

{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 20}

In [222]:
y_pred = best_rf.predict_proba(X_pred)

In [231]:
out_pred = pd.DataFrame({'user_id': X_pred.index, 'is_gone': y_pred[:,1]})

In [234]:
out_pred.head()

Unnamed: 0,user_id,is_gone
0,4,0.000432
1,6,0.000432
2,10,0.001087
3,12,0.024938
4,13,0.511438


In [236]:
out_pred.to_csv('test/contest/attempt1.csv', index=False)

Your ROC score is 0.8874603430333944

---
Пробуем логистическую регрессию

In [237]:
from sklearn.linear_model import LogisticRegressionCV

In [238]:
clf = LogisticRegressionCV(cv=5)

In [239]:
clf.fit(X, y)

LogisticRegressionCV(cv=5)

In [240]:
y_pred = clf.predict_proba(X_pred)

In [241]:
out_pred = pd.DataFrame({'user_id': X_pred.index, 'is_gone': y_pred[:,1]})

In [242]:
out_pred.head()

Unnamed: 0,user_id,is_gone
0,4,0.038791
1,6,0.038791
2,10,0.040292
3,12,0.042275
4,13,0.823822


In [243]:
out_pred.to_csv('test/contest/attempt2.csv', index=False)

Your ROC score is 0.8768935958790868

---
Увеличим рандом форест

Не перезадал рандом - получил случайный результат

In [250]:
params = {'n_estimators': range(100, 501, 100),
          'max_depth': range(5, 12),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 7)}

In [251]:
search = GridSearchCV(rf, params, cv=3, n_jobs=-1)

In [252]:
search.fit(X, y)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(5, 12),
                         'min_samples_leaf': range(1, 3),
                         'min_samples_split': range(3, 7),
                         'n_estimators': range(100, 501, 100)})

In [253]:
search.best_params_

{'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 200}

In [256]:
# Score the prediction set with the best forest found by the search and save
# the second probability column per user as the submission file.
best_rf = search.best_estimator_
y_pred = best_rf.predict_proba(X_pred)
out_pred = (
    pd.Series(y_pred[:, 1], index=X_pred.index, name='is_gone')
    .rename_axis('user_id')
    .reset_index()
)
out_pred.to_csv('test/contest/attempt3.csv', index=False)

Your ROC score is 0.8883326020880851

---
Поищем оптимальный лес в районе {'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 20}

In [None]:
rf = RandomForestClassifier(random_state=0)

In [257]:
params = {'n_estimators': range(13, 27),
          'max_depth': range(7, 12),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 7)}

In [258]:
search = GridSearchCV(rf, params, cv=3, n_jobs=-1)

In [259]:
search.fit(X, y)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(7, 12),
                         'min_samples_leaf': range(1, 3),
                         'min_samples_split': range(3, 7),
                         'n_estimators': range(13, 27)})

In [260]:
search.best_params_

{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 23}

In [261]:
# Score the prediction set with the best forest found by the search and save
# the second probability column per user as the submission file.
best_rf = search.best_estimator_
y_pred = best_rf.predict_proba(X_pred)
out_pred = (
    pd.Series(y_pred[:, 1], index=X_pred.index, name='is_gone')
    .rename_axis('user_id')
    .reset_index()
)
out_pred.to_csv('test/contest/attempt4.csv', index=False)

Your ROC score is 0.8877151811579885


---
и в районе {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}

In [263]:
rf = RandomForestClassifier(random_state=0)

In [264]:
params = {'n_estimators': range(150, 251, 10),
          'max_depth': range(4, 9),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 7)}

In [265]:
search = GridSearchCV(rf, params, cv=3, n_jobs=-1)

In [266]:
search.fit(X, y)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(4, 9),
                         'min_samples_leaf': range(1, 3),
                         'min_samples_split': range(3, 7),
                         'n_estimators': range(150, 251, 10)})

In [267]:
search.best_params_

{'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 200}

In [268]:
# Score the prediction set with the best forest found by the search and save
# the second probability column per user as the submission file.
best_rf = search.best_estimator_
y_pred = best_rf.predict_proba(X_pred)
out_pred = (
    pd.Series(y_pred[:, 1], index=X_pred.index, name='is_gone')
    .rename_axis('user_id')
    .reset_index()
)
out_pred.to_csv('test/contest/attempt5.csv', index=False)

закидывать не будем, еще потюним

In [270]:
rf = RandomForestClassifier(random_state=0)

In [271]:
params = {'n_estimators': range(191, 210),
          'max_depth': range(5, 8),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 6)}

In [272]:
search = GridSearchCV(rf, params, cv=3, n_jobs=-1)

In [273]:
search.fit(X, y)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(5, 8),
                         'min_samples_leaf': range(1, 3),
                         'min_samples_split': range(3, 6),
                         'n_estimators': range(191, 210)})

In [274]:
search.best_params_

{'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 202}

In [275]:
# Score the prediction set with the best forest found by the search and save
# the second probability column per user as the submission file.
best_rf = search.best_estimator_
y_pred = best_rf.predict_proba(X_pred)
out_pred = (
    pd.Series(y_pred[:, 1], index=X_pred.index, name='is_gone')
    .rename_axis('user_id')
    .reset_index()
)
out_pred.to_csv('test/contest/attempt5.csv', index=False)

Your ROC score is 0.8882390432272335


---
Добавим фичей: сколько времени затрачено на просмотр

In [37]:
# Earliest 'discovered' timestamp per (user, step) inside the training window.
discovered_time = (
    event_data_train
    .loc[lambda df: df['action'] == 'discovered']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'mintime'})
)

In [38]:
# Latest 'viewed' timestamp per (user, step) inside the training window.
viewed_time = (
    event_data_train
    .loc[lambda df: df['action'] == 'viewed']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .max()
    .rename(columns={'timestamp': 'maxtime'})
)

In [39]:
# Pair first discovery with last view for the steps that have both.
viewed_step_time = pd.merge(
    discovered_time, viewed_time, how='inner', on=['user_id', 'step_id']
)

In [40]:
# Span between first discovery and last view of the step.
viewed_step_time['total'] = viewed_step_time['maxtime'].sub(viewed_step_time['mintime'])

In [41]:
# Sum of the per-step spans for every user (frame indexed by user_id).
view_total = viewed_step_time.groupby('user_id')[['total']].sum()

In [42]:
# Earliest 'discovered' timestamp per (user, step) in the prediction set.
discovered_time_pred = (
    event_data_pred
    .loc[lambda df: df['action'] == 'discovered']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'mintime'})
)

In [43]:
# Latest 'viewed' timestamp per (user, step) in the prediction set.
viewed_time_pred = (
    event_data_pred
    .loc[lambda df: df['action'] == 'viewed']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .max()
    .rename(columns={'timestamp': 'maxtime'})
)

In [44]:
# Pair first discovery with last view for the steps that have both.
viewed_step_time_pred = pd.merge(
    discovered_time_pred, viewed_time_pred, how='inner', on=['user_id', 'step_id']
)

In [45]:
# Span between first discovery and last view of the step.
viewed_step_time_pred['total'] = viewed_step_time_pred['maxtime'].sub(
    viewed_step_time_pred['mintime']
)

In [46]:
# Sum of the per-step spans for every user (frame indexed by user_id).
view_total_pred = viewed_step_time_pred.groupby('user_id')[['total']].sum()

In [47]:
# Attach the per-user viewing time as the `total` feature.
# Users without any (discovered, viewed) pair get a numeric 0; the previous
# fillna('0') inserted the *string* '0' and turned the column into a mixed
# object column. `assign` overwrites an existing `total`, so re-running the
# cell does not create `total_x` / `total_y` duplicates.
X = X.assign(total=view_total['total'].reindex(X.index).fillna(0))

In [48]:
# Attach the per-user viewing time as the `total` feature of the prediction set.
# Missing users get a numeric 0 (not the string '0', which would make the
# column a mixed object column); `assign` keeps the cell safe to re-run.
X_pred = X_pred.assign(total=view_total_pred['total'].reindex(X_pred.index).fillna(0))

In [49]:
X.head()

Unnamed: 0_level_0,discovered,passed,started_attempt,viewed,correct,wrong,total
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0,0,1,0.0,0.0,0.0
2,9,9,2,9,2.0,0.0,0.0
3,15,15,4,20,4.0,4.0,114.0
5,1,1,0,1,2.0,2.0,0.0
7,1,1,0,1,0.0,0.0,0.0


In [54]:
X_pred.isna().sum()

discovered         0
passed             0
started_attempt    0
viewed             0
correct            0
wrong              0
total              0
dtype: int64

---
Пробуем предсказать

In [172]:
def search_fit_predict(params, X, y, X_pred, path_to_result, random_state=0, scoring=None):
    """Grid-search a random forest, predict with the best one and save the result.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X, y
        Training features and target.
    X_pred : pandas.DataFrame
        Features to score; its index is written out as ``user_id``.
    path_to_result : str
        Destination of the CSV with the columns ``user_id`` and ``is_gone``.
    random_state : int, default 0
        Seed of the forest, so that repeated runs give the same model.
    scoring : str or callable, optional
        Model-selection metric passed to ``GridSearchCV``. ``None`` keeps the
        estimator's default score; pass ``'roc_auc'`` to select by ROC AUC.

    Returns
    -------
    tuple
        ``(best_estimator, best_params)`` of the fitted search.
    """
    # The seed must be passed by keyword: the first positional argument of
    # RandomForestClassifier is n_estimators, so a positional seed would be
    # taken as the number of trees and the forest would stay unseeded.
    rf = RandomForestClassifier(random_state=random_state)
    search = GridSearchCV(rf, params, cv=3, n_jobs=-1, scoring=scoring)
    search.fit(X, y)
    best_rf = search.best_estimator_
    # Second probability column, i.e. the second entry of best_rf.classes_.
    positive_proba = best_rf.predict_proba(X_pred)[:, 1]
    out_pred = pd.DataFrame({'user_id': X_pred.index, 'is_gone': positive_proba})
    out_pred.to_csv(path_to_result, index=False)
    return best_rf, search.best_params_

In [404]:
params = {'n_estimators': [10, 20, 50, 100, 150, 200, 250, 300],
          'max_depth': range(5, 10),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 6)}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt6.csv')
best_params

{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 20}

Your ROC score is 0.8851977047343148

Промахнулись с глубиной, увеличим

In [56]:
params = {'n_estimators': [10, 20, 50, 100, 150, 200, 250, 300],
          'max_depth': range(8, 14),
          'min_samples_leaf': [1],
          'min_samples_split': [4]}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt7.csv')
best_params

{'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 20}

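Промахнулись с метрикой: GridSearchCV без параметра scoring выбирает лучшую модель по accuracy, а соревнование оценивает ROC AUC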

---
Сколько времени затрачено на решение задач

In [63]:
# Earliest 'started_attempt' timestamp per (user, step) in the training window.
started_attempt_time = (
    event_data_train
    .loc[lambda df: df['action'] == 'started_attempt']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'mintime'})
)

In [64]:
# Earliest 'passed' timestamp per (user, step) in the training window.
passed_time = (
    event_data_train
    .loc[lambda df: df['action'] == 'passed']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'maxtime'})
)

In [65]:
# Pair first attempt with first pass for the steps that have both.
task_time = pd.merge(
    started_attempt_time, passed_time, how='inner', on=['user_id', 'step_id']
)

In [66]:
# Span between the first attempt and the first pass of the step.
task_time['total'] = task_time['maxtime'].sub(task_time['mintime'])

In [67]:
# Sum of the per-step spans for every user (frame indexed by user_id).
task_total = task_time.groupby('user_id')[['total']].sum()

In [69]:
# Earliest 'started_attempt' timestamp per (user, step) in the prediction set.
started_attempt_time_pred = (
    event_data_pred
    .loc[lambda df: df['action'] == 'started_attempt']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'mintime'})
)

In [70]:
# Earliest 'passed' timestamp per (user, step) in the prediction set.
passed_time_pred = (
    event_data_pred
    .loc[lambda df: df['action'] == 'passed']
    .groupby(['user_id', 'step_id'], as_index=False)['timestamp']
    .min()
    .rename(columns={'timestamp': 'maxtime'})
)

In [71]:
# Pair first attempt with first pass for the steps that have both.
task_time_pred = pd.merge(
    started_attempt_time_pred, passed_time_pred, how='inner', on=['user_id', 'step_id']
)

In [72]:
# Span between the first attempt and the first pass of the step.
task_time_pred['total'] = task_time_pred['maxtime'].sub(task_time_pred['mintime'])

In [73]:
# Sum of the per-step spans for every user (frame indexed by user_id).
task_total_pred = task_time_pred.groupby('user_id')[['total']].sum()

In [90]:
# Clear every time feature left over from earlier merges before the task-time
# feature is attached. `total_x` / `total_y` only exist when a merge cell was
# executed twice, so on a clean top-to-bottom run the unconditional drop
# raised KeyError; the view-time `total` is dropped as well so that the next
# merge does not collide with it.
X_pred = X_pred.drop(columns=['total', 'total_x', 'total_y'], errors='ignore')

In [84]:
# Store the per-user task-solving time in the single `total` feature,
# replacing the viewing time if it is still there (later cells expect exactly
# one `total` column). Users without a (started_attempt, passed) pair get a
# numeric 0 rather than the string '0'.
X = X.assign(total=task_total['total'].reindex(X.index).fillna(0))

In [78]:
X_pred.shape

(6184, 7)

In [91]:
# Store the per-user task-solving time in the single `total` feature of the
# prediction set, replacing an existing `total` instead of producing
# `total_x` / `total_y`. Missing users get a numeric 0, not the string '0'.
X_pred = X_pred.assign(total=task_total_pred['total'].reindex(X_pred.index).fillna(0))

In [92]:
X_pred

Unnamed: 0_level_0,discovered,passed,started_attempt,viewed,correct,wrong,total
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,1,1,0,1,0.0,0.0,0
6,1,1,0,1,0.0,0.0,0
10,2,2,0,6,0.0,0.0,0
12,11,9,4,14,1.0,0.0,52.0
13,70,70,35,105,29.0,36.0,75490.0
...,...,...,...,...,...,...,...
26791,1,1,0,1,0.0,0.0,0
26795,1,1,0,1,0.0,0.0,0
26796,6,4,2,12,2.0,3.0,0
26799,6,6,2,6,2.0,0.0,183.0


In [93]:
params = {'n_estimators': [10, 20, 50, 100, 150, 200, 250, 300],
          'max_depth': range(8, 14),
          'min_samples_leaf': [1],
          'min_samples_split': [4]}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt7.csv')
best_params

{'max_depth': 8,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 100}

Your ROC score is 0.8875451201961155


In [94]:
params = {'n_estimators': [10, 20, 50, 100, 150, 200, 250, 300],
          'max_depth': range(4, 14),
          'min_samples_leaf': [1],
          'min_samples_split': [4]}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt8.csv')
best_params

{'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 150}

Your ROC score is 0.8875768694124335

---
Немного схитрим и сдвинем порог «прохождения курса»: вместо 40 правильных решений попробуем 39 и 41

In [98]:
y.shape

(19234,)

In [146]:
# Users whose number of correct submissions (over all data) exceeds the
# shifted threshold; the alternative run used `> 39` instead of `> 41`.
correct = (
    passed_by_users
    .loc[passed_by_users['correct'] > 41]
    .drop(columns='wrong')
    .set_index('user_id')
)

In [147]:
# Replace the counts with a plain flag: every user kept above is a positive.
correct['correct'] = True

In [148]:
# Alternative target aligned with the rows of X: True for the users selected
# in `correct`, False for everybody else. Only X's index is needed, so the
# label no longer depends on a hard-coded list of feature columns (which
# raised KeyError as soon as the feature set changed).
y2 = correct['correct'].reindex(X.index).fillna(False).astype(bool)

In [149]:
y2.sum()

1930

In [150]:
y2.shape

(19234,)

In [151]:
params = {'n_estimators': [10, 20, 50, 100, 150, 200, 250, 300],
          'max_depth': range(4, 14),
          'min_samples_leaf': [1],
          'min_samples_split': [4]}
best_rf, best_params = search_fit_predict(params, X, y2, X_pred, 'test/contest/attempt10.csv')
best_params

{'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 300}

39: Your ROC score is 0.886211146474334
41: Your ROC score is 0.8872853845860257

---
Введем рейт

In [166]:
# Remove the time feature from BOTH matrices so that training and prediction
# use the same columns (the drop for X used to be commented out, which leaves
# the two frames inconsistent on a re-run). errors='ignore' keeps the cell
# safe to run twice; the `_x` / `_y` variants only exist after a repeated merge.
X = X.drop(columns=['total', 'total_x', 'total_y'], errors='ignore')
X_pred = X_pred.drop(columns=['total', 'total_x', 'total_y'], errors='ignore')

In [155]:
# Share of correct submissions among all submissions (NaN when there are none).
X = X.assign(correct_ratio=X['correct'] / (X['correct'] + X['wrong']))

In [156]:
# Share of correct submissions among all submissions (NaN when there are none).
X_pred = X_pred.assign(
    correct_ratio=X_pred['correct'] / (X_pred['correct'] + X_pred['wrong'])
)

In [160]:
# Users without submissions produced 0/0 above; treat their ratio as 0.
X, X_pred = X.fillna(0), X_pred.fillna(0)

In [168]:
X

Unnamed: 0_level_0,discovered,passed,started_attempt,viewed,correct,wrong,correct_ratio
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0,0,1,0.0,0.0,0.000000
2,9,9,2,9,2.0,0.0,1.000000
3,15,15,4,20,4.0,4.0,0.500000
5,1,1,0,1,2.0,2.0,0.500000
7,1,1,0,1,0.0,0.0,0.000000
...,...,...,...,...,...,...,...
26790,2,2,0,2,1.0,0.0,1.000000
26793,1,0,1,1,0.0,0.0,0.000000
26794,50,50,24,90,24.0,7.0,0.774194
26797,10,10,2,10,2.0,0.0,1.000000


In [167]:
params = {'n_estimators': range(150, 251, 10),
          'max_depth': range(4, 9),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 7)}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt11.csv')
best_params

{'max_depth': 6,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 160}

Your ROC score is 0.8875280634362852

---
Ещё один тест

In [170]:
# Go back to the plain count features for the next experiments.
X, X_pred = (frame.drop(columns='correct_ratio') for frame in (X, X_pred))

In [173]:
# Narrow search around the best forest so far, passing 42 as random_state.
# The result goes to attempt12.csv: this cell used to write attempt11.csv and
# thereby overwrote the submission file of the previous experiment.
params = {'n_estimators': range(191, 210),
          'max_depth': range(5, 8),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 6)}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt12.csv', 42)
best_params

{'max_depth': 6,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 209}

Your ROC score is 0.8886602269798761

In [176]:
params = {'n_estimators': range(100, 501, 100),
          'criterion': ['entropy', 'gini'],
          'max_depth': range(5, 12),
          'min_samples_leaf': range(1, 3),
          'min_samples_split': range(3, 7)}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt13.csv', 42)
best_params

{'criterion': 'gini',
 'max_depth': 7,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 100}

Your ROC score is 0.8877545299207653

In [179]:
params = {'n_estimators': range(191, 300),
          'max_depth': range(5, 10),
          'min_samples_leaf': range(1, 5),
          'min_samples_split': range(3, 10)}
best_rf, best_params = search_fit_predict(params, X, y, X_pred, 'test/contest/attempt14.csv', 42)
best_params

{'max_depth': 9,
 'min_samples_leaf': 4,
 'min_samples_split': 6,
 'n_estimators': 291}

Your ROC score is 0.8881993567068363