In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
def gone_user(events_data):
    
    users_data = events_data.groupby('user_id', as_index=False).agg({'timestamp': 'max'}) \
        .rename(columns={'timestamp': 'last_timestamp'})

    now = events_data.iloc[-1, 1]
    drop_out_treshhold = 30 * 24 * 60 * 60

    users_data['is_gone_user'] = (now - users_data.last_timestamp) > drop_out_treshhold
    users_data.drop('last_timestamp', axis=1, inplace=True)
    
    return users_data

In [3]:
def passed_course(users_data, submissions_data):

    user_scores = submissions_data.pivot_table(index='user_id',
                                               columns='submission_status',
                                               values='step_id',
                                               aggfunc='count',
                                               fill_value=0).reset_index()
    
    users_data = users_data.merge(user_scores, on='user_id', how='outer')
    users_data.fillna(0, inplace=True)

    users_data['passed_course'] = users_data.correct > 40
    users_data.drop(['correct', 'wrong'], axis=1, inplace=True)
    
    return users_data

In [4]:
def first_2_days_data(events_data, submissions_data):
    
    user_min_time = events_data.groupby('user_id', as_index=False) \
        .agg({'timestamp': 'min'}) \
        .rename({'timestamp': 'min_timestamp'}, axis=1)
    
    learninig_time_threshold = 2 * 24 * 60 * 60
    
    events_data = events_data.merge(user_min_time, on='user_id', how='outer')
    events_data_train = \
        events_data[events_data.timestamp <= (events_data.min_timestamp + learninig_time_threshold)]
    events_data_train = events_data_train.drop('min_timestamp', axis=1)

    submissions_data = submissions_data.merge(user_min_time, on='user_id', how='outer')
    submissions_data_train = \
        submissions_data[submissions_data.timestamp <= (submissions_data.min_timestamp + learninig_time_threshold)]
    submissions_data_train = submissions_data_train.drop('min_timestamp', axis=1)

    return events_data_train, submissions_data_train

In [5]:
def select_features(events_data_train, submissions_data_train):
    
    events_data_train['date'] = pd.to_datetime(events_data_train.timestamp, unit='s')
    events_data_train['day'] = events_data_train.date.dt.date

    submissions_data_train['date'] = pd.to_datetime(submissions_data_train.timestamp, unit='s')
    submissions_data_train['day'] = submissions_data_train.date.dt.date

    X = submissions_data_train.groupby('user_id').day.nunique().to_frame().reset_index() \
        .rename(columns={'day': 'days'})  

    steps_tried = submissions_data_train.groupby('user_id').step_id.nunique().to_frame().reset_index() \
        .rename(columns={'step_id': 'steps_tried'})

    X = X.merge(steps_tried, on='user_id', how='outer')

    X = X.merge(submissions_data_train.pivot_table(index='user_id',
                                               columns='submission_status',
                                               values='step_id',
                                               aggfunc='count',
                                               fill_value=0).reset_index())

    X['correct_ration'] = X.correct / (X.correct + X.wrong)

    X = X.merge(events_data_train.pivot_table(index='user_id',
                                               columns='action',
                                               values='step_id',
                                               aggfunc='count',
                                               fill_value=0).reset_index()[['user_id', 'viewed']], how='outer')

    X = X.fillna(0)

    return X

In [6]:
def obrabotka_train(events_data, submissions_data):
    
    users_data = gone_user(events_data)
    
    final_users_data = passed_course(users_data, submissions_data)
    
    events_data_train, submissions_data_train = first_2_days_data(events_data, submissions_data)
    
    X = select_features(events_data_train, submissions_data_train)
    
    X = X.merge(final_users_data, on='user_id', how='outer')
    X = X[~((X.is_gone_user == False) & (X.passed_course == False))]

    y = X.passed_course
    y = y.map(int)

    X = X.drop(['passed_course', 'is_gone_user'], axis=1)  
    X = X.set_index(X.user_id)
    X.drop('user_id', axis=1, inplace=True)
    
    return X, y

In [7]:
def obrabotka_test(events_data_train, submissions_data_train):
    
    X = select_features(events_data_train, submissions_data_train)
    
    X = X.set_index(X.user_id)
    X.drop('user_id', axis=1, inplace=True)
    
    return X

In [8]:
events_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/event_data_train.zip')
submissions_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/submissions_data_train.zip')

In [9]:
events_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')
submissions_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')

In [10]:
X, y = obrabotka_train(events_data, submissions_data)

In [11]:
X_test = obrabotka_test(events_data_test, submissions_data_test)

In [12]:
X

Unnamed: 0_level_0,days,steps_tried,correct,wrong,correct_ration,viewed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1.0,2.0,2.0,0.0,1.000000,9
3,1.0,4.0,4.0,4.0,0.500000,20
8,1.0,11.0,9.0,21.0,0.300000,154
14,1.0,1.0,0.0,1.0,0.000000,9
16,2.0,18.0,18.0,23.0,0.439024,117
...,...,...,...,...,...,...
26781,0.0,0.0,0.0,0.0,0.000000,6
26788,0.0,0.0,0.0,0.0,0.000000,1
26789,0.0,0.0,0.0,0.0,0.000000,2
26790,0.0,0.0,0.0,0.0,0.000000,2


In [13]:
y

0        0
1        0
2        0
3        0
4        1
        ..
19229    1
19230    0
19231    0
19232    0
19233    0
Name: passed_course, Length: 18066, dtype: int64

In [14]:
X_test

Unnamed: 0_level_0,days,steps_tried,correct,wrong,correct_ration,viewed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,1.0,1.0,1.0,0.0,1.000000,14
13,2.0,29.0,29.0,36.0,0.446154,105
15,3.0,11.0,10.0,30.0,0.250000,1
21,1.0,30.0,24.0,103.0,0.188976,98
35,3.0,9.0,7.0,35.0,0.166667,70
...,...,...,...,...,...,...
26745,0.0,0.0,0.0,0.0,0.000000,1
26768,0.0,0.0,0.0,0.0,0.000000,1
26791,0.0,0.0,0.0,0.0,0.000000,1
26795,0.0,0.0,0.0,0.0,0.000000,1


In [15]:
clf = RandomForestClassifier(criterion='entropy')

In [33]:
parameters = {
              'n_estimators':range(10, 101, 10),
              'max_depth': [1, 3, 6, 9, 12, 15, 18, 21],
              'min_samples_leaf': [1, 3, 5, 9, 16, 20, 32, 50],
              'min_samples_split': [2, 4, 7, 12, 20, 29, 40, 54, 75, 150]
             }

In [28]:
# parameters = {
#               'n_estimators':range(10, 101, 10),
#               'max_depth':range(1, 20, 2),
#               'min_samples_leaf': [1, 2, 3, 4, 5, 7, 9, 12, 16, 20, 25, 32, 40, 50, 65, 80],
#               'min_samples_split': [2, 3, 4, 5, 7, 9, 12, 16, 20, 25, 32, 40, 50, 65, 80]
#              }

In [125]:
gclf = GridSearchCV(clf, parameters, n_jobs=-1, verbose=10)

In [30]:
%%time
gclf.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  41 out of  50 | elapsed:    2.9s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  47 out of  50 | elapsed:    3.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.2s finished


CPU times: user 577 ms, sys: 51.5 ms, total: 629 ms
Wall time: 3.44 s


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='entropy',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
         

In [126]:
raise 

RuntimeError: No active exception to reraise

In [31]:
bclf = gclf.best_estimator_

In [27]:
bclf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=1, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
# rclf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=1, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=65,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [113]:
pred = pd.DataFrame({'is_gone': bclf.predict_proba(X_test)[:, 1]})

In [114]:
res = pd.DataFrame(X_test.index).join(pred)

In [115]:
res = res.set_index(res.columns[0])

In [116]:
for i in list(X_test[X_test.correct > 39].index):
    res.loc[i, 'is_gone'] = 1.0

In [117]:
res = res.reset_index()

In [120]:
res

Unnamed: 0,user_id,is_gone
0,12,0.047761
1,13,0.353726
2,15,0.300109
3,21,0.353726
4,35,0.353726
...,...,...
6179,26745,0.047761
6180,26768,0.047761
6181,26791,0.047761
6182,26795,0.047761


In [121]:
res.to_csv('predict.scv', index=False)

In [122]:
df = pd.read_csv('/home/pavel/learnpython/predict.scv')

In [123]:
df

Unnamed: 0,user_id,is_gone
0,12,0.047761
1,13,0.353726
2,15,0.300109
3,21,0.353726
4,35,0.353726
...,...,...
6179,26745,0.047761
6180,26768,0.047761
6181,26791,0.047761
6182,26795,0.047761
