In [59]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize': (9, 6)})

In [60]:
# Read the raw training logs and derive, for every row, a datetime (`date`)
# and a calendar day (`day`) from the Unix-seconds `timestamp` column.
events_data = pd.read_csv('event_data_train.zip').assign(
    date=lambda df: pd.to_datetime(df['timestamp'], unit='s'),
    day=lambda df: df['date'].dt.date,
)
submissions_data = pd.read_csv('submissions_data_train.zip').assign(
    date=lambda df: pd.to_datetime(df['timestamp'], unit='s'),
    day=lambda df: df['date'].dt.date,
)

# One row per user, one column per `action` value: number of event rows
# with that action (0 when the user has none).
users_events_data = pd.pivot_table(
    events_data,
    index='user_id',
    columns='action',
    values='step_id',
    aggfunc='count',
    fill_value=0,
).reset_index()

# One row per user, one column per `submission_status` value: number of
# submission rows with that status (0 when the user has none).
users_scores = pd.pivot_table(
    submissions_data,
    index='user_id',
    columns='submission_status',
    values='step_id',
    aggfunc='count',
    fill_value=0,
).reset_index()

In [61]:
# Reference "current" moment (Unix seconds, same scale as `timestamp`) and the
# inactivity span after which a user is treated as having dropped out.
now = 1526772811
drop_out_threshold = 30 * 24 * 60 * 60  # 30 days, in seconds

# Gaps between a user's consecutive active days, in days. For every
# (user, day) pair only the first row is kept, so each gap is the difference
# between the timestamps of those retained rows.
# NOTE(review): assumes rows are in chronological order per user - confirm.
gap_data = (
    events_data[['user_id', 'day', 'timestamp']]
    .drop_duplicates(subset=['user_id', 'day'])
    .groupby('user_id')['timestamp']
    .apply(list)
    .apply(np.diff)
    .values
)
gap_data = pd.Series(np.concatenate(gap_data, axis=0)) / (24 * 60 * 60)

# Per-user summary table, starting from the timestamp of the last event.
users_data = (
    events_data.groupby('user_id', as_index=False)
    .agg({'timestamp': 'max'})
    .rename(columns={'timestamp': 'last_timestamp'})
)
users_data['is_gone_user'] = (now - users_data['last_timestamp']) > drop_out_threshold

# Number of distinct calendar days on which each user produced events.
users_days = events_data.groupby('user_id')['day'].nunique().reset_index()

# Attach submission counts (missing values become 0), event counts and the
# active-day count.
users_data = (
    users_data
    .merge(users_scores, how='outer')
    .fillna(0)
    .merge(users_events_data, how='outer')
    .merge(users_days, how='outer')
)

# We consider a user to have successfully completed the course if they
# correctly solved more than 40 practical tasks.
users_data['passed_corse'] = users_data['passed'].gt(40)

In [62]:
# Using only the data from a user's first two days of activity on the course,
# predict whether the user will score more than 40 points on the course.
learning_time_threshold = 2 * 24 * 60 * 60  # observation window length, in seconds

# Timestamp of each user's first event.
user_min_time = events_data.groupby('user_id', as_index=False) \
    .agg({'timestamp': 'min'}) \
    .rename(columns={'timestamp': 'min_timestamp'})
users_data = users_data.merge(user_min_time, how='outer')

# Legacy "<user_id>_<timestamp>" string keys. They are still attached so the
# frames keep the same columns as before, but they are no longer used for
# filtering: comparing them lexicographically is only correct while every
# timestamp has the same number of digits.
events_data['user_time'] = events_data.user_id.map(str) + '_' + events_data.timestamp.map(str)
user_learning_time_threshold = user_min_time.user_id.map(str) + '_' + (user_min_time.min_timestamp + learning_time_threshold).map(str)
user_min_time['user_learning_time_threshold'] = user_learning_time_threshold
events_data = events_data.merge(user_min_time[['user_id', 'user_learning_time_threshold']], how='outer')

# End of the observation window per user, as a number (Unix seconds).
window_end_by_user = user_min_time.set_index('user_id')['min_timestamp'] + learning_time_threshold

# Keep only events inside each user's window (numeric comparison).
events_data_train = events_data[events_data.timestamp <= events_data.user_id.map(window_end_by_user)]

submissions_data['user_time'] = submissions_data.user_id.map(str) + '_' + submissions_data.timestamp.map(str)
# The outer merge adds an all-NaN row for every user that has events but no
# submissions; those rows have a NaN timestamp and are dropped by the filter
# below because a comparison with NaN is False.
submissions_data = submissions_data.merge(user_min_time[['user_id', 'user_learning_time_threshold']], how='outer')
submissions_data_train = submissions_data[submissions_data.timestamp <= submissions_data.user_id.map(window_end_by_user)]

In [63]:
# Build the per-user feature table from the windowed training frames.

# Distinct days with submissions and distinct steps attempted.
X = (
    submissions_data_train.groupby('user_id')['day'].nunique()
    .reset_index()
    .rename(columns={'day': 'days'})
)
steps_tried = (
    submissions_data_train.groupby('user_id')['step_id'].nunique()
    .reset_index()
    .rename(columns={'step_id': 'steps_tried'})
)
X = X.merge(steps_tried, on='user_id', how='outer')

# Submission counts per status, then the share of correct submissions.
submission_counts = pd.pivot_table(
    submissions_data_train,
    index='user_id',
    columns='submission_status',
    values='step_id',
    aggfunc='count',
    fill_value=0,
).reset_index()
X = X.merge(submission_counts)
X['correct_ratio'] = X['correct'] / (X['correct'] + X['wrong'])

# Number of `viewed` events; the outer merge also brings in users that have
# events but no submissions, whose missing features are then set to 0.
viewed_counts = pd.pivot_table(
    events_data_train,
    index='user_id',
    columns='action',
    values='step_id',
    aggfunc='count',
    fill_value=0,
).reset_index()[['user_id', 'viewed']]
X = X.merge(viewed_counts, how='outer').fillna(0)

# Attach the target and the drop-out flag, then discard users who have
# neither passed the course nor been flagged as gone.
X = X.merge(users_data[['user_id', 'passed_corse', 'is_gone_user']], how='outer')
still_in_progress = (X.is_gone_user == False) & (X.passed_corse == False)
X = X[~still_in_progress]

In [64]:
# Target: 1 when the user passed the course, 0 otherwise.
y = X['passed_corse'].map(int)

# Feature matrix: drop the label columns and move `user_id` into the index
# so that it is not used as a feature.
X = X.drop(columns=['passed_corse', 'is_gone_user']).set_index('user_id')

In [68]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fixing the seed makes the fitted model, and therefore the predictions and
# the reported score, reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

RandomForestClassifier(random_state=42)

In [98]:
# Load the test logs and add the same `date` / `day` columns that the
# training frames have.
events_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv').assign(
    date=lambda df: pd.to_datetime(df['timestamp'], unit='s'),
    day=lambda df: df['date'].dt.date,
)
submissions_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv').assign(
    date=lambda df: pd.to_datetime(df['timestamp'], unit='s'),
    day=lambda df: df['date'].dt.date,
)

# Per-user event counts by action and submission counts by status.
users_events_data_test = pd.pivot_table(
    events_data_test,
    index='user_id',
    columns='action',
    values='step_id',
    aggfunc='count',
    fill_value=0,
).reset_index()
users_scores_test = pd.pivot_table(
    submissions_data_test,
    index='user_id',
    columns='submission_status',
    values='step_id',
    aggfunc='count',
    fill_value=0,
).reset_index()

# Same feature set, in the same column order, as the training matrix:
# days, steps_tried, correct, wrong, correct_ratio, viewed.
X_test = (
    submissions_data_test.groupby('user_id')['day'].nunique()
    .reset_index()
    .rename(columns={'day': 'days'})
)
steps_tried_test = (
    submissions_data_test.groupby('user_id')['step_id'].nunique()
    .reset_index()
    .rename(columns={'step_id': 'steps_tried'})
)
X_test = X_test.merge(steps_tried_test, on='user_id', how='outer')

# The two pivots computed above are exactly the tables needed here, so they
# are reused instead of being recomputed.
X_test = X_test.merge(users_scores_test)
X_test['correct_ratio'] = X_test['correct'] / (X_test['correct'] + X_test['wrong'])
X_test = X_test.merge(users_events_data_test[['user_id', 'viewed']], how='outer')

# Users without submissions get 0 for every submission-based feature;
# `user_id` becomes the index so it is not fed to the model.
X_test = X_test.fillna(0).set_index('user_id')

In [99]:
# Class probabilities for every test user; column 1 belongs to the second
# class of `y` (passed_corse mapped to int, i.e. 1).
predict_proba = rf.predict_proba(X_test)

# Submission table: one row per test user with the predicted probability.
# NOTE(review): the column is called 'is_gone' although it stores the
# probability of the positive `passed_corse` class - presumably the header
# required by the grader; confirm.
result = X_test.reset_index()[['user_id']]
result['is_gone'] = predict_proba[:, 1]
result[['user_id', 'is_gone']].to_csv('my_predict.csv', index=False)

In [None]:
# Your ROC score is 0.846676785884974