In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn import preprocessing

import time
from datetime import datetime

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

%matplotlib inline

from catboost import CatBoostClassifier,Pool

In [32]:
# Load the raw training table.
train_csv = "./content/train.csv"
df = pd.read_csv(train_csv)

In [33]:
# Fit the ABC encoder on the training column; abc_le is reused for the test frame.
abc_le = preprocessing.LabelEncoder().fit(df["ABC"])
df["ABC"] = abc_le.transform(df["ABC"])

In [34]:
# Parse each date column once and reuse the result, instead of re-parsing the
# same strings for every derived feature and round-tripping through strftime.
month_dt = pd.to_datetime(df['month_id'])
cart_dt = pd.to_datetime(df['carts_created_at'])

# Components of the reporting month.
df['control_year'] = month_dt.dt.year.astype(int)
df['control_month'] = month_dt.dt.month.astype(int)

# Components of the cart creation date.
df['cart_day'] = cart_dt.dt.day.astype(int)
df['cart_month'] = cart_dt.dt.month.astype(int)
df['cart_year'] = cart_dt.dt.year.astype(int)
# Seconds since the epoch via time.mktime, which interprets the date in the
# machine's local timezone. Left unchanged so the values stay identical to the
# ones produced for the test frame.
df['cart_timestamp'] = df['carts_created_at'].apply(lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

In [35]:
# Promo flag: 1 where the raw value is '+', 0 otherwise (including missing values).
df['promo'] = (df['promo'] == '+').astype(int)

In [36]:
# Fit the communication_type encoder on training data; ct_le is reused for the test frame.
ct_le = preprocessing.LabelEncoder().fit(df["communication_type"])
df["communication_type"] = ct_le.transform(df["communication_type"])

In [37]:
# One label encoder per device-related column; the fitted encoders are kept
# under their own names because the test pipeline reuses them.
os_le, browser_le, platform_le = (preprocessing.LabelEncoder() for _ in range(3))

for column, encoder in (("os", os_le), ("browser", browser_le), ("platform", platform_le)):
    df[column] = encoder.fit_transform(df[column])

In [38]:
def replace_country(row):
    """Normalize known spellings of Russia in ``row['country']``.

    If the value under the ``'country'`` key is one of the listed variants it
    is overwritten with ``"Россия"``; any other value is left untouched.
    The row is modified in place and the same object is returned.
    """
    russia_spellings = ('Россия', '<span>Россия</span>', 'Росссия', 'РФ')
    if row['country'] in russia_spellings:
        row['country'] = "Россия"
    return row

In [39]:
# Collapse the known spellings of Russia into a single label with a vectorized
# column-level replace. The previous row-wise df.apply(..., axis=1) rebuilt
# every row of the frame just to touch this one column.
df["country"] = df["country"].replace(['<span>Россия</span>', 'Росссия', 'РФ'], 'Россия')

In [40]:
# Fit the country encoder on the normalized training column; reused for the test frame.
country_le = preprocessing.LabelEncoder().fit(df["country"])
df["country"] = country_le.transform(df["country"])

In [42]:
# Inspect column dtypes and non-null counts after encoding and feature engineering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 65 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         200000 non-null  int64  
 1   age_indicator              159123 non-null  float64
 2   month_id                   200000 non-null  object 
 3   student_id                 200000 non-null  int64  
 4   program_id                 200000 non-null  int64  
 5   carts_created_at           200000 non-null  object 
 6   spent_time_total           86309 non-null   float64
 7   spent_time_to_complete_hw  42467 non-null   float64
 8   completed_hw               97599 non-null   float64
 9   failed_hw                  97599 non-null   float64
 10  reworked_hw                97599 non-null   float64
 11  interacted_hw              97599 non-null   float64
 12  avg_hw_mark                44496 non-null   float64
 13  test_with_good_mark        97

In [43]:
# Show the dtype entries of the columns that are still object-typed.
df.dtypes[df.dtypes == "object"].values

array([dtype('O'), dtype('O'), dtype('O')], dtype=object)

In [44]:
# Names of the columns whose dtype is still object.
column_dtypes = df.dtypes
mass_object = column_dtypes[column_dtypes == "object"].index.values

In [45]:
# Display the collected object-dtype column names.
mass_object

array(['month_id', 'carts_created_at', 'city'], dtype=object)

In [46]:
# Remove the object-dtype columns so only numeric columns remain.
df = df.drop(columns=mass_object)

In [47]:
# Fill every missing value with the mean of its column.
column_means = df.mean()
df = df.fillna(column_means)

# Disabled alternative: model-based imputation.
# imp = IterativeImputer(initial_strategy='median')
# df = pd.DataFrame(data=imp.fit_transform(df), columns=df.columns)

In [48]:
# gender is passed to CatBoost as a categorical feature below, so store it as integers.
df["gender"] = df["gender"].astype(int)

In [49]:
# Re-check dtypes and non-null counts after dropping object columns and imputing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 62 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         200000 non-null  int64  
 1   age_indicator              200000 non-null  float64
 2   student_id                 200000 non-null  int64  
 3   program_id                 200000 non-null  int64  
 4   spent_time_total           200000 non-null  float64
 5   spent_time_to_complete_hw  200000 non-null  float64
 6   completed_hw               200000 non-null  float64
 7   failed_hw                  200000 non-null  float64
 8   reworked_hw                200000 non-null  float64
 9   interacted_hw              200000 non-null  float64
 10  avg_hw_mark                200000 non-null  float64
 11  test_with_good_mark        200000 non-null  float64
 12  test_with_great_mark       200000 non-null  float64
 13  webinars                   20

In [50]:
# Separate the label (kept as a one-column frame) from the feature matrix.
y = df[["target"]]
X = df.drop(columns=["target"])

In [51]:
# List the feature columns that go into the model.
X.columns

Index(['id', 'age_indicator', 'student_id', 'program_id', 'spent_time_total',
       'spent_time_to_complete_hw', 'completed_hw', 'failed_hw', 'reworked_hw',
       'interacted_hw', 'avg_hw_mark', 'test_with_good_mark',
       'test_with_great_mark', 'webinars', 'avg_quiz_result', 'notes',
       'hw_leader', 'lessons', 'activity', 'bought_d1', 'bought_d2',
       'bought_d3', 'bought_d4', 'bought_d5', 'bought_avg_duration',
       'payment_type', 'promo', 'price', 'communication_type', 'auto_payment',
       'ABC', 'country', 'gender', 'speed_recall', 'os', 'browser', 'platform',
       'm_avg_talk_duration', 'm_avg_duration', 'm_missed_calls',
       'm_total_calls', 'm_was_conversations', 'm_total_duration',
       'p_avg_talk_duration', 'p_avg_duration', 'p_missed_calls',
       'p_total_calls', 'p_was_conversations', 'p_total_duration',
       'support_feedback_avg', 'feedback_avg_d1', 'feedback_avg_d2',
       'feedback_avg_d3', 'feedback_avg_d4', 'feedback_avg_d5', 'control_year

In [52]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [53]:
# CatBoost configuration; custom_loss lists additional metrics to report during training.
catboost_params = {
    "iterations": 500,
    # "learning_rate": 0.15,  # disabled
    "custom_loss": ['Precision', 'Recall'],
}
clf = CatBoostClassifier(**catboost_params)
# Disabled alternative model:
#RandomForestClassifier(random_state=0, max_depth=50, n_estimators=500, max_features = 25)

In [54]:
# Columns CatBoost should treat as categorical.
categorical_columns = [
    "ABC", "promo", "communication_type", "browser", "os",
    "platform", "country", "gender", "payment_type",
]
clf.fit(X_train, y_train, cat_features=categorical_columns, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.182872
0:	learn: 1.2004316	total: 777ms	remaining: 6m 27s
1:	learn: 0.9837494	total: 1.71s	remaining: 7m 4s
2:	learn: 0.8525680	total: 2.5s	remaining: 6m 54s
3:	learn: 0.7636192	total: 3.27s	remaining: 6m 45s
4:	learn: 0.6971031	total: 4.02s	remaining: 6m 37s
5:	learn: 0.6478911	total: 4.78s	remaining: 6m 33s
6:	learn: 0.6116002	total: 5.52s	remaining: 6m 28s
7:	learn: 0.5843006	total: 6.27s	remaining: 6m 25s
8:	learn: 0.5590122	total: 7.1s	remaining: 6m 27s
9:	learn: 0.5395431	total: 7.86s	remaining: 6m 25s
10:	learn: 0.5264407	total: 8.65s	remaining: 6m 24s
11:	learn: 0.5142728	total: 9.51s	remaining: 6m 26s
12:	learn: 0.5037419	total: 10.3s	remaining: 6m 25s
13:	learn: 0.4940226	total: 11.1s	remaining: 6m 24s
14:	learn: 0.4876416	total: 11.8s	remaining: 6m 21s
15:	learn: 0.4826909	total: 12.6s	remaining: 6m 20s
16:	learn: 0.4774797	total: 13.4s	remaining: 6m 19s
17:	learn: 0.4727884	total: 14.1s	remaining: 6m 18s
18:	learn: 0.4683668	total: 14.9s	remaining: 6m

156:	learn: 0.3749524	total: 2m 4s	remaining: 4m 31s
157:	learn: 0.3747805	total: 2m 5s	remaining: 4m 30s
158:	learn: 0.3743369	total: 2m 5s	remaining: 4m 29s
159:	learn: 0.3740372	total: 2m 6s	remaining: 4m 29s
160:	learn: 0.3738702	total: 2m 7s	remaining: 4m 28s
161:	learn: 0.3737477	total: 2m 8s	remaining: 4m 27s
162:	learn: 0.3734537	total: 2m 8s	remaining: 4m 26s
163:	learn: 0.3732735	total: 2m 9s	remaining: 4m 25s
164:	learn: 0.3730565	total: 2m 10s	remaining: 4m 24s
165:	learn: 0.3728380	total: 2m 11s	remaining: 4m 24s
166:	learn: 0.3727857	total: 2m 12s	remaining: 4m 23s
167:	learn: 0.3727015	total: 2m 12s	remaining: 4m 22s
168:	learn: 0.3725283	total: 2m 13s	remaining: 4m 21s
169:	learn: 0.3723326	total: 2m 14s	remaining: 4m 21s
170:	learn: 0.3721879	total: 2m 15s	remaining: 4m 20s
171:	learn: 0.3720211	total: 2m 16s	remaining: 4m 19s
172:	learn: 0.3718750	total: 2m 16s	remaining: 4m 18s
173:	learn: 0.3715411	total: 2m 17s	remaining: 4m 17s
174:	learn: 0.3714683	total: 2m 18s	

309:	learn: 0.3479735	total: 4m 2s	remaining: 2m 28s
310:	learn: 0.3478165	total: 4m 3s	remaining: 2m 28s
311:	learn: 0.3477276	total: 4m 4s	remaining: 2m 27s
312:	learn: 0.3475582	total: 4m 5s	remaining: 2m 26s
313:	learn: 0.3475404	total: 4m 6s	remaining: 2m 25s
314:	learn: 0.3473156	total: 4m 7s	remaining: 2m 25s
315:	learn: 0.3472293	total: 4m 7s	remaining: 2m 24s
316:	learn: 0.3471742	total: 4m 8s	remaining: 2m 23s
317:	learn: 0.3468424	total: 4m 9s	remaining: 2m 22s
318:	learn: 0.3465402	total: 4m 10s	remaining: 2m 22s
319:	learn: 0.3462340	total: 4m 11s	remaining: 2m 21s
320:	learn: 0.3461515	total: 4m 12s	remaining: 2m 20s
321:	learn: 0.3460213	total: 4m 13s	remaining: 2m 20s
322:	learn: 0.3458394	total: 4m 14s	remaining: 2m 19s
323:	learn: 0.3457712	total: 4m 15s	remaining: 2m 18s
324:	learn: 0.3455907	total: 4m 16s	remaining: 2m 17s
325:	learn: 0.3454078	total: 4m 16s	remaining: 2m 17s
326:	learn: 0.3451305	total: 4m 17s	remaining: 2m 16s
327:	learn: 0.3449592	total: 4m 18s	r

463:	learn: 0.3259361	total: 6m 7s	remaining: 28.5s
464:	learn: 0.3257010	total: 6m 8s	remaining: 27.7s
465:	learn: 0.3255687	total: 6m 9s	remaining: 27s
466:	learn: 0.3255244	total: 6m 10s	remaining: 26.2s
467:	learn: 0.3251620	total: 6m 11s	remaining: 25.4s
468:	learn: 0.3249331	total: 6m 12s	remaining: 24.6s
469:	learn: 0.3248887	total: 6m 13s	remaining: 23.8s
470:	learn: 0.3247807	total: 6m 13s	remaining: 23s
471:	learn: 0.3246669	total: 6m 14s	remaining: 22.2s
472:	learn: 0.3245518	total: 6m 15s	remaining: 21.4s
473:	learn: 0.3242680	total: 6m 16s	remaining: 20.7s
474:	learn: 0.3241048	total: 6m 17s	remaining: 19.9s
475:	learn: 0.3238474	total: 6m 18s	remaining: 19.1s
476:	learn: 0.3237357	total: 6m 19s	remaining: 18.3s
477:	learn: 0.3237105	total: 6m 20s	remaining: 17.5s
478:	learn: 0.3233366	total: 6m 21s	remaining: 16.7s
479:	learn: 0.3232275	total: 6m 22s	remaining: 15.9s
480:	learn: 0.3231300	total: 6m 22s	remaining: 15.1s
481:	learn: 0.3229858	total: 6m 23s	remaining: 14.3s


<catboost.core.CatBoostClassifier at 0x22a827cc5e0>

In [55]:
# Predict class labels for the held-out split.
pred = clf.predict(X_test)

In [56]:
# Combined score: 20% macro-averaged recall plus 80% macro-averaged precision.
macro_recall = recall_score(y_test, pred, average='macro')
macro_precision = precision_score(y_test, pred, average='macro')
0.2 * macro_recall + 0.8 * macro_precision

0.6296971365223166

In [57]:
# Class labels known to the fitted classifier.
clf.classes_

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [58]:
# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_test, pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.89      0.99      0.94     52331
           1       0.67      0.13      0.21      4005
           2       0.78      0.05      0.09       548
           3       0.66      0.20      0.31       915
           4       0.70      0.16      0.26      1097
           5       0.61      0.11      0.19      1104

    accuracy                           0.88     60000
   macro avg       0.72      0.27      0.33     60000
weighted avg       0.86      0.88      0.85     60000



In [65]:
# Build a Pool over the full feature matrix and ask the fitted model for
# per-feature importances, then print them from highest to lowest score.
pool_cat_features = [
    "ABC", "promo", "communication_type", "browser", "os",
    "platform", "country", "gender", "payment_type",
]
train_pool = Pool(X, y, cat_features=pool_cat_features)
feature_importances = clf.get_feature_importance(train_pool)
feature_names = X.columns

ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    print(f'{name}: {score}')

program_id: 11.163165211859917
price: 8.251083565819348
cart_month: 8.136060509182002
cart_timestamp: 8.027525860028472
control_month: 5.457576882256347
id: 5.297300887263198
student_id: 4.96599715234736
age_indicator: 4.699769791108958
cart_day: 3.899937714764899
feedback_avg_d1: 3.3774513717865493
feedback_avg_d3: 3.340787767616071
feedback_avg_d5: 2.620671443979935
feedback_avg_d4: 2.4716479203181994
feedback_avg_d2: 2.293228850548491
control_year: 1.8863248762453864
support_feedback_avg: 1.5615924380687454
ABC: 1.5614216448942053
m_avg_duration: 1.3953319469145127
communication_type: 1.2216362919061212
gender: 1.2188608256412337
m_total_duration: 1.0569587603917951
m_was_conversations: 1.0403707775957642
promo: 1.0053918457253956
platform: 0.9169918573155971
m_avg_talk_duration: 0.8961887632310213
bought_avg_duration: 0.8178708502321239
m_total_calls: 0.7723560173093221
os: 0.7601543468826673
m_missed_calls: 0.6902536964333753
payment_type: 0.6758808991301997
activity: 0.5939465980

# Validation

In [None]:
# Load the raw test table.
test_csv = "./content/test.csv"
df_test = pd.read_csv(test_csv)

In [None]:
# Encode ABC with the encoder fitted on the training data.
abc_codes = abc_le.transform(df_test["ABC"])
df_test["ABC"] = abc_codes

In [None]:
# Parse each date column once and reuse the result, instead of re-parsing the
# same strings for every derived feature and round-tripping through strftime.
month_dt_test = pd.to_datetime(df_test['month_id'])
cart_dt_test = pd.to_datetime(df_test['carts_created_at'])

# Components of the reporting month.
df_test['control_year'] = month_dt_test.dt.year.astype(int)
df_test['control_month'] = month_dt_test.dt.month.astype(int)

# Components of the cart creation date.
df_test['cart_day'] = cart_dt_test.dt.day.astype(int)
df_test['cart_month'] = cart_dt_test.dt.month.astype(int)
df_test['cart_year'] = cart_dt_test.dt.year.astype(int)
# Seconds since the epoch via time.mktime, which interprets the date in the
# machine's local timezone. Left unchanged so the values stay identical to the
# ones produced for the training frame.
df_test['cart_timestamp'] = df_test['carts_created_at'].apply(lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

In [None]:
# Encode the promo flag from the TEST frame's own column ('+' -> 1, anything else -> 0).
# The previous version read df['promo'] (the training frame, already converted to 0/1),
# so every comparison with '+' was False and the test promo feature was silently
# replaced by index-aligned zeros.
df_test['promo'] = (df_test['promo'] == '+').astype(int)

In [None]:
# Encode communication_type with the encoder fitted on the training data.
communication_codes = ct_le.transform(df_test["communication_type"])
df_test["communication_type"] = communication_codes

In [None]:
# Encode os and platform with the encoders fitted on the training data.
for test_column, fitted_encoder in (("os", os_le), ("platform", platform_le)):
    df_test[test_column] = fitted_encoder.transform(df_test[test_column])

In [None]:
# Map each browser label seen in training to its encoder code; labels that the
# encoder never saw fall back to -1 instead of raising.
browser_codes = browser_le.transform(browser_le.classes_)
browser_le_dict = {label: code for label, code in zip(browser_le.classes_, browser_codes)}
df_test['browser'] = df_test['browser'].apply(lambda label: browser_le_dict.get(label, -1))

In [None]:
# Collapse the known spellings of Russia into a single label with a vectorized
# column-level replace. The previous row-wise df_test.apply(..., axis=1) rebuilt
# every row of the frame just to touch this one column.
df_test["country"] = df_test["country"].replace(['<span>Россия</span>', 'Росссия', 'РФ'], 'Россия')

In [None]:
# Encode country with the encoder fitted on the training data.
country_codes = country_le.transform(df_test["country"])
df_test["country"] = country_codes

In [None]:
# Inspect dtypes and non-null counts of the test frame before imputation.
df_test.info()

In [None]:
# Mean-impute the numeric columns. The string columns (month_id, carts_created_at,
# city) are still present at this point, so the mean must be restricted to
# numeric columns; a plain df_test.mean() raises TypeError on pandas >= 2.0.
# NOTE(review): this imputes with test-set means while training used
# training-set means; consider reusing the training statistics.
df_test = df_test.fillna(df_test.mean(numeric_only=True))

# Mirror the training pipeline: gender is passed to CatBoost as a categorical
# feature, so it has to be stored as integers, exactly as it was for training.
df_test["gender"] = df_test["gender"].astype(int)

In [None]:
# Columns to drop from the test frame: the object columns, never including "target".
not_target = mass_object != "target"
mass_object_v = mass_object[not_target]

In [None]:
# Feature matrix for the test set: drop the same object columns as in training.
X_val = df_test.drop(columns=mass_object_v)

In [None]:
# X_val.drop(columns=["id", "student_id", "spent_time_to_complete_hw"], inplace=True)

In [None]:
# Predict class labels for the test-set features.
pred_test = clf.predict(X_val)

In [None]:
# Attach the predictions to the test frame as the target column.
df_test["target"] = pred_test

In [None]:
# Keep only the columns written to the prediction file.
submission_columns = ["id", "target"]
df_test = df_test[submission_columns]

In [None]:
# Write the predictions without the index column.
prediction_csv = "./content/pred.csv"
df_test.to_csv(prediction_csv, index=False)