In [1]:
import time
from collections import Counter
from datetime import datetime

import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the training table; the path is relative to the notebook's working directory.
df = pd.read_csv("./content/train_enriched.csv")

In [3]:
# Fit the ABC encoder on the training column, then swap the column for its integer codes.
# The fitted encoder is kept so the test set can be encoded with the same mapping.
abc_le = preprocessing.LabelEncoder().fit(df["ABC"])
df["ABC"] = abc_le.transform(df["ABC"])

In [4]:
# Parse each date column once and read the calendar parts through the .dt
# accessors instead of re-parsing the column per feature and round-tripping
# through strftime strings.
control_dt = pd.to_datetime(df['month_id'])
df['control_year'] = control_dt.dt.year.astype(int)
df['control_month'] = control_dt.dt.month.astype(int)
# Epoch seconds via time.mktime, i.e. interpreted in the machine's local timezone.
df['control_timestamp'] = df['month_id'].apply(lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

cart_dt = pd.to_datetime(df['carts_created_at'])
df['cart_day'] = cart_dt.dt.day.astype(int)
df['cart_month'] = cart_dt.dt.month.astype(int)
df['cart_year'] = cart_dt.dt.year.astype(int)
# Same local-time epoch conversion as control_timestamp.
df['cart_timestamp'] = df['carts_created_at'].apply(lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

In [5]:
# Encode the promo marker as 1 where the raw value equals '+', otherwise 0.
df['promo'] = (df['promo'] == '+').astype(int)

In [6]:
# Fit the communication_type encoder on the training column and replace the
# column with its integer codes; the encoder is reused on the test set later.
ct_le = preprocessing.LabelEncoder().fit(df["communication_type"])
df["communication_type"] = ct_le.transform(df["communication_type"])

In [7]:
# Operating-system labels.
# NOTE(review): `cats` is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
cats = ['Windows', 'Mac OS X', 'iOS', 'Android', 'Linux', 'Ubuntu',
        'Chrome OS', 'Fedora']


def os_trans(row):
    """Collapse related operating systems into a single family on one row.

    'iOS' is rewritten to 'Mac OS X'; 'Ubuntu' and 'Fedora' are rewritten to
    'Linux'; any other value is written back unchanged.

    Args:
        row: a mapping-like row (e.g. a pandas Series) exposing an ``os`` field.

    Returns:
        The same row object, with its ``os`` entry updated in place.
    """
    current = row.os
    if current in ('Ubuntu', 'Fedora'):
        merged = 'Linux'
    elif current == 'iOS':
        merged = 'Mac OS X'
    else:
        merged = current
    row["os"] = merged
    return row

In [8]:
# Merge OS variants with a vectorised replace instead of a Python-level,
# row-by-row DataFrame.apply over the whole frame. The mapping is the same
# one os_trans applies: iOS -> Mac OS X, Ubuntu/Fedora -> Linux.
os_le = preprocessing.LabelEncoder()
df["os"] = df["os"].replace({"iOS": "Mac OS X", "Ubuntu": "Linux", "Fedora": "Linux"})
df["os"] = os_le.fit_transform(df["os"])

# Each encoder below is fitted on the training column and kept under its own
# name so later cells can apply the same mapping to the test set.
browser_le = preprocessing.LabelEncoder()
df["browser"] = browser_le.fit_transform(df["browser"])

platform_le = preprocessing.LabelEncoder()
# Tablets are folded into the 'mobile' platform before encoding.
df["platform"] = df["platform"].replace("tablet", "mobile")
df["platform"] = platform_le.fit_transform(df["platform"])

program_family_price_type_le = preprocessing.LabelEncoder()
df["program_family_price_type"] = program_family_price_type_le.fit_transform(df["program_family_price_type"])

program_type_le = preprocessing.LabelEncoder()
df["program_type"] = program_type_le.fit_transform(df["program_type"])

program_starting_soon_le = preprocessing.LabelEncoder()
df["program_starting_soon"] = program_starting_soon_le.fit_transform(df["program_starting_soon"])

In [9]:
# Parse the program start date once and derive day / month / year as strftime
# strings; a later cell casts these columns to int after the zero-fill.
program_start_dt = pd.to_datetime(df['current_program_starts_on'])
df['current_program_starts_on_day'] = program_start_dt.dt.strftime("%d")
df['current_program_starts_on_month'] = program_start_dt.dt.strftime("%m")
df['current_program_starts_on_year'] = program_start_dt.dt.strftime("%Y")

In [10]:
def to_timestamp(d):
    """Convert a 'YYYY-MM-DD' value to epoch seconds, passing floats through.

    Args:
        d: either a float (returned untouched — this is how NaN placeholders
            survive) or a value parseable with the "%Y-%m-%d" format.

    Returns:
        The float input unchanged, or the time.mktime() result for the parsed
        date (local-timezone epoch seconds).
    """
    if isinstance(d, float):
        return d
    parsed = datetime.strptime(d, "%Y-%m-%d")
    return time.mktime(parsed.timetuple())

In [11]:
# Element-wise conversion of the start date to epoch seconds (floats pass through).
df['current_program_starts_on_timestamp'] = df['current_program_starts_on'].apply(to_timestamp)

In [12]:
# Display the derived timestamp column for inspection.
df['current_program_starts_on_timestamp']

0         1.602450e+09
1                  NaN
2         1.659560e+09
3         1.664140e+09
4                  NaN
              ...     
199995             NaN
199996    1.650834e+09
199997    1.660165e+09
199998             NaN
199999    1.660252e+09
Name: current_program_starts_on_timestamp, Length: 200000, dtype: float64

In [13]:
def transform_duration(row):
    """Replace a row's textual program duration with an integer count.

    The ``program_duration`` field is stringified and split on single spaces;
    the first token is parsed as an integer and the first three characters of
    the second token identify the unit. When that prefix is 'мес' the number
    is multiplied by 4; otherwise it is kept as is.

    Args:
        row: a row (e.g. a pandas Series) exposing ``program_duration``.

    Returns:
        The same row, with ``program_duration`` overwritten by the integer.
    """
    tokens = str(row.program_duration).split(' ')
    unit_prefix = tokens[1][:3]
    amount = int(tokens[0])
    row['program_duration'] = amount * 4 if unit_prefix == 'мес' else amount
    return row


In [14]:
# Convert "<number> <unit>" duration strings to integers with vectorised string
# operations instead of a row-by-row DataFrame.apply over the whole frame.
# Missing durations are treated as '0 недель' first, as before.
duration_tokens = df['program_duration'].fillna('0 недель').astype(str).str.split(' ')
duration_number = duration_tokens.str[0].astype(int)
duration_unit = duration_tokens.str[1].str[:3]
# A unit starting with 'мес' is multiplied by 4; any other unit keeps its number.
df['program_duration'] = duration_number.where(duration_unit != 'мес', duration_number * 4)

In [15]:
def replace_country(row):
    """Normalise known spelling variants of Russia to 'Россия' on one row.

    Args:
        row: a mapping-like row with a ``country`` key.

    Returns:
        The same row; ``country`` is rewritten only when it matches one of the
        listed variants, otherwise the row is returned untouched.
    """
    russia_variants = ('Россия', '<span>Россия</span>', 'Росссия', 'РФ')
    if row['country'] in russia_variants:
        row['country'] = "Россия"
    return row

In [16]:
# Model input columns, in the exact order they are fed to the classifier.
x_feats = [
    # program / purchase basics
    "program_id", "price", "cart_timestamp", "control_month", "student_id",
    "age_indicator",
    # feedback averages (order intentionally d1, d3, d5, d4, d2)
    "feedback_avg_d1", "feedback_avg_d3", "feedback_avg_d5",
    "feedback_avg_d4", "feedback_avg_d2",
    "control_year", "support_feedback_avg", "ABC", "promo",
    # purchase history
    "bought_d1", "bought_d2", "bought_d3", "bought_d4", "bought_d5",
    "bought_avg_duration",
    "m_was_conversations", "p_was_conversations",
    # program attributes
    "program_family_main_direction_id", "program_type", "program_starting_soon",
    "current_program_starts_on_timestamp",
    "current_program_starts_on_day",
    "current_program_starts_on_month",
    "current_program_starts_on_year",
    "time_diff", "auto_payment", "gender", "platform", "os",
    "program_duration", "communication_type",
]
# Same columns plus the label, used to slice the training frame.
feats = [*x_feats, "target"]

In [17]:
# Display the full column list (features plus "target").
feats

['program_id',
 'price',
 'cart_timestamp',
 'control_month',
 'student_id',
 'age_indicator',
 'feedback_avg_d1',
 'feedback_avg_d3',
 'feedback_avg_d5',
 'feedback_avg_d4',
 'feedback_avg_d2',
 'control_year',
 'support_feedback_avg',
 'ABC',
 'promo',
 'bought_d1',
 'bought_d2',
 'bought_d3',
 'bought_d4',
 'bought_d5',
 'bought_avg_duration',
 'm_was_conversations',
 'p_was_conversations',
 'program_family_main_direction_id',
 'program_type',
 'program_starting_soon',
 'current_program_starts_on_timestamp',
 'current_program_starts_on_day',
 'current_program_starts_on_month',
 'current_program_starts_on_year',
 'time_diff',
 'auto_payment',
 'gender',
 'platform',
 'os',
 'program_duration',
 'communication_type',
 'target']

In [18]:
# Replace every remaining missing value in the frame with 0. This includes the
# start-date timestamp and date-part columns, which are NaN for some rows.
df = df.fillna(0)

In [19]:
# Seconds between program start and cart creation. Start timestamps that were
# missing have already been zero-filled, so those rows end up with
# -cart_timestamp here (large negative values).
df["time_diff"] = df["current_program_starts_on_timestamp"] - df["cart_timestamp"]

In [20]:
# Cast gender to an integer column.
df["gender"] = df["gender"].astype(int)

In [21]:
# Keep only the model columns plus the label. Take an explicit copy so the
# column assignments in the following cell write to an independent frame
# rather than to a slice (avoids pandas' SettingWithCopyWarning).
df = df[feats].copy()

In [22]:
# The start-date parts were produced as strings (zero-filled where missing);
# cast each one to int.
for date_part_col in ('current_program_starts_on_day',
                      'current_program_starts_on_month',
                      'current_program_starts_on_year'):
    df[date_part_col] = df[date_part_col].astype(int)

In [23]:
# Display the prepared training frame.
df

Unnamed: 0,program_id,price,cart_timestamp,control_month,student_id,age_indicator,feedback_avg_d1,feedback_avg_d3,feedback_avg_d5,feedback_avg_d4,...,current_program_starts_on_month,current_program_starts_on_year,time_diff,auto_payment,gender,platform,os,program_duration,communication_type,target
0,1469,20042.959300,1.598389e+09,9,6694527,32.0,5.0,0.0,0.0,0.0,...,10,2020,4.060800e+06,0,1,2,5,0,1,0
1,1392,15057.315000,1.596575e+09,6,6712877,0.0,0.0,0.0,0.0,0.0,...,0,0,-1.596575e+09,1,0,2,5,0,0,0
2,376,23389.029300,1.592600e+09,2,6659444,0.0,0.0,0.0,0.0,0.0,...,8,2022,6.696000e+07,0,0,2,5,48,2,0
3,1160,22260.632220,1.618348e+09,11,7151591,1.0,0.0,0.0,0.0,0.0,...,9,2022,4.579200e+07,1,1,1,4,16,0,0
4,952,7255.515915,1.595106e+09,10,6705666,30.0,5.0,0.0,0.0,0.0,...,0,0,-1.595106e+09,1,1,2,5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,1043,10263.967450,1.602796e+09,11,6816668,27.0,0.0,0.0,0.0,0.0,...,0,0,-1.602796e+09,1,1,2,5,0,0,0
199996,1635,35998.565400,1.609535e+09,9,6984939,0.0,4.5,0.0,0.0,0.0,...,4,2022,4.129920e+07,0,0,0,3,0,0,0
199997,789,22084.062000,1.593378e+09,3,6670084,17.0,0.0,0.0,0.0,4.0,...,8,2022,6.678720e+07,0,1,2,5,32,2,0
199998,476,14377.805400,1.607288e+09,6,6917324,0.0,0.0,0.0,0.0,0.0,...,0,0,-1.607288e+09,1,0,0,0,8,0,0


In [24]:
# Separate the label from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

In [25]:
# Display the feature column names.
X.columns

Index(['program_id', 'price', 'cart_timestamp', 'control_month', 'student_id',
       'age_indicator', 'feedback_avg_d1', 'feedback_avg_d3',
       'feedback_avg_d5', 'feedback_avg_d4', 'feedback_avg_d2', 'control_year',
       'support_feedback_avg', 'ABC', 'promo', 'bought_d1', 'bought_d2',
       'bought_d3', 'bought_d4', 'bought_d5', 'bought_avg_duration',
       'm_was_conversations', 'p_was_conversations',
       'program_family_main_direction_id', 'program_type',
       'program_starting_soon', 'current_program_starts_on_timestamp',
       'current_program_starts_on_day', 'current_program_starts_on_month',
       'current_program_starts_on_year', 'time_diff', 'auto_payment', 'gender',
       'platform', 'os', 'program_duration', 'communication_type'],
      dtype='object')

In [26]:

# Count rows per class; most_common() yields (label, count) pairs ordered by
# descending frequency, and dict() keeps that order.
class_counts = dict(Counter(y).most_common())
class_counts

{0: 174301, 1: 13512, 5: 3678, 4: 3659, 3: 3027, 2: 1823}

In [27]:
# Target row counts for oversampling: identical to the observed counts except
# class 2, whose count is doubled.
sample_class_counts = dict(class_counts)
sample_class_counts[2] *= 2

In [28]:
from imblearn.over_sampling import RandomOverSampler

# Pass the per-class targets by keyword (current imbalanced-learn releases make
# sampling_strategy keyword-only) and fix the seed so the duplicated rows are
# the same on every run.
# NOTE(review): the resampling is applied to the full X, y and the train/test
# split happens in a later cell, so copies of the same class-2 row can land on
# both sides of the split and inflate that class's test scores. Consider
# splitting first and oversampling only the training part.
over_sampler: RandomOverSampler = RandomOverSampler(sampling_strategy=sample_class_counts, random_state=42)
X, y = over_sampler.fit_resample(X, y)



In [29]:
# Class counts after oversampling.
Counter(y).most_common()

[(0, 174301), (1, 13512), (5, 3678), (4, 3659), (2, 3646), (3, 3027)]

In [30]:
# 70/30 hold-out split with a fixed seed; no stratification is requested.
# NOTE(review): X, y were oversampled before this point, so duplicated rows may
# appear in both the training and the test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [31]:
# Random forest with a fixed seed: 500 trees, each limited to depth 50.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=50,
    random_state=0,
)

In [32]:
# Train the forest on the training split.
clf.fit(X_train, y_train)

In [33]:
# Predict class labels for the held-out split.
pred = clf.predict(X_test)

In [34]:
# Combined score: 0.2 * macro recall + 0.8 * macro precision on the held-out split.
macro_recall = recall_score(y_test, pred, average='macro')
macro_precision = precision_score(y_test, pred, average='macro')
0.2 * macro_recall + 0.8 * macro_precision

0.9027985807463323

In [35]:
# Class labels known to the fitted classifier.
clf.classes_

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [36]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     52253
           1       0.94      0.69      0.80      4067
           2       0.94      0.91      0.92      1097
           3       0.92      0.69      0.79       930
           4       0.95      0.70      0.80      1109
           5       0.90      0.69      0.78      1091

    accuracy                           0.96     60547
   macro avg       0.93      0.78      0.84     60547
weighted avg       0.96      0.96      0.95     60547



In [37]:
# Feature importances of the fitted forest, indexed by feature name and sorted
# from most to least important.
importance_df = (
    pd.DataFrame({"value": clf.feature_importances_}, index=clf.feature_names_in_)
    .sort_values("value", ascending=False)
)
importance_df

Unnamed: 0,value
control_month,0.110831
student_id,0.097288
cart_timestamp,0.096824
time_diff,0.085532
price,0.077386
age_indicator,0.069313
program_id,0.040987
current_program_starts_on_timestamp,0.028808
current_program_starts_on_day,0.028054
m_was_conversations,0.025353


# Validation

In [38]:
# Load the test table; the path is relative to the notebook's working directory.
df_test = pd.read_csv("./content/test_enriched.csv")

In [39]:
# Encode ABC with the encoder fitted on the training data.
# NOTE(review): LabelEncoder.transform raises on labels it did not see during
# fit — confirm the test set contains none.
df_test["ABC"] = abc_le.transform(df_test["ABC"])

In [40]:
# Parse each date column once and read the calendar parts through the .dt
# accessors instead of re-parsing the column per feature and round-tripping
# through strftime strings.
test_control_dt = pd.to_datetime(df_test['month_id'])
df_test['control_year'] = test_control_dt.dt.year.astype(int)
df_test['control_month'] = test_control_dt.dt.month.astype(int)
# Epoch seconds via time.mktime, i.e. interpreted in the machine's local timezone.
df_test['control_timestamp'] = df_test['month_id'].apply(
    lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

test_cart_dt = pd.to_datetime(df_test['carts_created_at'])
df_test['cart_day'] = test_cart_dt.dt.day.astype(int)
df_test['cart_month'] = test_cart_dt.dt.month.astype(int)
df_test['cart_year'] = test_cart_dt.dt.year.astype(int)
# Same local-time epoch conversion as control_timestamp.
df_test['cart_timestamp'] = df_test['carts_created_at'].apply(
    lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

In [41]:
# Encode the test set's own promo marker as 1 where the raw value equals '+',
# otherwise 0. (This previously read the training frame's promo column, which
# is already integer-encoded and belongs to different rows.)
df_test['promo'] = (df_test['promo'] == '+').astype(int)

In [42]:
# Encode communication_type with the encoder fitted on the training data.
df_test["communication_type"] = ct_le.transform(df_test["communication_type"])

In [43]:
# Merge OS variants with a vectorised replace instead of a row-by-row
# DataFrame.apply (same mapping as os_trans: iOS -> Mac OS X,
# Ubuntu/Fedora -> Linux), then encode with the training-fitted encoder.
df_test["os"] = df_test["os"].replace({"iOS": "Mac OS X", "Ubuntu": "Linux", "Fedora": "Linux"})
df_test["os"] = os_le.transform(df_test["os"])
# Tablets are folded into 'mobile' before encoding, as on the training set.
df_test["platform"] = df_test["platform"].replace("tablet", "mobile")
df_test["platform"] = platform_le.transform(df_test["platform"])

In [44]:
# Apply the training-fitted encoders to the matching test columns.
for encoded_col, fitted_le in (("program_type", program_type_le),
                               ("program_starting_soon", program_starting_soon_le)):
    df_test[encoded_col] = fitted_le.transform(df_test[encoded_col])

In [45]:
# Build a label -> code lookup from the fitted encoder so that labels absent
# from the training data can be mapped to -1 instead of raising.
price_type_labels = program_family_price_type_le.classes_
price_type_codes = program_family_price_type_le.transform(price_type_labels)
program_family_price_type_le_dict = {
    label: code for label, code in zip(price_type_labels, price_type_codes)
}
df_test['program_family_price_type'] = df_test['program_family_price_type'].apply(
    lambda raw_label: program_family_price_type_le_dict.get(raw_label, -1))

In [46]:
# Parse the program start date once and derive day / month / year as strftime
# strings; a later cell casts these columns to int after the zero-fill.
test_program_start_dt = pd.to_datetime(df_test['current_program_starts_on'])
df_test['current_program_starts_on_day'] = test_program_start_dt.dt.strftime("%d")
df_test['current_program_starts_on_month'] = test_program_start_dt.dt.strftime("%m")
df_test['current_program_starts_on_year'] = test_program_start_dt.dt.strftime("%Y")

In [47]:
# Element-wise conversion of the start date to epoch seconds (floats pass through).
df_test['current_program_starts_on_timestamp'] = df_test['current_program_starts_on'].apply(to_timestamp)

In [48]:
# Convert "<number> <unit>" duration strings to integers with vectorised string
# operations instead of a row-by-row DataFrame.apply over the whole frame.
# Missing durations are treated as '0 недель' first, as before.
test_duration_tokens = df_test['program_duration'].fillna('0 недель').astype(str).str.split(' ')
test_duration_number = test_duration_tokens.str[0].astype(int)
test_duration_unit = test_duration_tokens.str[1].str[:3]
# A unit starting with 'мес' is multiplied by 4; any other unit keeps its number.
df_test['program_duration'] = test_duration_number.where(test_duration_unit != 'мес', test_duration_number * 4)

In [49]:
# Label -> code lookup from the fitted browser encoder; browsers not present in
# the training data are mapped to -1 instead of raising.
browser_labels = browser_le.classes_
browser_le_dict = {
    label: code for label, code in zip(browser_labels, browser_le.transform(browser_labels))
}
df_test['browser'] = df_test['browser'].apply(lambda raw_browser: browser_le_dict.get(raw_browser, -1))

In [50]:
# Replace every remaining missing value in the test frame with 0.
df_test = df_test.fillna(0)

In [51]:
# Seconds between program start and cart creation; computed after the
# zero-fill, so rows with no start timestamp get -cart_timestamp.
df_test["time_diff"] = df_test["current_program_starts_on_timestamp"] - df_test["cart_timestamp"]

In [52]:
# Cast gender to an integer column.
df_test["gender"] = df_test["gender"].astype(int)

In [53]:
# NOTE(review): repeats the zero-fill already applied to df_test a few cells
# earlier; likely redundant.
df_test = df_test.fillna(0)

In [54]:
# The start-date parts were produced as strings (zero-filled where missing);
# cast each one to int.
for date_part_col in ('current_program_starts_on_day',
                      'current_program_starts_on_month',
                      'current_program_starts_on_year'):
    df_test[date_part_col] = df_test[date_part_col].astype(int)

In [55]:
# NOTE(review): same time_diff computation as a few cells earlier; it
# recomputes the column from the same two inputs.
df_test["time_diff"] = df_test["current_program_starts_on_timestamp"] - df_test["cart_timestamp"]

In [56]:
# Cast gender to an integer column (same cast as applied a few cells earlier).
df_test["gender"] = df_test["gender"].astype(int)

In [57]:
# Select the model's feature columns, in training order, from the test frame.
X_val = df_test[x_feats]

In [58]:
# Predict class labels for the test set.
pred_test = clf.predict(X_val)

In [59]:
# Attach the predictions to the test frame.
df_test["target"] = pred_test

In [60]:
# Keep only the submission columns. This overwrites df_test, so the feature
# columns are no longer available on it afterwards.
df_test = df_test[["id", "target"]]

In [61]:
# Write the id/target pairs to CSV without the index column.
df_test.to_csv("./content/pred.csv", index=False)

In [62]:
# NOTE(review): repeats the prediction already made above on the same X_val.
pred_test = clf.predict(X_val)

In [63]:
# NOTE(review): df_test was already reduced to ["id", "target"] above, so this
# overwrites the existing target column on that slice.
df_test["target"] = pred_test

In [64]:
# NOTE(review): df_test already holds only these two columns at this point.
df_test = df_test[["id", "target"]]

In [65]:
# NOTE(review): writes the same output path a second time.
df_test.to_csv("./content/pred.csv", index=False)