In [842]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn import preprocessing

import time
from datetime import datetime

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

%matplotlib inline

from catboost import CatBoostClassifier

In [843]:
# Load the enriched training data into the working frame.
df = pd.read_csv(filepath_or_buffer="./content/train_enriched.csv")

In [844]:
# Fit a label encoder on ABC and keep it (abc_le) so the same mapping can be applied to other data.
abc_le = preprocessing.LabelEncoder()
df = df.assign(ABC=abc_le.fit_transform(df["ABC"]))

In [845]:
# Parse each date column once and read the calendar parts through the .dt accessor;
# re-parsing the column for every feature and round-tripping through strings is wasted work.
control_dates = pd.to_datetime(df['month_id'])
df['control_year'] = control_dates.dt.year.astype(int)
df['control_month'] = control_dates.dt.month.astype(int)

cart_dates = pd.to_datetime(df['carts_created_at'])
df['cart_day'] = cart_dates.dt.day.astype(int)
df['cart_month'] = cart_dates.dt.month.astype(int)
df['cart_year'] = cart_dates.dt.year.astype(int)
# Seconds since the epoch for the cart date; time.mktime interprets the date in local time.
df['cart_timestamp'] = df['carts_created_at'].apply(lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

In [846]:
# Binary promo flag: 1 where the raw value is '+', 0 otherwise.
df['promo'] = (df['promo'] == '+').astype(int)

In [847]:
# Fit a label encoder on communication_type and keep it (ct_le) for reuse.
ct_le = preprocessing.LabelEncoder()
df = df.assign(communication_type=ct_le.fit_transform(df["communication_type"]))

In [848]:
# One label encoder per categorical column. Each encoder keeps its own name
# because later cells reuse the fitted encoders.
os_le = preprocessing.LabelEncoder()
browser_le = preprocessing.LabelEncoder()
platform_le = preprocessing.LabelEncoder()
program_family_price_type_le = preprocessing.LabelEncoder()
program_type_le = preprocessing.LabelEncoder()
program_starting_soon_le = preprocessing.LabelEncoder()

for column_name, encoder in [
    ("os", os_le),
    ("browser", browser_le),
    ("platform", platform_le),
    ("program_family_price_type", program_family_price_type_le),
    ("program_type", program_type_le),
    ("program_starting_soon", program_starting_soon_le),
]:
    # Fit on the column and overwrite it with the integer codes.
    df[column_name] = encoder.fit_transform(df[column_name])

In [849]:
# Day / month / year of the program start, kept as formatted strings
# (rows without a start date stay missing).
starts_on = pd.to_datetime(df['current_program_starts_on'])
df['current_program_starts_on_day'] = starts_on.dt.strftime("%d")
df['current_program_starts_on_month'] = starts_on.dt.strftime("%m")
df['current_program_starts_on_year'] = starts_on.dt.strftime("%Y")

In [861]:
def to_timestamp(d):
    """Convert a ``%Y-%m-%d`` date string to seconds since the epoch (local time).

    Float inputs (for example the NaN of a missing date) are returned unchanged.
    """
    if isinstance(d, float):
        return d
    parsed = datetime.strptime(d, "%Y-%m-%d")
    return time.mktime(parsed.timetuple())

In [862]:
# Epoch timestamp of the program start; missing dates stay NaN.
df['current_program_starts_on_timestamp'] = df['current_program_starts_on'].apply(to_timestamp)

In [863]:
# Inspect the derived timestamp column.
df['current_program_starts_on_timestamp']

0         1.602450e+09
1                  NaN
2         1.659560e+09
3         1.664140e+09
4                  NaN
              ...     
199995             NaN
199996    1.650834e+09
199997    1.660165e+09
199998             NaN
199999    1.660252e+09
Name: current_program_starts_on_timestamp, Length: 200000, dtype: float64

In [864]:
def transform_duration(row):
    """Convert ``row['program_duration']`` to an integer number of weeks.

    The value is expected to look like ``"<number> <unit>"``. A unit that starts
    with ``'мес'`` is converted at 4 weeks per unit; any other unit keeps the
    number as is. A value with no unit part (for example an integer left behind
    by a previous run of the cell) is kept as that integer, so applying the
    function twice does not fail.

    The row is modified in place and returned.
    """
    parts = str(row.program_duration).split(' ')
    number = int(parts[0])
    # Guard the unit lookup: indexing parts[1] unconditionally raises IndexError
    # for values that carry no unit.
    unit = parts[1][0:3] if len(parts) > 1 else ''
    if unit == 'мес':
        number = number * 4
    row['program_duration'] = number
    return row


In [865]:
# Missing durations become zero weeks, then every row's duration is converted to a week count.
df['program_duration'] = df['program_duration'].fillna('0 недель')
df = df.apply(transform_duration, axis=1)

In [866]:
def replace_country(row):
    """Normalize the known spellings of Russia in ``row['country']`` to "Россия".

    Any other country value is left untouched. The row is modified in place and returned.
    """
    russia_spellings = ['Россия', '<span>Россия</span>', 'Росссия', 'РФ']
    if row['country'] not in russia_spellings:
        return row
    row['country'] = "Россия"
    return row

In [867]:
# Normalize the country spelling row by row.
df = df.apply(replace_country, axis=1)

In [868]:
# Fit a label encoder on the cleaned country column and keep it (country_le) for reuse.
country_le = preprocessing.LabelEncoder()
df = df.assign(country=country_le.fit_transform(df["country"]))

In [869]:
# Column overview: non-null counts and dtypes after the encoding steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 77 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ABC                                  200000 non-null  int64  
 1   activity                             97599 non-null   float64
 2   age_indicator                        159123 non-null  float64
 3   auto_payment                         200000 non-null  int64  
 4   available_program_starts_on          188693 non-null  object 
 5   avg_hw_mark                          44496 non-null   float64
 6   avg_quiz_result                      16259 non-null   float64
 7   bought_avg_duration                  10359 non-null   float64
 8   bought_d1                            200000 non-null  int64  
 9   bought_d2                            200000 non-null  int64  
 10  bought_d3                            200000 non-null  int64  
 11  bought_d4    

In [870]:
# dtypes of the columns that are still stored as generic objects.
object_mask = df.dtypes == "object"
df.dtypes[object_mask].values

array([dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'),
       dtype('O'), dtype('O'), dtype('O'), dtype('O')], dtype=object)

In [871]:
# Names of the columns whose dtype is still object.
column_dtypes = df.dtypes
mass_object = column_dtypes[column_dtypes == "object"].index.values

In [872]:
# Show the object-typed column names collected above.
mass_object

array(['available_program_starts_on', 'carts_created_at', 'city',
       'current_program_starts_on', 'month_id', 'program_price_type',
       'current_program_starts_on_day', 'current_program_starts_on_month',
       'current_program_starts_on_year'], dtype=object)

In [873]:
# df = df.drop(['available_program_starts_on', 'carts_created_at', 'city',
#        'current_program_starts_on', 'month_id',], axis = 1)

In [874]:
# Keep only the model features plus the label, in this order.
selected_columns = [
    "program_id",
    "price",
    "cart_timestamp",
    "control_month",
    "student_id",
    "age_indicator",
    "feedback_avg_d1",
    "feedback_avg_d3",
    "feedback_avg_d5",
    "feedback_avg_d4",
    "feedback_avg_d2",
    "control_year",
    "support_feedback_avg",
    "ABC",
    "promo",
    'bought_d1',
    'bought_d2',
    'bought_d3',
    'bought_d4',
    'bought_d5',
    'bought_avg_duration',
    "m_was_conversations",
    "p_was_conversations",
    "program_family_main_direction_id",
    "program_type",
    "program_starting_soon",
    "current_program_starts_on_day",
    "current_program_starts_on_month",
    "current_program_starts_on_year",
    "current_program_starts_on_timestamp",
    "target",
]
df = df[selected_columns]

In [875]:
# Replace every remaining missing value with 0.
df = df.fillna(value=0)

# Disabled alternatives:
# imp = IterativeImputer(initial_strategy='median')
# df = pd.DataFrame(data=imp.fit_transform(df), columns=df.columns)

#df = df.dropna()

In [876]:
# The start-date parts were built as strings (with 0 filled in for missing); cast them to integers.
for part_column in (
    'current_program_starts_on_day',
    'current_program_starts_on_month',
    'current_program_starts_on_year',
):
    df[part_column] = df[part_column].astype(int)

In [877]:
# Column overview of the reduced frame after filling and casting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 31 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   program_id                           200000 non-null  int64  
 1   price                                200000 non-null  float64
 2   cart_timestamp                       200000 non-null  float64
 3   control_month                        200000 non-null  int64  
 4   student_id                           200000 non-null  int64  
 5   age_indicator                        200000 non-null  float64
 6   feedback_avg_d1                      200000 non-null  float64
 7   feedback_avg_d3                      200000 non-null  float64
 8   feedback_avg_d5                      200000 non-null  float64
 9   feedback_avg_d4                      200000 non-null  float64
 10  feedback_avg_d2                      200000 non-null  float64
 11  control_year 

In [878]:
# Display the prepared training frame.
df

Unnamed: 0,program_id,price,cart_timestamp,control_month,student_id,age_indicator,feedback_avg_d1,feedback_avg_d3,feedback_avg_d5,feedback_avg_d4,...,m_was_conversations,p_was_conversations,program_family_main_direction_id,program_type,program_starting_soon,current_program_starts_on_day,current_program_starts_on_month,current_program_starts_on_year,current_program_starts_on_timestamp,target
0,1469,20042.959300,1.598389e+09,9,6694527,32.0,5.0,0.0,0.0,0.0,...,0.0,0.0,1,2,2,12,10,2020,1.602450e+09,0
1,1392,15057.315000,1.596575e+09,6,6712877,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0,2,0,0,0,0.000000e+00,0
2,376,23389.029300,1.592600e+09,2,6659444,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4,2,1,4,8,2022,1.659560e+09,0
3,1160,22260.632220,1.618348e+09,11,7151591,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0,0,26,9,2022,1.664140e+09,0
4,952,7255.515915,1.595106e+09,10,6705666,30.0,5.0,0.0,0.0,0.0,...,0.0,0.0,1,0,2,0,0,0,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,1043,10263.967450,1.602796e+09,11,6816668,27.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0,2,0,0,0,0.000000e+00,0
199996,1635,35998.565400,1.609535e+09,9,6984939,0.0,4.5,0.0,0.0,0.0,...,0.0,0.0,1,2,2,25,4,2022,1.650834e+09,0
199997,789,22084.062000,1.593378e+09,3,6670084,17.0,0.0,0.0,0.0,4.0,...,0.0,0.0,4,2,0,11,8,2022,1.660165e+09,0
199998,476,14377.805400,1.607288e+09,6,6917324,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1,0,0,0,0,0,0.000000e+00,0


In [879]:
# Features are every column except the label.
X = df.drop(columns=["target"])
# Keep the label as a 1-D Series: scikit-learn estimators expect y of shape (n_samples,),
# and a single-column DataFrame makes fit() emit a DataConversionWarning.
y = df["target"]

In [880]:
# List the feature columns that go into the model.
X.columns

Index(['program_id', 'price', 'cart_timestamp', 'control_month', 'student_id',
       'age_indicator', 'feedback_avg_d1', 'feedback_avg_d3',
       'feedback_avg_d5', 'feedback_avg_d4', 'feedback_avg_d2', 'control_year',
       'support_feedback_avg', 'ABC', 'promo', 'bought_d1', 'bought_d2',
       'bought_d3', 'bought_d4', 'bought_d5', 'bought_avg_duration',
       'm_was_conversations', 'p_was_conversations',
       'program_family_main_direction_id', 'program_type',
       'program_starting_soon', 'current_program_starts_on_day',
       'current_program_starts_on_month', 'current_program_starts_on_year',
       'current_program_starts_on_timestamp'],
      dtype='object')

In [881]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [1027]:
# Random forest with 500 trees, depth capped at 50, fixed seed.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=50,
    random_state=0,
)

In [1028]:
# Train the classifier on the training split.
clf.fit(X_train, y_train)

  clf.fit(X_train, y_train)


In [1029]:
# Predict labels for the held-out split.
pred = clf.predict(X_test)

In [1030]:
# Score = 0.2 * macro recall + 0.8 * macro precision on the held-out split.
macro_recall = recall_score(y_test, pred, average='macro')
macro_precision = precision_score(y_test, pred, average='macro')
0.2 * macro_recall + 0.8 * macro_precision

0.8793598121727884

In [1031]:
# Class labels the fitted classifier learned.
clf.classes_

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [1032]:
# Per-class precision / recall / F1 on the held-out split.
holdout_report = classification_report(y_test, pred)
print(holdout_report)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     52331
           1       0.93      0.64      0.76      4005
           2       0.93      0.54      0.69       548
           3       0.92      0.66      0.77       915
           4       0.94      0.63      0.75      1097
           5       0.90      0.61      0.73      1104

    accuracy                           0.95     60000
   macro avg       0.93      0.68      0.78     60000
weighted avg       0.95      0.95      0.94     60000



In [1033]:
# Feature importances of the fitted forest, largest first.
importance_df = pd.DataFrame(
    {"value": clf.feature_importances_},
    index=clf.feature_names_in_,
)
importance_df = importance_df.sort_values(by="value", ascending=False)
importance_df

Unnamed: 0,value
cart_timestamp,0.141
student_id,0.134468
control_month,0.11351
price,0.104509
age_indicator,0.09163
program_id,0.050897
current_program_starts_on_timestamp,0.034926
current_program_starts_on_day,0.034179
m_was_conversations,0.030098
feedback_avg_d1,0.029055


# Validation: predictions for the test set

In [1034]:
# Load the enriched test data.
df_test = pd.read_csv(filepath_or_buffer="./content/test_enriched.csv")

In [1035]:
# Encode ABC with the encoder fitted on the training data.
df_test = df_test.assign(ABC=abc_le.transform(df_test["ABC"]))

In [1036]:
# Parse each date column once and read the calendar parts through the .dt accessor;
# re-parsing the column for every feature and round-tripping through strings is wasted work.
test_control_dates = pd.to_datetime(df_test['month_id'])
df_test['control_year'] = test_control_dates.dt.year.astype(int)
df_test['control_month'] = test_control_dates.dt.month.astype(int)

test_cart_dates = pd.to_datetime(df_test['carts_created_at'])
df_test['cart_day'] = test_cart_dates.dt.day.astype(int)
df_test['cart_month'] = test_cart_dates.dt.month.astype(int)
df_test['cart_year'] = test_cart_dates.dt.year.astype(int)
# Seconds since the epoch for the cart date; time.mktime interprets the date in local time.
df_test['cart_timestamp'] = df_test['carts_created_at'].apply(lambda d: time.mktime(datetime.strptime(d, "%m/%d/%Y").timetuple()))

In [1037]:
# Encode the test set's own promo flag ('+' -> 1, anything else -> 0).
# The source must be df_test: reading df['promo'] would copy rows from the training
# frame, where promo is already 0/1, so the comparison with '+' would always give 0.
df_test['promo'] = df_test['promo'].apply(lambda promo: promo == '+').astype(int)

In [1038]:
# Encode communication_type with the encoder fitted on the training data.
df_test = df_test.assign(communication_type=ct_le.transform(df_test["communication_type"]))

In [1039]:
# Encode os and platform with the encoders fitted on the training data.
for column_name, fitted_encoder in (("os", os_le), ("platform", platform_le)):
    df_test[column_name] = fitted_encoder.transform(df_test[column_name])

In [1040]:
# Encode program_type and program_starting_soon with the encoders fitted on the training data.
for column_name, fitted_encoder in (
    ("program_type", program_type_le),
    ("program_starting_soon", program_starting_soon_le),
):
    df_test[column_name] = fitted_encoder.transform(df_test[column_name])

In [1041]:
# Label -> code lookup built from the fitted encoder; labels absent from the lookup map to -1.
known_price_types = program_family_price_type_le.classes_
known_price_type_codes = program_family_price_type_le.transform(known_price_types)
program_family_price_type_le_dict = dict(zip(known_price_types, known_price_type_codes))
df_test['program_family_price_type'] = df_test['program_family_price_type'].apply(
    lambda label: program_family_price_type_le_dict.get(label, -1)
)

In [1042]:
# Day / month / year of the program start, kept as formatted strings
# (rows without a start date stay missing).
test_starts_on = pd.to_datetime(df_test['current_program_starts_on'])
df_test['current_program_starts_on_day'] = test_starts_on.dt.strftime("%d")
df_test['current_program_starts_on_month'] = test_starts_on.dt.strftime("%m")
df_test['current_program_starts_on_year'] = test_starts_on.dt.strftime("%Y")

In [1043]:
# Epoch timestamp of the program start; missing dates stay NaN.
df_test['current_program_starts_on_timestamp'] = df_test['current_program_starts_on'].apply(to_timestamp)

In [1044]:
# Missing durations become zero weeks, then every row's duration is converted to a week count.
df_test['program_duration'] = df_test['program_duration'].fillna('0 недель')
df_test = df_test.apply(transform_duration, axis=1)

In [1045]:
# Label -> code lookup built from the fitted browser encoder; labels absent from the lookup map to -1.
known_browsers = browser_le.classes_
known_browser_codes = browser_le.transform(known_browsers)
browser_le_dict = dict(zip(known_browsers, known_browser_codes))
df_test['browser'] = df_test['browser'].apply(
    lambda label: browser_le_dict.get(label, -1)
)

In [1046]:
# Normalize the country spelling row by row.
df_test = df_test.apply(replace_country, axis=1)

In [1047]:
# Encode country with the encoder fitted on the training data.
df_test = df_test.assign(country=country_le.transform(df_test["country"]))

In [1048]:
# Column overview of the test frame: non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84997 entries, 0 to 84996
Data columns (total 76 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ABC                                  84997 non-null  int64  
 1   activity                             41481 non-null  float64
 2   age_indicator                        67548 non-null  float64
 3   auto_payment                         84997 non-null  int64  
 4   available_program_starts_on          80233 non-null  object 
 5   avg_hw_mark                          18909 non-null  float64
 6   avg_quiz_result                      6954 non-null   float64
 7   bought_avg_duration                  4277 non-null   float64
 8   bought_d1                            84997 non-null  int64  
 9   bought_d2                            84997 non-null  int64  
 10  bought_d3                            84997 non-null  int64  
 11  bought_d4                   

In [1049]:
# Object-typed column names with "target" excluded.
not_target = mass_object != "target"
mass_object_v = mass_object[not_target]

In [1050]:
# X_val = df_test.drop(mass_object_v, axis = 1)

In [1051]:
# Fill missing values with 0, then pick the model features in the order used for training.
df_test = df_test.fillna(value=0)
validation_columns = [
    "program_id",
    "price",
    "cart_timestamp",
    "control_month",
    "student_id",
    "age_indicator",
    "feedback_avg_d1",
    "feedback_avg_d3",
    "feedback_avg_d5",
    "feedback_avg_d4",
    "feedback_avg_d2",
    "control_year",
    "support_feedback_avg",
    "ABC",
    "promo",
    'bought_d1',
    'bought_d2',
    'bought_d3',
    'bought_d4',
    'bought_d5',
    'bought_avg_duration',
    "m_was_conversations",
    "p_was_conversations",
    "program_family_main_direction_id",
    "program_type",
    "program_starting_soon",
    "current_program_starts_on_day",
    "current_program_starts_on_month",
    "current_program_starts_on_year",
    "current_program_starts_on_timestamp",
]
X_val = df_test[validation_columns]

In [1052]:
# Predict labels for the test features with the fitted classifier.
pred_test = clf.predict(X_val)

In [1053]:
# Attach the predicted labels to the test frame.
df_test = df_test.assign(target=pred_test)

In [1054]:
# Keep only the row id and the predicted label.
df_test = df_test.loc[:, ["id", "target"]]

In [1055]:
# Write the predictions to CSV without the index column.
df_test.to_csv(path_or_buf="./content/pred.csv", index=False)