In [691]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn import preprocessing

import time
from datetime import datetime

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

%matplotlib inline

from catboost import CatBoostClassifier

In [692]:
# Load the training data from the notebook-relative ./content directory.
df = pd.read_csv("./content/train.csv")

In [693]:
# Integer-encode the ABC column. The fitted encoder stays in `abc_le` so the
# same label -> code mapping can be applied to other frames later on.
abc_le = preprocessing.LabelEncoder()
df = df.assign(ABC=abc_le.fit_transform(df["ABC"]))

In [694]:
# Parse each date column once and derive the integer calendar parts from it.
month_dates = pd.to_datetime(df['month_id'])
df['control_year'] = month_dates.dt.year.astype(int)
df['control_month'] = month_dates.dt.month.astype(int)

cart_dates = pd.to_datetime(df['carts_created_at'])
df['cart_day'] = cart_dates.dt.day.astype(int)
df['cart_month'] = cart_dates.dt.month.astype(int)
df['cart_year'] = cart_dates.dt.year.astype(int)

# Seconds since the epoch for the cart date, parsed from the raw "%m/%d/%Y"
# string. time.mktime interprets the struct_time as local time, so the value
# depends on the timezone of the machine running the notebook.
df['cart_timestamp'] = df['carts_created_at'].map(
    lambda raw: time.mktime(datetime.strptime(raw, "%m/%d/%Y").timetuple())
)

In [695]:
# Turn the '+' promo marker into a 0/1 integer flag (anything else becomes 0).
df['promo'] = df['promo'].eq('+').astype(int)

In [696]:
# Integer-encode communication_type; `ct_le` keeps the fitted mapping for reuse.
ct_le = preprocessing.LabelEncoder()
df = df.assign(communication_type=ct_le.fit_transform(df["communication_type"]))

In [697]:
# One independent label encoder per device-related column. Each fitted encoder
# is kept under its own name so it can be reused on other frames.
os_le, browser_le, platform_le = (preprocessing.LabelEncoder() for _ in range(3))

for column, encoder in (("os", os_le), ("browser", browser_le), ("platform", platform_le)):
    df[column] = encoder.fit_transform(df[column])

In [698]:
def replace_country(row):
    """Normalise the known spellings of Russia in ``row['country']``.

    The row is updated in place and also returned, which lets the function be
    passed to ``DataFrame.apply(..., axis=1)``. Any value that is not one of
    the listed spellings is left untouched.
    """
    russia_spellings = ('Россия', '<span>Россия</span>', 'Росссия', 'РФ')
    if row['country'] not in russia_spellings:
        return row
    row['country'] = "Россия"
    return row

In [699]:
# Normalise country spellings row by row; replace_country returns each row, so
# apply() reassembles them into a frame.
df = df.apply(replace_country, axis=1)

In [700]:
# Integer-encode the normalised country names; `country_le` keeps the mapping.
country_le = preprocessing.LabelEncoder()
df = df.assign(country=country_le.fit_transform(df["country"]))

In [701]:
# Inspect column dtypes and non-null counts after the encoding steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 65 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   id                         200000 non-null  int64  
 1   age_indicator              159123 non-null  float64
 2   month_id                   200000 non-null  object 
 3   student_id                 200000 non-null  int64  
 4   program_id                 200000 non-null  int64  
 5   carts_created_at           200000 non-null  object 
 6   spent_time_total           86309 non-null   float64
 7   spent_time_to_complete_hw  42467 non-null   float64
 8   completed_hw               97599 non-null   float64
 9   failed_hw                  97599 non-null   float64
 10  reworked_hw                97599 non-null   float64
 11  interacted_hw              97599 non-null   float64
 12  avg_hw_mark                44496 non-null   float64
 13  test_with_good_mark        97

In [702]:
# Show the dtypes of the columns that are still of object dtype.
df.dtypes[df.dtypes == "object"].values

array([dtype('O'), dtype('O'), dtype('O')], dtype=object)

In [703]:
# Names of the columns that are still of object dtype, as a NumPy array.
object_mask = (df.dtypes == "object").to_numpy()
mass_object = df.columns[object_mask].values

In [704]:
# Display the object-dtype column names collected above.
mass_object

array(['month_id', 'carts_created_at', 'city'], dtype=object)

In [705]:
# Remove the object-dtype columns listed in mass_object.
df = df.drop(columns=mass_object)

In [706]:
# Keep only the columns used for modelling (features followed by the target).
df = df.loc[:, [
    "program_id", "price", "cart_timestamp", "control_month", "student_id",
    "age_indicator",
    "feedback_avg_d1", "feedback_avg_d3", "feedback_avg_d5",
    "feedback_avg_d4", "feedback_avg_d2",
    "control_year", "support_feedback_avg", "ABC", "promo",
    "bought_d1", "bought_d2", "bought_d3", "bought_d4", "bought_d5",
    "bought_avg_duration",
    "m_was_conversations", "p_was_conversations",
    "target",
]]

In [707]:
# Treat every missing value as 0. After this fill no NaN remains in the frame,
# so a follow-up dropna() would be a no-op and is not needed.
# An IterativeImputer(initial_strategy='median') fitted on the frame is a
# possible alternative to the constant fill.
df = df.fillna(0)

In [708]:
# Confirm that every selected column is fully populated after the fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   program_id            200000 non-null  int64  
 1   price                 200000 non-null  float64
 2   cart_timestamp        200000 non-null  float64
 3   control_month         200000 non-null  int64  
 4   student_id            200000 non-null  int64  
 5   age_indicator         200000 non-null  float64
 6   feedback_avg_d1       200000 non-null  float64
 7   feedback_avg_d3       200000 non-null  float64
 8   feedback_avg_d5       200000 non-null  float64
 9   feedback_avg_d4       200000 non-null  float64
 10  feedback_avg_d2       200000 non-null  float64
 11  control_year          200000 non-null  int64  
 12  support_feedback_avg  200000 non-null  float64
 13  ABC                   200000 non-null  int64  
 14  promo                 200000 non-null  int64  
 15  

In [709]:
# Split the frame into the feature matrix and the label vector.
X = df.drop(columns=["target"])
# Keep y one-dimensional (a Series, not a single-column DataFrame): the
# scikit-learn estimator and metrics used below expect a 1-D label array, and a
# column-vector y makes fit() emit a DataConversionWarning.
y = df["target"]

In [710]:
# Display the feature names, in the order they are passed to the model.
X.columns

Index(['program_id', 'price', 'cart_timestamp', 'control_month', 'student_id',
       'age_indicator', 'feedback_avg_d1', 'feedback_avg_d3',
       'feedback_avg_d5', 'feedback_avg_d4', 'feedback_avg_d2', 'control_year',
       'support_feedback_avg', 'ABC', 'promo', 'bought_d1', 'bought_d2',
       'bought_d3', 'bought_d4', 'bought_d5', 'bought_avg_duration',
       'm_was_conversations', 'p_was_conversations'],
      dtype='object')

In [711]:
# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [712]:
# Random forest configuration for the multi-class target.
# NOTE(review): max_features=25 is larger than the 23 feature columns listed
# in X.columns — confirm this is intended for the installed scikit-learn.
clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=50,
    max_features=25,
    random_state=0,
)

In [713]:
# Train the forest on the training split.
clf.fit(X_train, y_train)

  clf.fit(X_train, y_train)


In [714]:
# Predict class labels for the held-out split.
pred = clf.predict(X_test)

In [715]:
# Score on the held-out split: 0.2 * macro recall + 0.8 * macro precision.
macro_recall = recall_score(y_test, pred, average='macro')
macro_precision = precision_score(y_test, pred, average='macro')
0.2 * macro_recall + 0.8 * macro_precision

0.8571104498957557

In [716]:
# Class labels the fitted model can predict.
clf.classes_

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [717]:
# Per-class precision / recall / F1 on the held-out split. print() is needed
# because classification_report returns a pre-formatted string.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     52331
           1       0.90      0.68      0.78      4005
           2       0.90      0.53      0.67       548
           3       0.87      0.66      0.75       915
           4       0.90      0.57      0.70      1097
           5       0.88      0.63      0.73      1104

    accuracy                           0.95     60000
   macro avg       0.90      0.68      0.77     60000
weighted avg       0.95      0.95      0.94     60000



In [718]:
# Feature importances of the fitted forest as a one-column frame ("value"),
# indexed by feature name and ordered from most to least important.
importance_df = (
    pd.Series(clf.feature_importances_, index=clf.feature_names_in_, name="value")
    .sort_values(ascending=False)
    .to_frame()
)
importance_df

Unnamed: 0,value
cart_timestamp,0.167988
student_id,0.157437
price,0.14624
age_indicator,0.102816
program_id,0.098496
control_month,0.069676
feedback_avg_d1,0.033775
m_was_conversations,0.029686
ABC,0.026449
feedback_avg_d3,0.02345


# Validation: predictions for the test set

In [746]:
# Load the test data from the notebook-relative ./content directory.
df_test = pd.read_csv("./content/test.csv")

In [747]:
# Apply the ABC encoding learned on the training data to the test rows.
df_test = df_test.assign(ABC=abc_le.transform(df_test["ABC"]))

In [748]:
# Same date features as for the training frame: parse each date column once
# and derive the integer calendar parts from it.
test_month_dates = pd.to_datetime(df_test['month_id'])
df_test['control_year'] = test_month_dates.dt.year.astype(int)
df_test['control_month'] = test_month_dates.dt.month.astype(int)

test_cart_dates = pd.to_datetime(df_test['carts_created_at'])
df_test['cart_day'] = test_cart_dates.dt.day.astype(int)
df_test['cart_month'] = test_cart_dates.dt.month.astype(int)
df_test['cart_year'] = test_cart_dates.dt.year.astype(int)

# Seconds since the epoch for the cart date, parsed from the raw "%m/%d/%Y"
# string. time.mktime interprets the struct_time as local time, so the value
# depends on the timezone of the machine running the notebook.
df_test['cart_timestamp'] = df_test['carts_created_at'].map(
    lambda raw: time.mktime(datetime.strptime(raw, "%m/%d/%Y").timetuple())
)

In [749]:
# Convert the '+' promo marker of the TEST rows into a 0/1 flag. The source
# must be df_test['promo']: the training frame's promo column has already been
# converted to integers, so comparing it with '+' yields 0 for every row.
df_test['promo'] = (df_test['promo'] == '+').astype(int)

In [750]:
# Apply the communication_type encoding learned on the training data.
df_test = df_test.assign(
    communication_type=ct_le.transform(df_test["communication_type"])
)

In [751]:
# Apply the os / platform encodings learned on the training data.
for column, encoder in (("os", os_le), ("platform", platform_le)):
    df_test[column] = encoder.transform(df_test[column])

In [752]:
# Map browsers through an explicit label -> code dictionary so that values
# which are not among the encoder's classes fall back to -1.
known_browsers = browser_le.classes_
browser_le_dict = dict(zip(known_browsers, browser_le.transform(known_browsers)))
df_test['browser'] = df_test['browser'].map(
    lambda name: browser_le_dict.get(name, -1)
)

In [753]:
# Normalise country spellings in the test rows, one row at a time.
df_test = df_test.apply(replace_country, axis=1)

In [754]:
# Apply the country encoding learned on the training data.
df_test = df_test.assign(country=country_le.transform(df_test["country"]))

In [755]:
# Inspect column dtypes and non-null counts of the prepared test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84997 entries, 0 to 84996
Data columns (total 64 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         84997 non-null  int64  
 1   age_indicator              67548 non-null  float64
 2   month_id                   84997 non-null  object 
 3   student_id                 84997 non-null  int64  
 4   program_id                 84997 non-null  int64  
 5   carts_created_at           84997 non-null  object 
 6   spent_time_total           36687 non-null  float64
 7   spent_time_to_complete_hw  18046 non-null  float64
 8   completed_hw               41481 non-null  float64
 9   failed_hw                  41481 non-null  float64
 10  reworked_hw                41481 non-null  float64
 11  interacted_hw              41481 non-null  float64
 12  avg_hw_mark                18909 non-null  float64
 13  test_with_good_mark        41481 non-null  flo

In [756]:
# Object-column names to drop from the test frame, excluding "target" if it
# were present (the mass_object output above lists only month_id,
# carts_created_at and city, so the filter acts as a safeguard here).
mass_object_v = mass_object[mass_object!="target"]

In [757]:
# Remove the object-dtype columns from the test frame.
X_val = df_test.drop(columns=mass_object_v)

In [758]:
# Build the inference matrix with exactly the training feature columns, in the
# same order, by reusing X.columns instead of a second hard-coded list that
# could drift from the one used for training. Missing values are filled with 0,
# matching the treatment of the training frame.
X_val = X_val.loc[:, X.columns].fillna(0)

In [759]:
# Predict class labels for the test rows.
pred_test = clf.predict(X_val)

In [760]:
# Attach the predictions to the test frame (row order is unchanged).
df_test["target"] = pred_test

In [761]:
# Keep only the row identifier and the predicted label.
df_test = df_test.loc[:, ["id", "target"]]

In [762]:
# Write the id/target pairs to CSV without the pandas index column.
df_test.to_csv("./content/pred.csv", index=False)