In [409]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn import preprocessing

import time
from datetime import datetime

%matplotlib inline

In [410]:
# Load the training set; the preprocessing cells below modify this frame.
df = pd.read_csv("./content/train.csv")

In [411]:
# Learn the ABC label -> integer mapping on the training data. The fitted
# encoder is kept in abc_le so the test set can be encoded the same way.
abc_le = preprocessing.LabelEncoder().fit(df["ABC"])
df["ABC"] = abc_le.transform(df["ABC"])

In [412]:
# Parse each raw date column exactly once and derive every calendar feature
# from the parsed values, instead of re-parsing the column for each feature
# and round-tripping through strftime.
month_dt = pd.to_datetime(df['month_id'])
df['control_year'] = month_dt.dt.year.astype(int)
df['control_month'] = month_dt.dt.month.astype(int)

# carts_created_at is month/day/year; the explicit format keeps the day/month/
# year features consistent with the timestamp feature below.
cart_dt = pd.to_datetime(df['carts_created_at'], format="%m/%d/%Y")
df['cart_day'] = cart_dt.dt.day.astype(int)
df['cart_month'] = cart_dt.dt.month.astype(int)
df['cart_year'] = cart_dt.dt.year.astype(int)
# time.mktime interprets the date in the machine's local time zone; it is kept
# so that the feature values stay identical to the previous implementation.
df['cart_timestamp'] = cart_dt.apply(lambda d: time.mktime(d.timetuple()))

In [413]:
# Binary promo flag: 1 where the raw value is '+', otherwise 0.
df['promo'] = df['promo'].eq('+').astype(int)

In [414]:
# Integer-encode communication_type; ct_le is reused later for the test set.
ct_le = preprocessing.LabelEncoder().fit(df["communication_type"])
df["communication_type"] = ct_le.transform(df["communication_type"])

In [415]:
# One encoder per client-environment column. Each keeps its own name because
# the test-set cells reuse the fitted encoders.
os_le, browser_le, platform_le = (preprocessing.LabelEncoder() for _ in range(3))

for column, encoder in (("os", os_le), ("browser", browser_le), ("platform", platform_le)):
    df[column] = encoder.fit_transform(df[column])

In [416]:
def replace_country(row):
    """Collapse the known spellings of Russia in ``row['country']`` to "Россия".

    The row is modified in place and also returned, which lets the function be
    passed to ``DataFrame.apply(..., axis=1)``. Rows whose country is not one
    of the known spellings are returned untouched.
    """
    russia_spellings = ('Россия', '<span>Россия</span>', 'Росссия', 'РФ')
    if row['country'] not in russia_spellings:
        return row
    row['country'] = "Россия"
    return row

In [417]:
# Normalise the known spellings of Russia in one vectorised pass. The previous
# row-wise df.apply(..., axis=1) rebuilt the entire frame row by row just to
# touch a single column. The variants mirror the list in replace_country.
df["country"] = df["country"].replace(
    ["<span>Россия</span>", "Росссия", "РФ"], "Россия"
)

In [418]:
# Integer-encode the normalised country names; country_le is reused for the
# test set.
country_le = preprocessing.LabelEncoder().fit(df["country"])
df["country"] = country_le.transform(df["country"])

In [419]:
# Replace every remaining NaN with 0. This runs after the label-encoding cells
# above, so those encoders were fitted on the un-filled columns.
df = df.fillna(0)

In [420]:
# Show the dtypes of the columns that are still object-typed.
column_dtypes = df.dtypes
column_dtypes[column_dtypes == "object"].values

array([dtype('O'), dtype('O'), dtype('O')], dtype=object)

In [421]:
# Names of the columns that are still object-typed; they are dropped before
# modelling.
is_object_column = (df.dtypes == "object").to_numpy()
mass_object = df.columns[is_object_column].values

In [422]:
# The label column must not be part of the feature matrix either.
mass_object = np.concatenate([mass_object, ["target"]])

In [423]:
# Feature matrix: everything except the object-typed columns and the label.
# NOTE(review): id and student_id stay in X and rank among the most important
# features in the importance table further down; confirm that is intended.
X = df.drop(columns=mass_object)
# The label is taken as a 1-D Series. Selecting it with df[["target"]] yields
# an (n, 1) frame, which makes the classifier's fit() warn about a
# column-vector y.
y = df["target"]

In [424]:
# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [425]:
# Random forest hyper-parameters, collected in one place so they are easy to
# tune.
rf_params = dict(
    n_estimators=500,
    max_depth=50,
    max_features=25,
    random_state=0,
)
clf = RandomForestClassifier(**rf_params)

In [426]:
# Train the forest on the training part of the split.
clf.fit(X_train, y_train)

  clf.fit(X_train, y_train)


In [427]:
# Predict class labels for the held-out part of the split.
pred = clf.predict(X_test)

In [428]:
# Blended score: 20% macro recall plus 80% macro precision on the held-out
# split.
macro_recall = recall_score(y_test, pred, average='macro')
macro_precision = precision_score(y_test, pred, average='macro')
0.2 * macro_recall + 0.8 * macro_precision

0.8529331677676146

In [429]:
# Inspect the class labels the classifier saw during fit.
clf.classes_

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [430]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# multi-line string, so it is printed rather than shown as a repr.
report_text = classification_report(y_test, pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     52331
           1       0.95      0.52      0.67      4005
           2       0.94      0.35      0.51       548
           3       0.92      0.50      0.64       915
           4       0.93      0.44      0.60      1097
           5       0.91      0.44      0.60      1104

    accuracy                           0.93     60000
   macro avg       0.93      0.54      0.66     60000
weighted avg       0.93      0.93      0.92     60000



In [431]:
# Feature importances of the fitted forest, indexed by feature name and sorted
# with the most important first.
importance_df = pd.DataFrame(
    {"value": clf.feature_importances_},
    index=clf.feature_names_in_,
)
importance_df = importance_df.sort_values(by="value", ascending=False)
importance_df

Unnamed: 0,value
student_id,8.081695e-02
price,7.944171e-02
cart_timestamp,7.371811e-02
id,7.197231e-02
program_id,6.134948e-02
...,...
bought_d2,1.134229e-03
bought_d4,9.818013e-04
hw_leader,9.253281e-04
failed_hw,2.637422e-04


# Validation: score test.csv with the trained model

In [432]:
# Load the test set that the trained model will score.
df_test = pd.read_csv("./content/test.csv")

In [433]:
# Reuse the encoder fitted on the training ABC column so the codes match.
# NOTE(review): LabelEncoder.transform is documented to raise on labels it did
# not see during fit; confirm the test set has no new ABC values.
df_test["ABC"] = abc_le.transform(df_test["ABC"])

In [434]:
# Parse each raw date column exactly once and derive every calendar feature
# from the parsed values, instead of re-parsing the column for each feature
# and round-tripping through strftime.
test_month_dt = pd.to_datetime(df_test['month_id'])
df_test['control_year'] = test_month_dt.dt.year.astype(int)
df_test['control_month'] = test_month_dt.dt.month.astype(int)

# carts_created_at is month/day/year; the explicit format keeps the day/month/
# year features consistent with the timestamp feature below.
test_cart_dt = pd.to_datetime(df_test['carts_created_at'], format="%m/%d/%Y")
df_test['cart_day'] = test_cart_dt.dt.day.astype(int)
df_test['cart_month'] = test_cart_dt.dt.month.astype(int)
df_test['cart_year'] = test_cart_dt.dt.year.astype(int)
# time.mktime interprets the date in the machine's local time zone; it is kept
# so that the feature values stay identical to the previous implementation.
df_test['cart_timestamp'] = test_cart_dt.apply(lambda d: time.mktime(d.timetuple()))

In [435]:
# Binary promo flag for the test rows: 1 where the raw value is '+', else 0.
# The flag must be derived from df_test's own column. The previous code read
# df['promo'], the training column that had already been converted to 0/1, so
# the comparison with '+' was never true and the test feature was all zeros.
df_test['promo'] = (df_test['promo'] == '+').astype(int)

In [436]:
# Encode communication_type with the encoder fitted on the training data.
# NOTE(review): LabelEncoder.transform is documented to raise on labels it did
# not see during fit; confirm the test set has no new values.
df_test["communication_type"] = ct_le.transform(df_test["communication_type"])

In [437]:
# Encode os and platform with the encoders fitted on the training data.
for column, encoder in (("os", os_le), ("platform", platform_le)):
    df_test[column] = encoder.transform(df_test[column])

In [438]:
# Lookup table from each browser seen in training to its encoder code.
# Browsers missing from the table are mapped to -1.
known_browsers = browser_le.classes_
browser_le_dict = {
    name: code
    for name, code in zip(known_browsers, browser_le.transform(known_browsers))
}
df_test['browser'] = df_test['browser'].map(lambda name: browser_le_dict.get(name, -1))

In [439]:
# Normalise the known spellings of Russia in one vectorised pass. The previous
# row-wise df_test.apply(..., axis=1) rebuilt the entire frame row by row just
# to touch a single column. The variants mirror the list in replace_country.
df_test["country"] = df_test["country"].replace(
    ["<span>Россия</span>", "Росссия", "РФ"], "Россия"
)

In [440]:
# Encode country with the encoder fitted on the training data.
# NOTE(review): LabelEncoder.transform is documented to raise on labels it did
# not see during fit; confirm the test set has no new countries.
df_test["country"] = country_le.transform(df_test["country"])

In [441]:
# Replace every remaining NaN in the test frame with 0, as is done for the
# training frame.
df_test = df_test.fillna(0)

In [442]:
# Columns to drop from the test frame: the training drop-list minus "target".
is_target = mass_object == "target"
mass_object_v = mass_object[~is_target]

In [443]:
# Feature matrix for the test set: drop the same object-typed columns that
# were dropped from the training frame.
X_val = df_test.drop(columns=mass_object_v)

In [444]:
# X_val.drop(columns=["id", "student_id", "spent_time_to_complete_hw"], inplace=True)

In [445]:
# Predict the target class for every test row.
pred_test = clf.predict(X_val)

In [446]:
# Attach the predictions to the test frame as the target column.
df_test["target"] = pred_test

In [447]:
# Keep only the two columns that are written to the prediction file.
df_test = df_test.loc[:, ["id", "target"]]

In [448]:
# Write the id/target pairs to disk without the index column.
df_test.to_csv("./content/pred.csv", index=False)