In [2119]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn import preprocessing

import time
from datetime import datetime

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

%matplotlib inline

from catboost import CatBoostClassifier

In [2120]:
# Load the enriched training data into `df`.
df = pd.read_csv("./content/train_enriched.csv")

In [2121]:
# Integer-encode the "ABC" column. The fitted encoder stays in `abc_le`
# so the identical mapping can be applied to the test frame later on.
abc_le = preprocessing.LabelEncoder()
df["ABC"] = abc_le.fit_transform(df["ABC"])

In [2122]:
# Parse each date column exactly once, using the same explicit month/day/year
# format for every derived feature (previously the calendar parts relied on
# format inference while the timestamps used an explicit format).
date_format = "%m/%d/%Y"

month_dt = pd.to_datetime(df['month_id'], format=date_format)
df['control_year'] = month_dt.dt.year.astype(int)
df['control_month'] = month_dt.dt.month.astype(int)
# time.mktime treats the date as local midnight, so the absolute value depends
# on the timezone of the machine running the notebook.
df['control_timestamp'] = month_dt.map(lambda ts: time.mktime(ts.timetuple()))

cart_dt = pd.to_datetime(df['carts_created_at'], format=date_format)
df['cart_day'] = cart_dt.dt.day.astype(int)
df['cart_month'] = cart_dt.dt.month.astype(int)
df['cart_year'] = cart_dt.dt.year.astype(int)
df['cart_timestamp'] = cart_dt.map(lambda ts: time.mktime(ts.timetuple()))

In [2123]:
# Turn the promo marker into a 0/1 flag: 1 where the raw value is '+', else 0.
df['promo'] = df['promo'].eq('+').astype(int)

In [2124]:
# Integer-encode "communication_type"; `ct_le` is reused for the test frame.
ct_le = preprocessing.LabelEncoder()
df["communication_type"] = ct_le.fit_transform(df["communication_type"])

In [None]:
# List of operating-system names.
# NOTE(review): `cats` is not referenced anywhere in the visible code —
# presumably a reference list of raw `os` values; confirm or remove.
cats = ['Windows', 'Mac OS X', 'iOS', 'Android', 'Linux', 'Ubuntu',
       'Chrome OS', 'Fedora']
def os_trans(row):
    """Collapse related operating systems into a shared label.

    'iOS' is rewritten to 'Mac OS X', and 'Ubuntu' / 'Fedora' are rewritten
    to 'Linux'. Any other value is written back unchanged.

    Args:
        row: a row (e.g. a pandas Series) exposing an ``os`` entry.

    Returns:
        The same row object with its ``"os"`` entry updated.
    """
    merged_into = {
        'iOS': 'Mac OS X',
        'Ubuntu': 'Linux',
        'Fedora': 'Linux',
    }
    current = row.os
    row["os"] = merged_into.get(current, current)
    return row

In [None]:
# Merge OS variants row by row (see os_trans), then integer-encode the result.
df = df.apply(os_trans, axis=1)
os_le = preprocessing.LabelEncoder()
df["os"] = os_le.fit_transform(df["os"])

# Browser names are encoded as they are.
browser_le = preprocessing.LabelEncoder()
df["browser"] = browser_le.fit_transform(df["browser"])

# Tablets are folded into the 'mobile' platform before encoding.
df["platform"] = df["platform"].apply(lambda kind: 'mobile' if kind == 'tablet' else kind)
platform_le = preprocessing.LabelEncoder()
df["platform"] = platform_le.fit_transform(df["platform"])

# Program attributes: one dedicated encoder per column, each kept for the test frame.
program_family_price_type_le = preprocessing.LabelEncoder()
df["program_family_price_type"] = program_family_price_type_le.fit_transform(
    df["program_family_price_type"]
)

program_type_le = preprocessing.LabelEncoder()
df["program_type"] = program_type_le.fit_transform(df["program_type"])

program_starting_soon_le = preprocessing.LabelEncoder()
df["program_starting_soon"] = program_starting_soon_le.fit_transform(
    df["program_starting_soon"]
)

In [None]:
# Split the program start date into zero-padded day / month / year strings.
starts_on = pd.to_datetime(df['current_program_starts_on'])
df['current_program_starts_on_day'] = starts_on.dt.strftime("%d")
df['current_program_starts_on_month'] = starts_on.dt.strftime("%m")
df['current_program_starts_on_year'] = starts_on.dt.strftime("%Y")

In [None]:
def to_timestamp(d):
    """Convert a 'YYYY-MM-DD' string to a local-time epoch timestamp.

    Float inputs (which includes NaN placeholders for missing dates) are
    returned untouched; everything else is parsed with ``"%Y-%m-%d"`` and
    converted through ``time.mktime``.
    """
    if isinstance(d, float):
        return d
    parsed = datetime.strptime(d, "%Y-%m-%d")
    return time.mktime(parsed.timetuple())

In [None]:
# Epoch timestamp of the program start date; missing (float) values pass through unchanged.
df['current_program_starts_on_timestamp'] = df['current_program_starts_on'].apply(to_timestamp)

In [None]:
# Display the freshly computed timestamp column for a visual check.
df['current_program_starts_on_timestamp']

In [None]:
def transform_duration(row):
    """Normalise ``program_duration`` to an integer number of weeks.

    Text values are expected as "<number> <unit>". When the unit starts with
    'мес' (months) the number is multiplied by 4; any other unit is taken as
    already being weeks.

    Compared with a plain split-and-index, this version also copes with:
      * values that are already numeric (e.g. when the cell is re-run after a
        first conversion) - kept as an int;
      * NaN or empty values - treated as 0, matching the '0 недель' fill used
        by the calling cell;
      * text without a unit, or with repeated spaces between number and unit.

    Args:
        row: a row (e.g. a pandas Series) exposing ``program_duration``.

    Returns:
        The same row object with ``program_duration`` replaced by an int.
    """
    raw = row.program_duration
    if isinstance(raw, (int, float, np.integer, np.floating)):
        # Already numeric: nothing to parse, only guard against NaN.
        weeks = 0 if pd.isna(raw) else int(raw)
    else:
        parts = str(raw).split()
        if not parts:
            weeks = 0
        else:
            weeks = int(parts[0])
            unit = parts[1][:3] if len(parts) > 1 else ''
            if unit == 'мес':
                weeks *= 4
    row['program_duration'] = weeks
    return row


In [None]:
# Missing durations become '0 недель' so that every value can be parsed,
# then each row's duration is converted by transform_duration.
df['program_duration'] = df['program_duration'].fillna('0 недель')
df = df.apply(transform_duration, axis=1)

In [None]:
def replace_country(row):
    """Normalise known spellings of Russia in ``row['country']`` to "Россия".

    Rows whose country is not one of the listed spellings are returned
    unchanged.
    """
    russia_spellings = ('Россия', '<span>Россия</span>', 'Росссия', 'РФ')
    if row['country'] in russia_spellings:
        row['country'] = "Россия"
    return row

In [None]:
# df = df.apply(lambda row: replace_country(row), axis=1)

In [None]:
# country_le = preprocessing.LabelEncoder()
# df["country"] = country_le.fit_transform(df["country"])

In [None]:
# df.info()

In [None]:
# df.dtypes[df.dtypes == "object"].values

In [None]:
# mass_object = df.dtypes[df.dtypes == "object"].index.values

In [None]:
# mass_object

In [None]:
# df = df.drop(['available_program_starts_on', 'carts_created_at', 'city',
#        'current_program_starts_on', 'month_id',], axis = 1)

In [None]:
# Model input columns, in the order they are fed to the classifier.
# Commented-out names are candidate features that are currently switched off.
x_feats = [
    "program_id",
    "price",
    "cart_timestamp",
    "control_month",
    "student_id",
    "age_indicator",
    # "feedback_avg_d1",
    # "feedback_avg_d3",
    # "feedback_avg_d5",
    # "feedback_avg_d4",
    # "feedback_avg_d2",
    "control_year",
    # "support_feedback_avg",
    "ABC",
    "promo",
    "bought_d1",
    "bought_d2",
    "bought_d3",
    "bought_d4",
    "bought_d5",
    # "bought_avg_duration",
    "m_was_conversations",
    "p_was_conversations",
    "program_family_main_direction_id",
    "program_type",
    # "program_starting_soon",
    "current_program_starts_on_timestamp",
    "time_diff",
    "auto_payment",
    "gender",
    "platform",
    "os",
]

# Same columns plus the label, used to slice the training frame.
feats = [*x_feats, "target"]

In [None]:
# Display the selected columns (features plus target).
feats

In [None]:
# df.info()

In [None]:
# df=df[feats]

In [None]:
# df.info()

In [None]:
 # Replace every remaining NaN in the training frame with 0.
 # NOTE(review): this runs before `time_diff` is computed, so a missing
 # program start timestamp becomes 0 and `time_diff` turns into
 # -cart_timestamp for those rows — confirm this is intended.
 df = df.fillna(0)

# imp = IterativeImputer(initial_strategy='median')
# df = pd.DataFrame(data=imp.fit_transform(df), columns=df.columns)

#df = df.dropna()

# cols =[
#     "age_indicator",
# "feedback_avg_d1",
# "feedback_avg_d3",
# "feedback_avg_d5",
# "feedback_avg_d4",
# "feedback_avg_d2",
# "support_feedback_avg",
# "bought_avg_duration",
# "m_was_conversations",
# "p_was_conversations",
# "current_program_starts_on_timestamp",
# ]

# df=df[~df[cols].isna().all(axis=1)]

In [None]:
# Seconds between cart creation and program start (both are time.mktime timestamps).
df["time_diff"] = df["current_program_starts_on_timestamp"] - df["cart_timestamp"]

In [None]:
# Cast the gender column to int.
df.gender=df.gender.astype(int)

In [None]:
def price_trans(row):
    """Replace ``row['price']`` with a coarse price-bucket code.

    Buckets:
        price <= 15000            -> 0
        15000 < price <= 40000    -> 1
        40000 < price <= 60000    -> 2
        anything else (incl. NaN) -> 4

    NOTE(review): the top bucket is coded 4, not 3 — kept exactly as is;
    confirm whether the gap is intentional.
    """
    amount = row.price
    if amount <= 15000:
        bucket = 0
    elif amount <= 40000:
        bucket = 1
    elif amount <= 60000:
        bucket = 2
    else:
        bucket = 4
    row["price"] = bucket
    return row

In [None]:
# df=df.apply(lambda row: price_trans(row), axis=1)

In [None]:
# Histogram of the price column; assigning to `_` hides the returned axes repr.
_ = df[["price"]].hist(figsize=(20,12))

In [None]:
# df["bought_total"] = df.bought_d1+df.bought_d2 + df.bought_d3+df.bought_d4+df.bought_d5

In [None]:
# def age_transform(row):
#     age = row.age_indicator
#     if age<10:
#         row['age_indicator']= 20
#     return row

In [None]:
# df = df.apply(lambda row: age_transform(row), axis=1)

In [None]:
# Keep only the model features and the target; all other columns are dropped from `df`.
df=df[feats]

In [None]:
# df['current_program_starts_on_day'] = df['current_program_starts_on_day'].astype(int)
# df['current_program_starts_on_month'] = df['current_program_starts_on_month'].astype(int)
# df['current_program_starts_on_year'] = df['current_program_starts_on_year'].astype(int)

In [None]:
# df.info()

In [None]:
# df = df.fillna(0)

In [None]:
# Display the final training frame.
df

In [None]:
# Feature matrix: every selected column except the label.
X = df.drop(columns=["target"])
# Label as a 1-D Series. A one-column DataFrame would be handed to scikit-learn
# as a column vector, which triggers a data-conversion warning during fit.
y = df["target"]

In [None]:
# Display the feature column names.
X.columns

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Random forest with 500 trees, depth capped at 50, and a fixed seed.
clf = RandomForestClassifier(random_state=0, max_depth=50, n_estimators=500)

In [None]:
# Fit the forest on the training split.
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split.
pred = clf.predict(X_test)

In [None]:
# Weighted score on the held-out split: 0.2 * macro recall + 0.8 * macro precision.
0.2* recall_score(y_test, pred, average='macro') + 0.8* precision_score(y_test, pred, average='macro')

In [None]:
# Display the class labels the fitted forest knows about.
clf.classes_

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, pred))

In [None]:
# Feature importances of the fitted forest, most important first.
importance_df = (
    pd.Series(clf.feature_importances_, index=clf.feature_names_in_, name="value")
    .sort_values(ascending=False)
    .to_frame()
)
importance_df

# Validation — apply the fitted preprocessing and model to the test set

In [None]:
# Load the enriched test data into `df_test`.
df_test = pd.read_csv("./content/test_enriched.csv")

In [None]:
# Encode "ABC" with the encoder fitted on the training data.
# NOTE(review): LabelEncoder.transform is expected to fail on labels it did
# not see during fit — confirm the test set contains none.
df_test["ABC"] = abc_le.transform(df_test["ABC"])

In [None]:
# Parse each date column exactly once, using the same explicit month/day/year
# format for every derived feature (previously the calendar parts relied on
# format inference while the timestamps used an explicit format).
test_date_format = "%m/%d/%Y"

test_month_dt = pd.to_datetime(df_test['month_id'], format=test_date_format)
df_test['control_year'] = test_month_dt.dt.year.astype(int)
df_test['control_month'] = test_month_dt.dt.month.astype(int)
# time.mktime treats the date as local midnight, so the absolute value depends
# on the timezone of the machine running the notebook.
df_test['control_timestamp'] = test_month_dt.map(lambda ts: time.mktime(ts.timetuple()))

test_cart_dt = pd.to_datetime(df_test['carts_created_at'], format=test_date_format)
df_test['cart_day'] = test_cart_dt.dt.day.astype(int)
df_test['cart_month'] = test_cart_dt.dt.month.astype(int)
df_test['cart_year'] = test_cart_dt.dt.year.astype(int)
df_test['cart_timestamp'] = test_cart_dt.map(lambda ts: time.mktime(ts.timetuple()))

In [None]:
# Turn the test set's own promo marker into a 0/1 flag ('+' -> 1, otherwise 0).
# The flag must be derived from df_test: the training frame's promo column is
# already numeric at this point and its rows do not correspond to test rows.
df_test['promo'] = (df_test['promo'] == '+').astype(int)

In [None]:
# Encode "communication_type" with the encoder fitted on the training data.
df_test["communication_type"] = ct_le.transform(df_test["communication_type"])

In [None]:
# Same OS merging and platform folding as for the training frame,
# followed by the encoders that were fitted on the training data.
df_test = df_test.apply(os_trans, axis=1)
df_test["os"] = os_le.transform(df_test["os"])
df_test["platform"] = df_test["platform"].apply(
    lambda kind: 'mobile' if kind == 'tablet' else kind
)
df_test["platform"] = platform_le.transform(df_test["platform"])

In [None]:
# Encode the program columns with the encoders fitted on the training data.
df_test["program_type"] = program_type_le.transform(df_test["program_type"])
df_test["program_starting_soon"] = program_starting_soon_le.transform(df_test["program_starting_soon"])

In [None]:
# Label -> code lookup built from the train-fitted encoder.
# Labels that did not occur in training are mapped to -1 instead of raising.
program_family_price_type_le_dict = dict(zip(
    program_family_price_type_le.classes_,
    program_family_price_type_le.transform(program_family_price_type_le.classes_),
))
df_test['program_family_price_type'] = df_test['program_family_price_type'].apply(
    lambda label: program_family_price_type_le_dict.get(label, -1)
)

In [None]:
# Split the program start date into zero-padded day / month / year strings.
test_starts_on = pd.to_datetime(df_test['current_program_starts_on'])
df_test['current_program_starts_on_day'] = test_starts_on.dt.strftime("%d")
df_test['current_program_starts_on_month'] = test_starts_on.dt.strftime("%m")
df_test['current_program_starts_on_year'] = test_starts_on.dt.strftime("%Y")

In [None]:
# Epoch timestamp of the program start date; missing (float) values pass through unchanged.
df_test['current_program_starts_on_timestamp'] = df_test['current_program_starts_on'].apply(to_timestamp)

In [None]:
# Missing durations become '0 недель' so that every value can be parsed,
# then each row's duration is converted by transform_duration.
df_test['program_duration'] = df_test['program_duration'].fillna('0 недель')
df_test = df_test.apply(transform_duration, axis=1)

In [None]:
# Label -> code lookup built from the train-fitted browser encoder.
# Browsers that did not occur in training are mapped to -1 instead of raising.
browser_le_dict = dict(zip(
    browser_le.classes_,
    browser_le.transform(browser_le.classes_),
))
df_test['browser'] = df_test['browser'].apply(
    lambda name: browser_le_dict.get(name, -1)
)

In [None]:
# df_test = df_test.apply(lambda row: age_transform(row), axis=1)

In [None]:
# df_test = df_test.apply(lambda row: replace_country(row), axis=1)

In [None]:
# df_test["country"] = country_le.transform(df_test["country"])

In [None]:
# Summarise columns, non-null counts and dtypes of the test frame.
df_test.info()

In [None]:
# Names of the object-typed (non-numeric) test columns, excluding "target".
# Derived directly from df_test: `mass_object` is only defined in a
# commented-out cell, so referring to it raises NameError on a fresh kernel.
mass_object_v = df_test.select_dtypes(include="object").columns.drop("target", errors="ignore").values

In [None]:
# X_val = df_test.drop(mass_object_v, axis = 1)

In [None]:
# Replace every remaining NaN in the test frame with 0 (same treatment as the training frame).
df_test = df_test.fillna(0)

In [None]:
# Seconds between cart creation and program start (both are time.mktime timestamps).
df_test["time_diff"] = df_test["current_program_starts_on_timestamp"] - df_test["cart_timestamp"]

In [None]:
# Cast the gender column to int.
df_test.gender=df_test.gender.astype(int)

In [None]:
# df_test=df_test.apply(lambda row: price_trans(row), axis=1)

In [2185]:
# Summarise columns, non-null counts and dtypes of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84997 entries, 0 to 84996
Data columns (total 77 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   ABC                                  84997 non-null  int64  
 1   activity                             41481 non-null  float64
 2   age_indicator                        67548 non-null  float64
 3   auto_payment                         84997 non-null  int64  
 4   available_program_starts_on          80233 non-null  object 
 5   avg_hw_mark                          18909 non-null  float64
 6   avg_quiz_result                      6954 non-null   float64
 7   bought_avg_duration                  4277 non-null   float64
 8   bought_d1                            84997 non-null  int64  
 9   bought_d2                            84997 non-null  int64  
 10  bought_d3                            84997 non-null  int64  
 11  bought_d4                   

In [2186]:
# Names of the object-typed (non-numeric) test columns, excluding "target".
# Derived directly from df_test: `mass_object` is only defined in a
# commented-out cell, so referring to it raises NameError on a fresh kernel.
mass_object_v = df_test.select_dtypes(include="object").columns.drop("target", errors="ignore").values

In [2187]:
# X_val = df_test.drop(mass_object_v, axis = 1)

In [2188]:
# Replace every remaining NaN in the test frame with 0 (same treatment as the training frame).
df_test = df_test.fillna(0)

In [2189]:
# Seconds between cart creation and program start (both are time.mktime timestamps).
df_test["time_diff"] = df_test["current_program_starts_on_timestamp"] - df_test["cart_timestamp"]

In [2190]:
# Cast the gender column to int.
df_test.gender=df_test.gender.astype(int)

In [2191]:
# df_test=df_test.apply(lambda row: price_trans(row), axis=1)

In [2192]:
# Select the model inputs using the same feature list (and order) as in training.
X_val=df_test[x_feats]

In [2193]:
# Predict the target for every test row with the fitted forest.
pred_test = clf.predict(X_val)

In [2194]:
# Attach the predictions to the test frame as the "target" column.
df_test["target"] = pred_test

In [2195]:
# Keep only the row id and the predicted target for the output file.
df_test=df_test[["id", "target"]]

In [2196]:
# Write the predictions to CSV without the DataFrame index.
df_test.to_csv("./content/pred.csv", index=False)