In [None]:
!pip install optuna rtdl category_encoders ruamel.yaml einops

In [None]:
from scipy.optimize import fmin
import random
import os
import sys
import pandas as pd
import pickle
import pathlib
import argparse
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from tqdm import tqdm, trange
from typing import Optional, Sequence, Tuple, Union, Any, Dict, List
from copy import deepcopy
import enum
import optuna
import rtdl
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import category_encoders as ce
import ruamel.yaml
import math
from collections import OrderedDict, defaultdict
from sklearn.metrics import roc_auc_score, f1_score, precision_score, accuracy_score, recall_score, roc_auc_score, balanced_accuracy_score, log_loss, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from scipy.spatial import distance_matrix
from scipy.linalg import qr
from einops import rearrange
from xgboost import XGBRegressor, XGBClassifier

In [None]:
def set_global_seed(seed: int) -> None:
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED``, and switches cuDNN to
    deterministic, non-benchmarking mode.

    Args:
        seed: Value handed unchanged to each generator.
    """
    # Exported as a string because environment variables only hold text.
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Deterministic kernels on, auto-tuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_global_seed(42)

In [None]:
# Use the GPU when PyTorch reports one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

## Metrics

In [None]:
class Metric():
    """Callable wrapper around a scoring function with optional threshold tuning.

    Attributes:
        metric: The wrapped scoring callable. It is called as
            ``metric(y_true, y_pred)``, or as ``metric(y_true, y_pred, thr=...)``
            when ``discrete`` is true.
        name: Label returned by ``repr``.
        higher_is_better: Direction of the metric; used by ``opt`` to pick the sign.
        optimize: Whether ``find_threshold`` searches for a threshold.
        discrete: Whether the wrapped callable takes a ``thr`` keyword.
        best_thr: Last threshold found by ``find_threshold`` (starts at 0.5).
    """

    def __init__(self, metric, higher_is_better=True, name='name', optimize=False, discrete=False, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.metric = metric
        self.name = name
        self.higher_is_better = higher_is_better
        self.optimize = optimize
        self.discrete = discrete
        self.best_thr = 0.5

    def __repr__(self):
        return str(self.name)

    def __call__(self, y_true, y_pred, thr=0.5, use_best=False):
        """Score ``y_pred`` against ``y_true``.

        For discrete metrics the cut-off is ``self.best_thr`` when ``use_best``
        is true and ``thr`` otherwise; non-discrete metrics ignore both.
        """
        if not self.discrete:
            return self.metric(y_true, y_pred)
        cutoff = self.best_thr if use_best else thr
        return self.metric(y_true, y_pred, thr=cutoff)

    def find_threshold(self, y_true, y_pred):
        """Search for the threshold that optimises the metric, starting from 0.5.

        Returns 0.5 without searching when ``optimize`` is false; otherwise
        stores the result in ``self.best_thr`` and returns it.
        """
        if not self.optimize:
            return 0.5
        start = [0.5]
        solution = fmin(self.opt, start, args=(y_true, y_pred), disp=0)
        self.best_thr = solution[0]
        return self.best_thr

    def opt(self, w, y_true, y_pred):
        """Objective for ``fmin``: the score at threshold ``w[0]``, sign-adjusted."""
        # (-1) ** True == -1 and (-1) ** False == 1, so metrics where higher is
        # better are negated and the minimiser effectively maximises them.
        sign = (-1) ** (self.higher_is_better)
        return sign * self(y_true, y_pred, w[0])


def f1_custom(y_true, y_pred, thr=0.5):
    """Micro-averaged F1 after binarising ``y_pred`` with a strict ``> thr`` cut."""
    hard_labels = y_pred > thr
    return f1_score(y_true, hard_labels, average='micro')


def f1_macro(y_true, y_pred, thr=0.5):
    """Macro-averaged F1 after binarising ``y_pred`` with a strict ``> thr`` cut."""
    hard_labels = y_pred > thr
    return f1_score(y_true, hard_labels, average='macro')


def acc_score(y_true, y_pred, thr=0.5):
    """Accuracy after binarising ``y_pred`` with a strict ``> thr`` cut."""
    hard_labels = y_pred > thr
    return accuracy_score(y_true, hard_labels)


def bacc_score(y_true, y_pred, thr=0.5):
    """Balanced accuracy after binarising ``y_pred`` with a strict ``> thr`` cut."""
    hard_labels = y_pred > thr
    return balanced_accuracy_score(y_true, hard_labels)


class MetricFactory:
    """Name -> ``Metric`` registry pre-populated with the metrics used in this notebook.

    Look-ups via ``factory[name]`` return a deep copy, so state stored on a
    returned metric (such as ``best_thr``) does not affect the registry.
    """

    def __init__(self, ):
        self.metrics = {
            'auc': Metric(metric=roc_auc_score, higher_is_better=True, name='auc', optimize=False, discrete=False),
            'log-loss': Metric(metric=log_loss, higher_is_better=False, name='log-loss', optimize=False,
                              discrete=False),
            'f1': Metric(metric=f1_custom, higher_is_better=True, name='f1', optimize=True, discrete=True),
            'f1-macro': Metric(metric=f1_macro, higher_is_better=True, name='f1_macro', optimize=True, discrete=True),
            'balanced-acc': Metric(metric=bacc_score, higher_is_better=True, name='balanced-acc', optimize=True,
                                   discrete=True),
            'acc': Metric(metric=acc_score, higher_is_better=True, name='acc', optimize=True, discrete=True),
            'mse': Metric(metric=mean_squared_error, higher_is_better=False, name='mse', optimize=False, discrete=False),
            'r2': Metric(metric=r2_score, higher_is_better=True, name='r2', optimize=False, discrete=False),
            'mae': Metric(metric=mean_absolute_error, higher_is_better=False, name='mae', optimize=False, discrete=False)
        }

    def get_allowed(self):
        """Return the registered metric names in sorted order."""
        return sorted(self.metrics)

    def add(self, metric_name, metric_class):
        """Register (or overwrite) ``metric_name``; returns ``self`` for chaining."""
        self.metrics[metric_name] = metric_class
        return self

    def remove(self, metric_name):
        """Unregister ``metric_name``; returns ``self`` for chaining.

        Raises:
            KeyError: If ``metric_name`` is not registered.
        """
        # Fixed: this used to delete from ``self.models``, an attribute that is
        # never created, so every call raised AttributeError.
        del self.metrics[metric_name]
        return self

    def __getitem__(self, metric_name):
        """Return an independent deep copy of the metric registered under ``metric_name``."""
        return deepcopy(self.metrics[metric_name])


In [None]:
def compute_metrics(
    outputs,
    targets,
    factory=None,
) -> Dict[str, float]:
    """Compute RMSE, MAE and R2 for a set of regression predictions.

    Args:
        outputs: Predicted values (passed to each metric as ``y_pred``).
        targets: Ground-truth values (passed to each metric as ``y_true``).
        factory: Optional mapping that provides the ``'mse'``, ``'mae'`` and
            ``'r2'`` metric callables. When omitted, the notebook-level
            ``metric_factory`` is used if one exists; otherwise a fresh
            ``MetricFactory()`` is created.

    Returns:
        Dict with keys ``'rmse'``, ``'mae'`` and ``'r2'`` (in that order).
    """
    if factory is None:
        try:
            # Notebook-level instance, if some cell created one.
            factory = metric_factory
        except NameError:
            # No cell in view defines ``metric_factory``; build the default
            # registry so the function also works on a fresh kernel.
            factory = MetricFactory()

    y_true, y_pred = targets, outputs

    # Local names deliberately differ from the imported sklearn functions
    # (the previous local ``r2_score`` shadowed the import inside this function).
    mse_value = factory['mse'](y_true, y_pred)
    mae_value = factory['mae'](y_true, y_pred)
    r2_value = factory['r2'](y_true, y_pred)

    return {
        'rmse': np.sqrt(mse_value),
        'mae': mae_value,
        'r2': r2_value,
    }

## Data

### NYC Taxi

In [None]:
from sklearn.datasets import fetch_openml
# Load the frame for OpenML dataset id 42729.
df_nt = fetch_openml(data_id=42729, as_frame=True, parser='auto').frame
# Keep rows with tip_amount <= 20. The explicit .copy() makes the filtered
# result an independent frame, so the column assignments below do not raise
# pandas' SettingWithCopyWarning.
df_nt = df_nt[df_nt['tip_amount'] <= 20].copy()
# Columns standardised as numeric features.
nf_nt = ['PULocationID', 'DOLocationID', 'passenger_count', 'tolls_amount', 'total_amount',
         'lpep_pickup_datetime_day', 'lpep_pickup_datetime_hour', 'lpep_pickup_datetime_minute',
         'lpep_dropoff_datetime_day', 'lpep_dropoff_datetime_hour', 'lpep_dropoff_datetime_minute']
# Columns one-hot encoded as categorical features.
cf_nt = ['VendorID', 'store_and_fwd_flag', 'RatecodeID', 'extra', 'mta_tax',
         'improvement_surcharge', 'trip_type']
# NOTE(review): scaler and encoder are fitted on the full frame, before any
# train/validation split - confirm this is intended.
scaler = StandardScaler()
df_nt[nf_nt] = scaler.fit_transform(df_nt[nf_nt])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_nt[cf_nt])
# Reuse df_nt's index so the concat below aligns row by row.
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out(), index=df_nt.index)
df_nt = df_nt.drop(cf_nt, axis=1)
df_nt = pd.concat([df_nt, tdf], axis=1)
df_nt_target_name = 'tip_amount'
df_nt.shape

### Colleges

In [None]:
# Load the frame for OpenML dataset id 42727.
df_cl = fetch_openml(data_id=42727, as_frame=True).frame
# Impute missing numeric values with the column median. numeric_only=True is
# required: this frame has non-numeric columns (they are label/one-hot encoded
# below) and DataFrame.median() raises TypeError on them in pandas >= 2.0.
df_cl = df_cl.fillna(df_cl.median(numeric_only=True))
# Columns standardised as numeric features (includes the label-encoded ones).
nf_cl = ['city', 'state', 'zip', 'latitude', 'longitude', 'admission_rate',
         'sat_verbal_midrange', 'sat_math_midrange', 'sat_writing_midrange', 'act_combined_midrange',
         'act_english_midrange', 'act_math_midrange', 'act_writing_midrange', 'sat_total_average', 'undergrad_size',
         'percent_white', 'percent_black', 'percent_hispanic', 'percent_asian', 'percent_part_time',
         'average_cost_academic_year', 'average_cost_program_year', 'tuition_(instate)', 'tuition_(out_of_state)',
         'spend_per_student', 'faculty_salary', 'percent_part_time_faculty', 'completion_rate', 'percent_female',
         'agege24', 'faminc', 'mean_earnings_6_years', 'median_earnings_6_years', 'mean_earnings_10_years',
         'median_earnings_10_years', 'carnegie_basic_classification', 'carnegie_undergraduate', 'carnegie_size',
         'religious_affiliation', ]
# Columns one-hot encoded as categorical features.
cf_cl = ['predominant_degree', 'highest_degree', 'ownership', 'region', 'gender']
# Label-encode these columns to integer codes; they are scaled with nf_cl below.
le = LabelEncoder()
for label_col in ['city', 'state', 'zip', 'carnegie_basic_classification',
                  'carnegie_undergraduate', 'carnegie_size', 'religious_affiliation']:
    df_cl[label_col] = le.fit_transform(df_cl[label_col])
scaler = StandardScaler()
df_cl[nf_cl] = scaler.fit_transform(df_cl[nf_cl])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_cl[cf_cl])
# Reuse df_cl's index so the concat below aligns row by row.
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out(), index=df_cl.index)
df_cl = df_cl.drop(cf_cl, axis=1)
df_cl = pd.concat([df_cl, tdf], axis=1)
df_cl_target_name = 'percent_pell_grant'
df_cl.shape

### House sales

In [None]:
# Load the frame for OpenML dataset id 42731.
df_hs = fetch_openml(data_id=42731, as_frame=True).frame
# Median imputation restricted to numeric columns, so the call cannot fail on
# a non-numeric column under pandas >= 2.0.
df_hs = df_hs.fillna(df_hs.median(numeric_only=True))
# Keep rows with price <= 3,000,000; .copy() makes the filtered frame
# independent so the assignments below do not raise SettingWithCopyWarning.
df_hs = df_hs[df_hs['price'] <= 3000000].copy()
# Columns standardised as numeric features.
nf_hs = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built',
         'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_year', 'date_month', 'date_day']
# Columns one-hot encoded as categorical features.
cf_hs = ['floors', 'waterfront', 'view', 'condition', 'grade']
le = LabelEncoder()
# zipcode is mapped to integer codes and then scaled with the numeric features.
df_hs['zipcode'] = le.fit_transform(df_hs['zipcode'])
scaler = StandardScaler()
df_hs[nf_hs] = scaler.fit_transform(df_hs[nf_hs])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_hs[cf_hs])
# Reuse df_hs's index so the concat below aligns row by row.
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out(), index=df_hs.index)
df_hs = df_hs.drop(cf_hs, axis=1)
df_hs = pd.concat([df_hs, tdf], axis=1)
df_hs_target_name = 'price'
df_hs.shape

### Black friday

In [None]:
# Load the frame for OpenML dataset id 41540.
df_bf = fetch_openml(data_id=41540, as_frame=True).frame
# Median imputation restricted to numeric columns: DataFrame.median() raises
# TypeError on non-numeric columns in pandas >= 2.0.
df_bf = df_bf.fillna(df_bf.median(numeric_only=True))
# Columns standardised as numeric features.
nf_bf = ['Occupation', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']
# Columns one-hot encoded as categorical features.
cf_bf = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status']
# Not used in this cell; kept because later cells read the notebook-level `le`.
le = LabelEncoder()
scaler = StandardScaler()
df_bf[nf_bf] = scaler.fit_transform(df_bf[nf_bf])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_bf[cf_bf])
# Reuse df_bf's index so the concat below aligns row by row.
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out(), index=df_bf.index)
df_bf = df_bf.drop(cf_bf, axis=1)
df_bf = pd.concat([df_bf, tdf], axis=1)
df_bf_target_name = 'Purchase'
df_bf.shape

### Beijing PM2.5

In [None]:
df_bp = pd.read_csv('./data/PRSA_data_2010.1.1-2014.12.31.csv')
# 'No' is dropped before any feature is built.
df_bp = df_bp.drop(['No'], axis=1)
# Median imputation restricted to numeric columns: 'cbwd' is label-encoded
# below, and DataFrame.median() raises TypeError on non-numeric columns in
# pandas >= 2.0.
# NOTE(review): this also imputes missing values of the target 'pm2.5' -
# confirm that is intended.
df_bp = df_bp.fillna(df_bp.median(numeric_only=True))
# Keep rows with pm2.5 <= 600; .copy() makes the filtered frame independent so
# the assignments below do not raise SettingWithCopyWarning.
df_bp = df_bp[df_bp['pm2.5'] <= 600].copy()
# Columns standardised as numeric features.
nf_bp = ['month', 'day', 'hour', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir']
# Columns label-encoded and then one-hot encoded.
cf_bp = ['year', 'cbwd']
le = LabelEncoder()
for label_col in cf_bp:
    df_bp[label_col] = le.fit_transform(df_bp[label_col])
scaler = StandardScaler()
df_bp[nf_bp] = scaler.fit_transform(df_bp[nf_bp])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_bp[cf_bp])
# Reuse df_bp's index so the concat below aligns row by row.
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out(), index=df_bp.index)
df_bp = df_bp.drop(cf_bp, axis=1)
df_bp = pd.concat([df_bp, tdf], axis=1)
df_bp_target_name = 'pm2.5'
df_bp.shape

### Brazilian houses

In [None]:
# Load the frame for OpenML dataset id 42688.
df_bh = fetch_openml(data_id=42688, as_frame=True).frame
# Median imputation restricted to numeric columns: city/animal/furniture are
# label-encoded below, and DataFrame.median() raises TypeError on non-numeric
# columns in pandas >= 2.0.
df_bh = df_bh.fillna(df_bh.median(numeric_only=True))
# Keep rows with total_(BRL) <= 40000; .copy() makes the filtered frame
# independent so the assignments below do not raise SettingWithCopyWarning.
df_bh = df_bh[df_bh['total_(BRL)'] <= 40000].copy()
# Columns standardised as numeric features.
nf_bh = ['area', 'rooms', 'bathroom', 'parking_spaces', 'floor', 'hoa_(BRL)', 'rent_amount_(BRL)',
         'property_tax_(BRL)', 'fire_insurance_(BRL)']
# Columns label-encoded and then one-hot encoded.
cf_bh = ['city', 'animal', 'furniture']
le = LabelEncoder()
scaler = StandardScaler()
for label_col in cf_bh:
    df_bh[label_col] = le.fit_transform(df_bh[label_col])
df_bh[nf_bh] = scaler.fit_transform(df_bh[nf_bh])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_bh[cf_bh])
# Reuse df_bh's index so the concat below aligns row by row.
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out(), index=df_bh.index)
df_bh = df_bh.drop(cf_bh, axis=1)
df_bh = pd.concat([df_bh, tdf], axis=1)
df_bh_target_name = 'total_(BRL)'
df_bh.shape

### Give Me Some Credit

In [None]:
# Read the CSV and discard the 'Unnamed: 0' column.
df_gsc = pd.read_csv('./data/gsc-training.csv').drop(['Unnamed: 0'], axis=1)
# Fill missing values with the per-column median.
df_gsc = df_gsc.fillna(df_gsc.median())

# Feature groups: nf_* columns are standardised, cf_* columns are label-encoded.
nf_gsc = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'MonthlyIncome', 'age']
cf_gsc = ['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

scaler = StandardScaler()
df_gsc[nf_gsc] = scaler.fit_transform(df_gsc[nf_gsc])

# Encode each categorical column and record how many distinct codes it has.
le = LabelEncoder()
cc_gsc = []
for cf_name in cf_gsc:
    df_gsc[cf_name] = le.fit_transform(df_gsc[cf_name])
    n_levels = len(np.unique(df_gsc[cf_name]))
    cc_gsc.append(n_levels)

# Move the categorical columns to the end of the frame.
tdf = df_gsc[cf_gsc]
df_gsc = pd.concat([df_gsc.drop(cf_gsc, axis=1), tdf], axis=1)
df_gsc_target_name = 'SeriousDlqin2yrs'
df_gsc.shape

### Churn Modelling

In [None]:
df_cm = pd.read_csv('./data/Churn_Modelling.csv')
# Identifier-like columns are dropped before any feature is built.
df_cm = df_cm.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
# Median imputation restricted to numeric columns: the frame still holds
# columns that are label-encoded below, and DataFrame.median() raises
# TypeError on non-numeric columns in pandas >= 2.0.
df_cm = df_cm.fillna(df_cm.median(numeric_only=True))

scaler = StandardScaler()
# Columns standardised as numeric features.
nf_cm = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
# Columns label-encoded as categorical features.
cf_cm = ['Geography', 'Gender', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']
df_cm[nf_cm] = scaler.fit_transform(df_cm[nf_cm])
le = LabelEncoder()
# cc_cm[i] is the number of distinct codes in cf_cm[i].
cc_cm = []
for cf_name in cf_cm:
    df_cm[cf_name] = le.fit_transform(df_cm[cf_name])
    cc_cm.append(len(np.unique(df_cm[cf_name])))
# Move the categorical columns to the end of the frame.
tdf = df_cm[cf_cm]
df_cm = df_cm.drop(cf_cm, axis=1)
df_cm = pd.concat([df_cm, tdf], axis=1)
df_cm_target_name = 'Exited'
df_cm.shape

### Vehicle Loan Default

In [None]:
df_vld = pd.read_csv('./data/vehicle_loan_default_train.csv')
df_vld = df_vld.drop(['UNIQUEID', 'EMPLOYEE_CODE_ID', 'MOBILENO_AVL_FLAG'], axis=1)
# Median imputation restricted to numeric columns: at this point the frame
# still holds raw date strings and other text columns, and DataFrame.median()
# raises TypeError on them in pandas >= 2.0.
df_vld = df_vld.fillna(df_vld.median(numeric_only=True))

le = LabelEncoder()
# These two columns are mapped to integer codes and later scaled with nf_vld.
for label_col in ['AVERAGE_ACCT_AGE', 'CREDIT_HISTORY_LENGTH']:
    df_vld[label_col] = le.fit_transform(df_vld[label_col])

# Expand each date column into <col>_d / <col>_m / <col>_y, then drop the raw
# column. Column creation order matches the original cell.
for date_col in ['DATE_OF_BIRTH', 'DISBURSAL_DATE']:
    parsed_dates = pd.to_datetime(df_vld[date_col], format='%d-%m-%Y')
    df_vld[date_col + '_d'] = parsed_dates.dt.day
    df_vld[date_col + '_m'] = parsed_dates.dt.month
    df_vld[date_col + '_y'] = parsed_dates.dt.year
    df_vld = df_vld.drop([date_col], axis=1)

# Columns standardised as numeric features.
nf_vld = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'BRANCH_ID', 'SUPPLIER_ID', 'MANUFACTURER_ID', 'CURRENT_PINCODE_ID',
          'PERFORM_CNS_SCORE', 'PRI_CURRENT_BALANCE', 'PRI_SANCTIONED_AMOUNT', 'PRI_DISBURSED_AMOUNT', 'SEC_CURRENT_BALANCE',
          'SEC_SANCTIONED_AMOUNT', 'SEC_DISBURSED_AMOUNT', 'PRIMARY_INSTAL_AMT', 'SEC_INSTAL_AMT', 'NO_OF_INQUIRIES',
          'PRI_NO_OF_ACCTS', 'PRI_ACTIVE_ACCTS', 'PRI_OVERDUE_ACCTS', 'SEC_NO_OF_ACCTS', 'SEC_ACTIVE_ACCTS',
          'SEC_OVERDUE_ACCTS', 'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS',
          'AVERAGE_ACCT_AGE', 'CREDIT_HISTORY_LENGTH', 'DATE_OF_BIRTH_d', 'DATE_OF_BIRTH_m', 'DATE_OF_BIRTH_y',
          'DISBURSAL_DATE_d', 'DISBURSAL_DATE_m', 'DISBURSAL_DATE_y']
# Columns label-encoded as categorical features.
cf_vld = ['EMPLOYMENT_TYPE', 'STATE_ID', 'AADHAR_FLAG', 'PAN_FLAG', 'VOTERID_FLAG', 'DRIVING_FLAG',
          'PASSPORT_FLAG', 'PERFORM_CNS_SCORE_DESCRIPTION']
scaler = StandardScaler()
df_vld[nf_vld] = scaler.fit_transform(df_vld[nf_vld])

# cc_vld[i] is the number of distinct codes in cf_vld[i].
cc_vld = []
for cf_name in cf_vld:
    df_vld[cf_name] = le.fit_transform(df_vld[cf_name])
    cc_vld.append(len(np.unique(df_vld[cf_name])))

# Move the categorical columns to the end of the frame.
tdf = df_vld[cf_vld]
df_vld = df_vld.drop(cf_vld, axis=1)
df_vld = pd.concat([df_vld, tdf], axis=1)
df_vld_target_name = 'LOAN_DEFAULT'
df_vld.shape

### Adult Income Dataset

In [None]:
df_ai = pd.read_csv('./data/adult.csv')
# Map the two income labels to 0 / 1.
df_ai = df_ai.replace('<=50K', 0)
df_ai = df_ai.replace('>50K', 1)
# Median imputation restricted to numeric columns: the frame holds text
# columns (encoded below), and DataFrame.median() raises TypeError on them in
# pandas >= 2.0.
df_ai = df_ai.fillna(df_ai.median(numeric_only=True))
# Columns standardised as numeric features.
nf_ai = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
# Columns label-encoded as categorical features.
cf_ai = ['workclass', 'education', 'educational-num', 'marital-status', 'occupation', 'relationship', 'race', 'gender',
         'native-country']
scaler = StandardScaler()
df_ai[nf_ai] = scaler.fit_transform(df_ai[nf_ai])

# Create the encoder here: the cell previously relied on an `le` left over
# from an earlier cell and failed with NameError when run on its own.
le = LabelEncoder()
# cc_ai[i] is the number of distinct codes in cf_ai[i].
cc_ai = []
for cf_name in cf_ai:
    df_ai[cf_name] = le.fit_transform(df_ai[cf_name])
    cc_ai.append(len(np.unique(df_ai[cf_name])))

# Move the categorical columns to the end of the frame.
tdf = df_ai[cf_ai]
df_ai = df_ai.drop(cf_ai, axis=1)
df_ai = pd.concat([df_ai, tdf], axis=1)
df_ai_target_name = 'income'
df_ai.shape

### HELOC

In [None]:
df_heloc = pd.read_csv('./data/RiskData.csv')
# Cells holding a bare '.' are replaced with 0.
df_heloc = df_heloc.replace('.', 0)
df_heloc = df_heloc.drop(['Sampling_Weight'], axis=1)

# Columns standardised as numeric features.
nf_heloc = ['Interest_Revenue', 'Application_Date', 'Age', 'Income', 'Debt_Ratio', 'Loan_Amt',
            'Loan_Amt_Req', 'LTV', 'FICO_Score', 'Prior_Custom_Score', 'Current_Custom_Score', 'CB_Age_Oldest_TL',
            'CB_Age_Newest_TL', 'CB_Avg_Mos_File', 'CB_Nb_Sat_TL', 'CB_Pct_Sat_TL', 'CB_Mos_Since_Dlq', 'CB_Max_Dlq_12_Mos',
            'CB_Max_Dlq_Ever', 'CB_Nb_Total_TL', 'CB_Nb_TL_Open_12', 'CB_Pct_IL_TL', 'CB_Nb_Inq_6_Mos',
            'CB_Nb_Inq_6_Mos_excl_7_Days', 'CB_Rev_Util', 'CB_IL_Util', 'CB_Nb_Rev_TL_w_Bal', 'CB_Nb_IL_TL_w_Bal',
            'CB_Nb_Rev_Tl_75_Pct_Limit', 'CB_Pct_TL_w_Bal']
# Columns label-encoded as categorical features.
cf_heloc = ['Nb_Borrowers', 'Region', 'Bank_Relationship', 'CB_Nb_60_Plus_TL', 'CB_Nb_90_Plus_TL']
scaler = StandardScaler()
# Cast the numeric features to float BEFORE imputing. The median was
# previously computed while these columns could still be non-numeric, which
# raises TypeError in pandas >= 2.0.
df_heloc[nf_heloc] = df_heloc[nf_heloc].astype(float)
df_heloc = df_heloc.fillna(df_heloc.median(numeric_only=True))
df_heloc[cf_heloc] = df_heloc[cf_heloc].astype(str)

df_heloc[nf_heloc] = scaler.fit_transform(df_heloc[nf_heloc])

# Create the encoder here: the cell previously relied on an `le` left over
# from an earlier cell and failed with NameError when run on its own.
le = LabelEncoder()
# cc_heloc[i] is the number of distinct codes in cf_heloc[i].
cc_heloc = []
for cf_name in cf_heloc:
    df_heloc[cf_name] = le.fit_transform(df_heloc[cf_name])
    cc_heloc.append(len(np.unique(df_heloc[cf_name])))

# Move the categorical columns to the end of the frame.
tdf = df_heloc[cf_heloc]
df_heloc = df_heloc.drop(cf_heloc, axis=1)
df_heloc = pd.concat([df_heloc, tdf], axis=1)
df_heloc_target_name = 'Risk_Flag'
df_heloc.shape

### Fraud Ecomm

In [None]:
from sklearn.preprocessing import LabelEncoder


df_fe = pd.read_csv('./data/fraud_ecomm.csv')
ip = pd.read_csv('./data/IpAddress_to_Country.csv')

# Expand each timestamp column into calendar/clock parts, then drop the raw
# column. Column names and creation order match the original cell.
for time_col in ['signup_time', 'purchase_time']:
    stamps = pd.to_datetime(df_fe[time_col], format='%Y-%m-%d %H:%M:%S')
    df_fe[time_col + '_y'] = stamps.dt.year
    df_fe[time_col + '_mon'] = stamps.dt.month
    # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
    # documented replacement. It returns a nullable UInt32 column, so cast to
    # float (the column is standardised later in this notebook anyway).
    df_fe[time_col + '_w'] = stamps.dt.isocalendar().week.astype('float64')
    df_fe[time_col + '_d'] = stamps.dt.day
    df_fe[time_col + '_h'] = stamps.dt.hour
    df_fe[time_col + '_m'] = stamps.dt.minute
    df_fe[time_col + '_s'] = stamps.dt.second
    df_fe[time_col + '_wd'] = stamps.dt.dayofweek
    df_fe = df_fe.drop([time_col], axis=1)
# Map the listed df_fe columns to integer codes (one fit per column, same
# encoder object reused), then do the same for the country names in `ip`.
le = LabelEncoder()
for label_col in ('device_id', 'source', 'browser', 'sex', 'age'):
    df_fe[label_col] = le.fit_transform(df_fe[label_col])
ip['country'] = le.fit_transform(ip['country'])

# Compare IP values as floats on both sides of the range look-up.
for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    ip[bound_col] = ip[bound_col].astype('float')
df_fe['ip_address'] = df_fe['ip_address'].astype('float')
def ip_to_country(ip_val):
    """Return the encoded country of the first `ip` range that contains ``ip_val``.

    Reads the notebook-level ``ip`` frame. Returns -1 when no range matches.
    """
    # NOTE(review): both comparisons are strict, so an address exactly equal
    # to a range bound matches nothing - confirm the bounds are exclusive.
    in_range = (ip.lower_bound_ip_address < ip_val) & (ip.upper_bound_ip_address > ip_val)
    hits = ip.country[in_range]
    if len(hits) == 0:
        return -1
    return hits.iloc[0]
# Row-wise country look-up for every address.
df_fe['ip_country'] = df_fe['ip_address'].apply(ip_to_country)

# Count how many rows share each device_id and attach it as 'freq_device'.
device_duplicates = (
    df_fe.groupby(by="device_id").device_id.count()
    .rename("freq_device")
    .reset_index()
)
df_fe = df_fe.merge(device_duplicates, on="device_id")

df_fe = df_fe.drop(['user_id'], axis=1)
# Columns standardised as numeric features.
nf_fe = ['purchase_value', 'ip_address', 'device_id', 'signup_time_y', 'signup_time_mon',
         'signup_time_w', 'signup_time_d', 'signup_time_h', 'signup_time_m',
         'signup_time_s', 'signup_time_wd', 'purchase_time_y',
         'purchase_time_mon', 'purchase_time_w', 'purchase_time_d',
         'purchase_time_h', 'purchase_time_m', 'purchase_time_s',
         'purchase_time_wd', 'age', 'ip_country', 'freq_device']
# Columns label-encoded as categorical features.
cf_fe = ['source', 'browser', 'sex']

scaler = StandardScaler()
df_fe[nf_fe] = scaler.fit_transform(df_fe[nf_fe])

# cc_fe[i] is the number of distinct codes in cf_fe[i].
cc_fe = []
for cf_name in cf_fe:
    df_fe[cf_name] = le.fit_transform(df_fe[cf_name])
    n_levels = len(np.unique(df_fe[cf_name]))
    cc_fe.append(n_levels)

# Move the categorical columns to the end of the frame.
tdf = df_fe[cf_fe]
tdf.index = df_fe.index
df_fe = pd.concat([df_fe.drop(cf_fe, axis=1), tdf], axis=1)

df_fe_target_name = 'class'

df_fe.shape

### HPO

In [None]:
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

def objective_c(trial):
    """Optuna objective for classification: validation ROC AUC of an XGBClassifier.

    Samples one hyperparameter configuration from ``trial``, fits on the
    notebook-level ``X_train_t`` / ``y_train_t`` and scores on
    ``X_val_t`` / ``y_val_t``.

    Returns:
        ROC AUC on the validation split (to be maximised by the study).
    """
    params = {
        "n_estimators": trial.suggest_categorical("n", [100, 200, 500, 1000]),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("lr", 1e-3, 1, log=True),
        "gamma": trial.suggest_float("gamma", 1e-2, 1),
        "subsample": trial.suggest_float("subsample", 1e-2, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 1e-2, 1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1)
    }

    model = XGBClassifier(**params)
    model.fit(X_train_t, y_train_t)
    # AUC is a ranking metric and needs continuous scores. Scoring the hard 0/1
    # output of predict() collapses the ROC curve to a single operating point,
    # so use the positive-class probability instead.
    # Assumes a binary target whose positive class is column 1 - TODO confirm.
    positive_proba = model.predict_proba(X_val_t)[:, 1]
    return roc_auc_score(y_val_t, positive_proba)
     

In [None]:
# Tune the classifier on the vehicle-loan frame. objective_c reads the four
# split variables below from the notebook namespace.
X = df_vld.drop([df_vld_target_name], axis=1)
y = df_vld[df_vld_target_name]
# Fixed random_state / sampler seed (same value as set_global_seed above) so
# the split and the TPE search are repeatable regardless of how much random
# state earlier cells consumed.
X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_c, n_trials=100)
print(study.best_trial.params)
print(study.best_value)

In [None]:
def objective_r(trial):
    """Optuna objective for regression: validation RMSE of an XGBRegressor.

    Samples one hyperparameter configuration from ``trial``, fits on the
    notebook-level ``X_train_t`` / ``y_train_t`` and scores on
    ``X_val_t`` / ``y_val_t``.

    Returns:
        Root mean squared error on the validation split (to be minimised).
    """
    params = {
        "n_estimators": trial.suggest_categorical("n", [5, 10, 50, 100, 200, 500]),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("lr", 1e-3, 1, log=True),
        "gamma": trial.suggest_float("gamma", 1e-2, 1),
        "subsample": trial.suggest_float("subsample", 1e-2, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 1e-2, 1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1)
    }

    model = XGBRegressor(**params)
    model.fit(X_train_t, y_train_t)
    pred = model.predict(X_val_t)
    # Take the square root explicitly: the ``squared=False`` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn
    # releases, while sqrt(MSE) works on every version.
    return float(np.sqrt(mean_squared_error(y_val_t, pred)))
     

In [None]:
# Tune the regressor on the Brazilian-houses frame. objective_r reads the four
# split variables below from the notebook namespace.
X = df_bh.drop([df_bh_target_name], axis=1)
y = df_bh[df_bh_target_name]
# Fixed random_state / sampler seed (same value as set_global_seed above) so
# the split and the TPE search are repeatable regardless of how much random
# state earlier cells consumed.
X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective_r, n_trials=100)
print(study.best_trial.params)
print(study.best_value)