# Project Info

Candidate Models:
- [XGBoost](https://xgboost.readthedocs.io/en/stable/python/python_api.html)
- [LightGBM](https://lightgbm.readthedocs.io/en/latest/Python-API.html)
- [CatBoost](https://catboost.ai/)

Criterion: log-loss
Remember we are predicting probabilities.

Colab from the Business Analytics project:  
https://colab.research.google.com/drive/1xr7VrPMDFF9uksnPYwU1CnTB5d5JB-gd

General approach
- Import libraries
- Import data (sample for development)

- Function for determining long-tail
  - If long-tail, function for encoding column into 80:20 categorical
  - If not long-tail, leave as is
- Function for replacing value with relative incidence proportion

- Split the data into N subsets
- Split each of the N subsets into train, validation, and test sets

- Fit a model for each subset
- Implement "voting" for each model ensemble

In [None]:
# import libraries
import os
import random

import numpy as np
import pandas as pd

import sklearn
import lightgbm

# mount the google drive
from google.colab import drive
drive.mount('/content/drive')

# set random state
random_state = 42

Mounted at /content/drive


In [None]:
# import sample data
p = '/content/drive/MyDrive/data'
data_f = "2022.csv"
data_fp = os.path.join(p, data_f)

df = pd.read_csv(data_fp)

NameError: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# # import data and sample the data
# p = '/content/drive/MyDrive/team2/machine_learning'
# data_f = "ProjectTrainingData.csv"
# data_fp = os.path.join(p, data_f)

# sample_size = 100000
# num_records = sum(1 for line in open(data_fp)) - 1 

# skip = sorted(random.sample(range(1,num_records+1),num_records-sample_size))
# df = pd.read_csv(data_fp, skiprows=skip)

KeyboardInterrupt: ignored

In [None]:
# make everything into a string
id_click_list = ["id", "click"]
col_name_list = list(set(df.columns) - set(id_click_list))
df[col_name_list] = df[col_name_list].astype(str)

In [None]:
# confirm data
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10055435465659353593,0,14102100,1005,0,518d6168,92c9e92d,f028772b,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157
1,10059243968861188634,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15701,320,50,1722,0,35,-1,79
2,10109662282079545448,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,1779deee,2347f47a,...,1,0,18936,320,50,1926,2,39,-1,13
3,10177862548969891071,0,14102100,1010,1,85f751fd,c4e18dd6,50e219e0,ffc6ffd0,7801e8d9,...,4,0,21665,320,50,2493,3,35,-1,117
4,10261408033730224183,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,00848fac,2347f47a,...,1,3,18993,320,50,2161,0,35,100148,157


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                100000 non-null  uint64
 1   click             100000 non-null  int64 
 2   hour              100000 non-null  object
 3   C1                100000 non-null  object
 4   banner_pos        100000 non-null  object
 5   site_id           100000 non-null  object
 6   site_domain       100000 non-null  object
 7   site_category     100000 non-null  object
 8   app_id            100000 non-null  object
 9   app_domain        100000 non-null  object
 10  app_category      100000 non-null  object
 11  device_id         100000 non-null  object
 12  device_ip         100000 non-null  object
 13  device_model      100000 non-null  object
 14  device_type       100000 non-null  object
 15  device_conn_type  100000 non-null  object
 16  C14               100000 non-null  obje

# Data Preparation

In [None]:
# check for high cardinality columns
# high cardinality = lots of unique values
# low cardinality = not a lot of unique values
# nunique_series is the number of distinct values expressed as a percentage
# of the row count, so the `> 1` filter below is a 1% cutoff (not 10%)
nunique_series = df.nunique() / len(df) * 100
print(nunique_series)

# columns above the cutoff, plus the "click" column, go on the exclude list
exclude_list = list(nunique_series[nunique_series > 1].index)
exclude_list.append("click")
exclude_list

id                  100.000
click                 0.002
hour                  0.216
C1                    0.007
banner_pos            0.007
site_id               1.477
site_domain           1.360
site_category         0.020
app_id                1.306
app_domain            0.095
app_category          0.020
device_id            17.079
device_ip            77.322
device_model          3.138
device_type           0.005
device_conn_type      0.004
C14                   1.614
C15                   0.008
C16                   0.009
C17                   0.376
C18                   0.004
C19                   0.062
C20                   0.157
C21                   0.055
dtype: float64


['id',
 'site_id',
 'site_domain',
 'app_id',
 'device_id',
 'device_ip',
 'device_model',
 'C14',
 'click']

In [None]:
# function for correcting long-tail
def correct_long_tail(df, col_name, cutoff=.20):
  """Collapse the long tail of a categorical column into one "tail_value" bucket.

  Distinct values are ranked by descending frequency. Walking down that
  ranking, a value is folded into the bucket once BOTH running totals (each
  one including the value itself) exceed their threshold:
    * share of distinct values seen so far  > cutoff
    * share of rows covered so far          > 1 - cutoff
  With the default cutoff of 0.20 this is the 80:20 rule. Values ranked
  within the top `cutoff` share of distinct values are never folded.

  Input:
    df: dataframe (not modified)
    col_name: str of column name
    cutoff: float threshold used for both running totals (default 0.20)

  Output:
    col: series of column values, same index and name as df[col_name], with
         tail values replaced by the string "tail_value"
  """
  column = df[col_name]

  # Relative frequency per distinct value, most frequent first. Kept as a
  # Series so nothing depends on the label value_counts gives its result
  # (pandas >= 2.0 names it "proportion" rather than col_name).
  proportions = column.value_counts(normalize=True)
  if proportions.empty:
    # empty or all-missing column: nothing to rank, nothing to replace
    return column.copy()

  # unique() counts a missing value as one distinct value; value_counts()
  # drops it, so the two lengths can differ by one
  unique_value_cnt = len(column.unique())

  c_value_perc = proportions.cumsum().to_numpy()
  c_distinct_perc = np.arange(1, len(proportions) + 1) / unique_value_cnt

  in_tail = (c_distinct_perc > cutoff) & (c_value_perc > 1 - cutoff)
  # a set gives O(1) membership checks in the per-row replacement below
  tail_values = set(proportions.index[in_tail])

  return column.apply(lambda x: "tail_value" if x in tail_values else x)

c = correct_long_tail(df, "C19")
c.value_counts(normalize = True)

35            0.31377
39            0.22449
tail_value    0.16590
167           0.08069
161           0.04311
47            0.02814
1327          0.02720
297           0.02648
163           0.02119
679           0.02071
175           0.01836
935           0.01569
687           0.01427
Name: C19, dtype: float64

In [None]:
# Keep df's own column order: iterating a set difference yields an order that
# can change between Python sessions (string hash randomization), which would
# reorder the encoded feature columns from run to run.
excluded = set(exclude_list)
col_name_list = [c for c in df.columns if c not in excluded]
print(col_name_list)

# fold each remaining column's long tail into a single "tail_value" category
for col_name in col_name_list:
  print(col_name)
  df[col_name] = correct_long_tail(df, col_name)

['C20', 'C16', 'app_domain', 'C18', 'device_type', 'C19', 'C1', 'app_category', 'device_conn_type', 'site_category', 'banner_pos', 'C15', 'hour', 'C21', 'C17']
C20
C16
app_domain
C18
device_type
C19
C1
app_category
device_conn_type
site_category
banner_pos
C15
hour
C21
C17


In [None]:
!pip install category_encoders
import category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 765 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [None]:
# Weight-of-evidence encode the selected categorical columns against "click".
# `cols` must be passed by keyword: the first positional parameter of
# WOEEncoder is `verbose`, so a positional column list would be bound to the
# verbosity setting instead of selecting columns.
# NOTE(review): the encoder is fit on the full frame, before any
# train/validation/test split, so target information from held-out rows leaks
# into the features — consider fitting on the training rows only.
woe_enc = category_encoders.WOEEncoder(cols=col_name_list)
woe_fit = woe_enc.fit(df[col_name_list], df['click'])
woe_df = woe_fit.transform(df[col_name_list])

In [None]:
# attach the target column to the encoded features
woe_df = pd.concat([woe_df, df['click']], axis=1)

# shuffle all rows using the notebook-wide seed (instead of a second,
# hard-coded copy of it) and renumber the index from 0
woe_df = woe_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

woe_df = woe_df.astype(float)

In [None]:
l = [x / 10 for x in list(range(1, 10, 5))]
l

[0.1, 0.6]

In [None]:
woe_df.head()

Unnamed: 0,C20,C16,app_domain,C18,device_type,C19,C1,app_category,device_conn_type,site_category,banner_pos,C15,hour,C21,C17,click
0,0.162266,-0.084064,0.154317,-0.09079,-0.001799,-0.03466,-0.000359,0.180697,-4.7e-05,0.254595,-0.036027,-0.081666,-0.285839,-0.232263,-0.00239,0.0
1,0.162266,-0.084064,0.154317,-0.09079,-0.001799,-0.03466,-0.000359,0.180697,-4.7e-05,0.254595,-0.036027,-0.081666,-0.180571,-0.024678,-0.744674,0.0
2,0.162266,-0.084064,-0.277088,-0.09079,-0.001799,-0.03466,-0.000359,-0.466584,-4.7e-05,-0.317639,-0.036027,-0.081666,-0.125753,-0.436111,-0.55025,0.0
3,0.01828,-0.084064,0.154317,-0.09079,0.020509,-0.03466,0.003881,0.180697,-4.7e-05,-0.317639,-0.036027,-0.081666,0.242685,-0.024678,0.25732,1.0
4,0.162266,-0.084064,0.154317,0.309664,-0.001799,-0.119495,-0.000359,0.180697,-4.7e-05,0.040748,0.086702,-0.081666,-0.085751,1.199837,0.993235,0.0


In [None]:
woe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   C20               100000 non-null  float64
 1   C16               100000 non-null  float64
 2   app_domain        100000 non-null  float64
 3   C18               100000 non-null  float64
 4   device_type       100000 non-null  float64
 5   C19               100000 non-null  float64
 6   C1                100000 non-null  float64
 7   app_category      100000 non-null  float64
 8   device_conn_type  100000 non-null  float64
 9   site_category     100000 non-null  float64
 10  banner_pos        100000 non-null  float64
 11  C15               100000 non-null  float64
 12  hour              100000 non-null  float64
 13  C21               100000 non-null  float64
 14  C17               100000 non-null  float64
 15  click             100000 non-null  float64
dtypes: float64(16)
memory

## Modeling

In [None]:
train_perc = .6
val_perc = .2

In [None]:
from re import X
# Positional split: the first train_perc of rows is train, the next val_perc
# is validation, the remainder is test. Plain iloc slices are used because
# np.split on a DataFrame goes through DataFrame.swapaxes, which pandas has
# deprecated.
n_rows = len(woe_df)
train_end = int(train_perc * n_rows)
val_end = int((train_perc + val_perc) * n_rows)

train_df = woe_df.iloc[:train_end]
val_df = woe_df.iloc[train_end:val_end]
test_df = woe_df.iloc[val_end:]

# separate the features (X) from the "click" target (y) for each partition
train_X = train_df.drop(["click"], axis=1)
train_y = train_df["click"]

val_X = val_df.drop(["click"], axis=1)
val_y = val_df["click"]

test_X = test_df.drop(["click"], axis=1)
test_y = test_df["click"]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Base LightGBM classifier; the grid below tunes tree depth, number of trees
# and learning rate. n_jobs / random_state are the scikit-learn-API names for
# the thread count and seed.
est = lightgbm.LGBMClassifier(
    objective='binary',
    n_jobs=4,
    random_state=random_state
)

parameters = {
    'max_depth': range(2, 10, 2),
    'n_estimators': range(60, 240, 20),
    'learning_rate': [0.1, 0.01, 0.05]
}

# Built-in scorer: negated log-loss computed from predict_proba, so that
# "greater is better" holds for the search. It replaces a hand-built
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose
# `needs_proba` argument is deprecated in newer scikit-learn releases.
LogLoss = 'neg_log_loss'

clf = GridSearchCV(
    estimator=est,
    param_grid=parameters,
    scoring=LogLoss,
    n_jobs=-1,
    cv=5,
    verbose=True
)

# No `verbose` keyword here: extra fit arguments are forwarded to
# LGBMClassifier.fit, which no longer accepts `verbose` in LightGBM >= 4.0.
# Search progress is still reported through GridSearchCV(verbose=True).
clf.fit(train_X, train_y)

print(clf.best_params_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 120}


In [None]:
# xgboost is used below but is not imported by the setup cell
import xgboost

# NOTE(review): the hyper-parameters are read from `clf`, the grid search
# fitted above on LightGBM (there is no `xgb_clf` object) — confirm these are
# the values intended for XGBoost.
best_params = clf.best_params_

xgb_c = xgboost.XGBClassifier(
    objective='binary:logistic',
    n_jobs=4,
    random_state=random_state,
    max_depth=best_params.get("max_depth"),
    learning_rate=best_params.get("learning_rate"),
    n_estimators=best_params.get("n_estimators")
)

xgb_c.fit(train_X, train_y)

# hard labels are only meaningful for accuracy; ROC AUC and log-loss need the
# predicted probability of the positive class
y_pred = xgb_c.predict(val_X)
y_proba = xgb_c.predict_proba(val_X)[:, 1]

accuracy = accuracy_score(val_y, y_pred)
print(f"Accuracy: {accuracy * 100.0:.2f}%")

roc_auc = roc_auc_score(val_y, y_proba)
print(f"ROC_AUC: {roc_auc * 100.0:.2f}%")

# log-loss is the project's stated selection criterion
val_log_loss = log_loss(val_y, y_proba)
print(f"LogLoss: {val_log_loss:.4f}")

Accuracy: 83.41%
ROC_AUC: 52.55%


In [None]:
num_splits = 10
train_df_list = np.array_split(train_df, num_splits)
val_df_list = np.array_split(val_df, num_splits)
test_df_list = []


6000
2000
2000


In [None]:
# train test split
# Positional split of the raw frame: the first train_perc of rows is train,
# the next val_perc is validation, the remainder is test. The first boundary
# must come from train_perc; df.head() is a DataFrame and cannot be turned
# into a row count.
train_end = int(train_perc * len(df))
val_end = int((train_perc + val_perc) * len(df))

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

In [None]:
# confirm train
train_df.head()
train_df_y=train_df.iloc[:,-1]
df.drop(df.columns[i], axis=1)

0        157
1         79
2         13
3        117
4        157
        ... 
59995     48
59996     42
59997     23
59998    182
59999     79
Name: C21, Length: 60000, dtype: int64

In [None]:
# confirm val
val_df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
60000,15769367607334356398,0,14102610,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,20108,320,50,2299,2,1327,100084,52
60001,15772716576874331805,0,14102610,1005,1,92c7cbe7,6e882918,75fa27f6,ecad2386,7801e8d9,...,1,0,21683,320,50,2496,3,167,-1,23
60002,15780481116374295046,0,14102610,1002,0,ffcb6b9a,fd0f18e8,50e219e0,ecad2386,7801e8d9,...,0,0,5299,320,50,479,3,39,100077,23
60003,15782393117659104918,0,14102610,1005,0,85f751fd,c4e18dd6,50e219e0,92f5800b,ae637522,...,1,3,21191,320,50,2424,1,161,100192,71
60004,1585201452533901602,1,14102610,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,...,1,0,20632,320,50,2374,3,39,-1,23


In [None]:
# confirm test
test_df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
80000,9011359456728644748,0,14102809,1005,1,f0ee33c5,9cf7de2f,f028772b,ecad2386,7801e8d9,...,1,0,22680,320,50,2528,0,39,100075,221
80001,9014069646008620437,0,14102809,1005,1,57ef2c87,bd6d812f,f028772b,ecad2386,7801e8d9,...,1,0,22683,320,50,2528,0,39,-1,221
80002,9018341982126077768,0,14102809,1005,0,85f751fd,c4e18dd6,50e219e0,54c5d545,2347f47a,...,1,2,23144,320,50,2665,0,34,-1,221
80003,907765228224517089,0,14102809,1005,0,85f751fd,c4e18dd6,50e219e0,e9739828,df32afa9,...,1,0,23143,320,50,2665,0,34,100135,221
80004,9105010637968482882,0,14102809,1005,0,85f751fd,c4e18dd6,50e219e0,d36838b1,0e8616ad,...,1,2,23144,320,50,2665,0,34,100111,221


### XGBoost

In [None]:
dtrain = xgb.DMatrix(train_df)
dval = xgb.DMatrix(val_df)

ValueError: ignored

In [None]:
#Set parameter
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
#Set training and validation data
evallist = [(train_df, 'train'), (val_df, 'eval')]
num_round = 10

In [None]:
#Train Model
bst = xgb.train(param, train_df, num_round, evallist)


TypeError: ignored

### LightGBM

### CatBoost

In [None]:
model = CatBoostRegressor(iterations=2,
                          learning_rate=1,
                          depth=2)
# Fit model
model.fit(train_df, v)
# Get predictions
preds = model.predict(eval_data)