In [1]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from ipywidgets import widgets, interactive
# Notebook-wide settings: silence every warning, pick the plot theme,
# and let pandas render all columns of wide frames.
warnings.simplefilter("ignore")
plt.style.use('fivethirtyeight')
pd.options.display.max_columns = None
# pd.set_option('display.max_colwidth', None)

In [2]:
# Read the cleaned train / test CSV files from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_attacker_2022_cleaned.csv', 'test_attacker_2022_cleaned.csv')
)

# Report (rows, columns) of the training frame.
print(train.shape)

(47459, 20)


In [3]:
# Hold out a fixed-size, seeded random sample as the validation set;
# every row that was not sampled stays in `train`.
# NOTE: `train` is rebound here, so re-running this cell without reloading
# the data first would carve a second sample out of the already reduced frame.
validation = train.sample(n=17459, random_state=44)
train = train.loc[~train.index.isin(validation.index)]

for split in (train, validation):
    print(split.shape)

(30000, 20)
(17459, 20)


In [4]:
# Keep only the object-dtype (categorical) columns of each split.
train_categorical, validation_categorical, test_categorical = (
    split.select_dtypes(include='object')
    for split in (train, validation, test)
)

- Version 1: One-hot encoding of the categorical columns, followed by PCA

In [5]:
# One hot encoding
from sklearn.preprocessing import OneHotEncoder

# The encoder is fitted on the training split only and then reused for
# validation and test, so all three frames share the same columns.
# The dense-output keyword was renamed from `sparse` to `sparse_output`
# (scikit-learn 1.2) and the old name was later removed; to stay compatible
# with both spellings the encoder keeps its default sparse output and the
# result is densified with `.toarray()` instead.
# With handle_unknown='ignore', categories unseen during fit are encoded as
# all zeros, i.e. the same row as the dropped first category.
OHE = OneHotEncoder(drop='first', handle_unknown='ignore')
OHE.fit(train_categorical)

train_OHE, validation_OHE, test_OHE = (
    pd.DataFrame(
        OHE.transform(categorical).toarray(),
        columns=OHE.get_feature_names_out(),
        index=categorical.index,  # keep the original row labels for the later index merge
    )
    for categorical in (train_categorical, validation_categorical, test_categorical)
)

In [6]:
# Preview the one-hot encoded training frame (bare expression -> notebook display).
# NOTE(review): `train_OHE.head()` would keep the saved output smaller.
train_OHE

Unnamed: 0,cat_3_M,cat_3_S,cat_3_S1,cat_6_S,cat_6_S1,location_id_M,location_id_M1,location_id_S,location_id_S1,cat_8_M,cat_8_M1,cat_8_S,cat_8_S1,cat_10_M,cat_10_M1,cat_10_S,cat_10_S1,cat_11_M,cat_11_M1,cat_11_S,cat_11_S1,com_type_M,com_type_S,com_type_S1,cat_12_M,cat_12_M1,cat_12_S,cat_12_S1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# PCA
from sklearn.decomposition import PCA

# Single source of truth for the number of components and the column labels
# derived from it (pca_1 ... pca_N).
N_PCA_COMPONENTS = 8
pca_columns = [f'pca_{i}' for i in range(1, N_PCA_COMPONENTS + 1)]

# Fit on the training split only. `random_state` is fixed because, depending
# on the scikit-learn version, svd_solver='auto' may select the randomized
# solver for a tall, narrow matrix like this one; without a seed the
# components would then differ slightly from run to run.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=44)
pca.fit(train_OHE)

# Project every split with the same fitted PCA, keeping the row index so the
# components can be merged back onto the raw data by index.
train_PCA, validation_PCA, test_PCA = (
    pd.DataFrame(pca.transform(encoded), index=encoded.index, columns=pca_columns)
    for encoded in (train_OHE, validation_OHE, test_OHE)
)

In [8]:
# Preview the PCA components computed for the training split.
train_PCA

Unnamed: 0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8
1,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
2,1.090086,-0.092842,-0.307300,0.241743,-0.087208,-0.075398,0.087557,-0.147207
3,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
5,0.353313,-0.081273,-0.317192,0.116419,0.009005,-0.101555,-0.019536,-0.259802
6,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
...,...,...,...,...,...,...,...,...
47449,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
47450,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
47453,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
47457,1.082780,-0.094122,-0.307918,0.100025,-0.033767,-0.092285,-0.047507,-0.128273


In [9]:
# Version 1 frames: the non-object columns of each split joined (by row
# index) with that split's PCA components.
train_v1, validation_v1, test_v1 = (
    pd.merge(
        raw.select_dtypes(exclude='object'),
        components,
        left_index=True,
        right_index=True,
    )
    for raw, components in (
        (train, train_PCA),
        (validation, validation_PCA),
        (test, test_PCA),
    )
)

In [10]:
# Preview the version-1 training frame (numeric columns + PCA components).
train_v1

Unnamed: 0,label,value,num_date_review,unknown_var_7,unknown_var_8,unknown_var_9,unknown_var_13,unknown_var_17,social_friend_count,delta_time,delta_date,age,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8
1,0,4500000.0,11.0,84.0,55.154329,45.0,0.62,2164.0,2534.0,0.885,1435.0,32.0,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
2,0,3367400.0,0.0,2.0,1.414214,1.0,0.33,2177.0,653.0,0.000,-143.0,41.0,1.090086,-0.092842,-0.307300,0.241743,-0.087208,-0.075398,0.087557,-0.147207
3,1,4500000.0,11.0,0.0,20.254213,0.0,0.62,2164.0,0.0,0.885,1435.0,32.0,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
5,1,4500000.0,11.0,-71.0,20.254213,-71.0,0.62,726.0,3416.0,0.885,1435.0,32.0,0.353313,-0.081273,-0.317192,0.116419,0.009005,-0.101555,-0.019536,-0.259802
6,0,4500000.0,9.0,16.0,20.254213,2.0,0.62,2164.0,0.0,1.801,1555.0,33.0,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47449,0,6125000.0,5.0,16.0,20.254213,2.0,0.88,2181.0,0.0,0.731,1677.0,33.0,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
47450,1,4500000.0,11.0,16.0,20.254213,2.0,0.62,1810.0,11.0,0.885,1435.0,32.0,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
47453,0,4500000.0,11.0,16.0,20.254213,2.0,0.62,2164.0,0.0,0.885,1435.0,32.0,-0.301034,-0.088210,-0.089447,-0.026757,-0.031806,-0.010625,-0.016023,-0.049548
47457,1,4500000.0,11.0,16.0,20.254213,2.0,0.62,2164.0,65.0,0.885,1435.0,32.0,1.082780,-0.094122,-0.307918,0.100025,-0.033767,-0.092285,-0.047507,-0.128273


In [11]:
# Persist the version-1 datasets. The output directory is created first
# because `to_csv` does not create missing parent directories and would
# otherwise fail on a fresh checkout.
output_dir_v1 = Path('data_for_model')
output_dir_v1.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the CSV files.
train_v1.to_csv(output_dir_v1 / 'train_v1.csv', index=False)
validation_v1.to_csv(output_dir_v1 / 'validation_v1.csv', index=False)
test_v1.to_csv(output_dir_v1 / 'test_v1.csv', index=False)

- Version 2: Label (ordinal) encoding, with categories ordered by their positive-label rate in the training data

In [12]:
# Frequency of each location_id category in the training split.
train['location_id'].value_counts()

L     29130
M1      441
S       264
M       130
S1       35
Name: location_id, dtype: int64

In [13]:
def label_encoding(feature, data=None):
    """Return the categories of `feature` ordered by ascending positive-label rate.

    The positive-label rate of a category is the fraction of its rows whose
    `label` column equals 1 (the label is assumed to be binary 0/1).
    The returned list is meant to be passed as the `categories` of an
    ordinal encoder, so a higher code means a higher positive rate.

    Parameters
    ----------
    feature : str
        Name of the categorical column to rank.
    data : pandas.DataFrame, optional
        Frame holding `feature` and a `label` column. Defaults to the
        notebook-level `train` frame, which keeps existing calls working.

    Returns
    -------
    list
        Category values, lowest positive rate first. Categories with equal
        rates keep their `value_counts()` order (most frequent first).
        Missing values of `feature` are not included.
    """
    if data is None:
        data = train

    # Frequency order is the baseline order and the tie-breaker.
    categories = data[feature].value_counts().index

    # Compute the rate of label == 1 directly per category. Building it from
    # per-category value_counts lists misplaces the rate of a category that
    # contains only positive rows (its single value ends up in the first
    # column and the category is ranked as if its rate were 0).
    positive_rate = (
        data['label'].eq(1)
        .groupby(data[feature], sort=False)
        .mean()
        .reindex(categories)
    )

    # Stable sort so that ties are resolved deterministically.
    ranked = positive_rate.sort_values(ascending=True, kind='mergesort')
    return list(ranked.index)

In [14]:
from sklearn.preprocessing import OrdinalEncoder

# Replace every object column, in place, by its ordinal code. The category
# order comes from label_encoding(); values not in that list are encoded as -1.
# NOTE: this overwrites the *_categorical frames, so cells that expect the
# original string columns must be run before this one.
object_columns = train_categorical.select_dtypes('object').columns
for col in object_columns:
    ordered_categories = label_encoding(col)
    label_enc = OrdinalEncoder(
        categories=[ordered_categories],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )
    # Fit on the training column only, then apply the same mapping to all splits.
    label_enc.fit(train_categorical[[col]])
    for frame in (train_categorical, validation_categorical, test_categorical):
        frame[col] = label_enc.transform(frame[[col]]).flatten()

In [15]:
# Version 2 frames: the non-object columns of each split joined (by row
# index) with that split's ordinal-encoded categorical columns.
train_v2, validation_v2, test_v2 = (
    pd.merge(
        raw.select_dtypes(exclude='object'),
        encoded,
        left_index=True,
        right_index=True,
    )
    for raw, encoded in (
        (train, train_categorical),
        (validation, validation_categorical),
        (test, test_categorical),
    )
)

In [16]:
# Preview the version-2 training frame (numeric columns + encoded categoricals).
train_v2

Unnamed: 0,label,value,num_date_review,unknown_var_7,unknown_var_8,unknown_var_9,unknown_var_13,unknown_var_17,social_friend_count,delta_time,delta_date,age,cat_3,cat_6,location_id,cat_8,cat_10,cat_11,com_type,cat_12
1,0,4500000.0,11.0,84.0,55.154329,45.0,0.62,2164.0,2534.0,0.885,1435.0,32.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,2.0
2,0,3367400.0,0.0,2.0,1.414214,1.0,0.33,2177.0,653.0,0.000,-143.0,41.0,1.0,0.0,2.0,0.0,1.0,2.0,2.0,2.0
3,1,4500000.0,11.0,0.0,20.254213,0.0,0.62,2164.0,0.0,0.885,1435.0,32.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,2.0
5,1,4500000.0,11.0,-71.0,20.254213,-71.0,0.62,726.0,3416.0,0.885,1435.0,32.0,1.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0
6,0,4500000.0,9.0,16.0,20.254213,2.0,0.62,2164.0,0.0,1.801,1555.0,33.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47449,0,6125000.0,5.0,16.0,20.254213,2.0,0.88,2181.0,0.0,0.731,1677.0,33.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,2.0
47450,1,4500000.0,11.0,16.0,20.254213,2.0,0.62,1810.0,11.0,0.885,1435.0,32.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,2.0
47453,0,4500000.0,11.0,16.0,20.254213,2.0,0.62,2164.0,0.0,0.885,1435.0,32.0,1.0,0.0,2.0,1.0,2.0,2.0,1.0,2.0
47457,1,4500000.0,11.0,16.0,20.254213,2.0,0.62,2164.0,65.0,0.885,1435.0,32.0,1.0,0.0,2.0,1.0,1.0,2.0,2.0,2.0


In [17]:
# Persist the version-2 datasets. The output directory is created first
# because `to_csv` does not create missing parent directories and would
# otherwise fail on a fresh checkout.
output_dir_v2 = Path('data_for_model')
output_dir_v2.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the CSV files.
train_v2.to_csv(output_dir_v2 / 'train_v2.csv', index=False)
validation_v2.to_csv(output_dir_v2 / 'validation_v2.csv', index=False)
test_v2.to_csv(output_dir_v2 / 'test_v2.csv', index=False)