In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import train_test_split

import gc



In [2]:
class konfig:
    random_state = 4222
    kaggle = True

In [3]:
path1 = '../archive/train_data.ftr'
path2 = '../archive/train_data_f32.ftr'
path3 = '../archive/test_data.ftr'
path4 = '../archive/test_data_f32.ftr'
#X_1 = pd.read_feather(path1)
#X_2 = pd.read_feather(path2)
#X_3 = pd.read_feather(path3)
#X_4 = pd.read_feather(path4)

In [4]:
train_data = pd.read_feather(path1)
train_data.head()
len(train_data.customer_ID.unique())

458913

In [5]:
train_data.shape

(5531451, 191)

In [6]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Check for number of missing values
train_data.isnull().sum()

## Could be observed that there are many columns with many missing values

customer_ID          0
S_2                  0
P_2              45985
D_39                 0
B_1                  0
B_2               2016
R_1                  0
S_3            1020544
D_41              2016
B_3               2016
D_42           4740137
D_43           1658396
D_44            274319
B_4                  0
D_45              2017
B_5                  0
R_2                  0
D_46           1211699
D_47                 0
D_48            718725
D_49           4985917
B_6                233
B_7                  0
B_8              22268
D_50           3142402
D_51                 0
B_9                  0
R_3                  0
D_52             29563
P_3             301492
B_10                 0
D_53           4084585
S_5                  0
B_11                 0
S_6                  0
D_54              2016
R_4                  0
S_7            1020544
B_12                 0
S_8                  0
D_55            184803
D_56           2990943
B_13             49519
R_5        

In [7]:
# There are multiple transactions. Lets take only the latest transaction from each customer.
train=train_data.groupby('customer_ID').tail(1)
train=train.set_index(['customer_ID'])

#Drop date column since it is no longer relevant
train.drop(['S_2'],axis=1,inplace=True)
#Check for number of rows
train.shape
# We now have 458913 rows, which corresponds to the number of unique customers.

(458913, 189)

In [8]:
train.select_dtypes(['object'])

0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed
...
ffff41c8a52833b56430603969b9ca48d208e7c192c6a4081a6acc28cf4f8af7
ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fdd3e5b57cfcbee30286
ffff9984b999fccb2b6127635ed0736dda94e544e67e026eee4d20f680639ff6
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461
fffff1d38b785cef84adeace64f8f83db3a0c31e8d92eaba8b115f71cab04681


In [9]:
#Perform one-hot encoding for D_63 and D_64
#Drop columns D_63 and D_64 subsequently
train_D63 = pd.get_dummies(train[['D_63']])
train = pd.concat([train, train_D63], axis=1)
train = train.drop(['D_63'], axis=1)

train_D64 = pd.get_dummies(train[['D_64']])
train = pd.concat([train, train_D64], axis=1)
train = train.drop(['D_64'], axis=1)

In [10]:
train.columns

Index(['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43',
       ...
       'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM', 'D_63_XZ', 'D_64_', 'D_64_-1', 'D_64_O', 'D_64_R', 'D_64_U'], dtype='object', length=198)

In [11]:
#Given that there are many columns with large number of missing values, it is impractical to go through every single one of them to determine whether it is useful.
#Furthermore, we do not have information on the feature (e.g. actual name of the feature) except the type of variable
#Lets remove columns if there are >85% of missing values
train=train.dropna(axis=1, thresh=int(0.85*len(train)))

#Checking the shape of new train data
train.shape
## We are left with 160 columns

(458913, 162)

In [12]:
train_without_target=train.drop(['target'],axis=1)
cor_matrix = train_without_target.corr().abs()
upper_tri = cor_matrix.where((np.triu(np.ones(cor_matrix.shape), k=1) + np.tril(np.ones(cor_matrix.shape), k=-1)).astype(bool))
#Drop out columns with absolute correlation of more than 85%
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.85)]
train_drop_highcorr=train_without_target.drop(to_drop,axis=1)
train_drop_highcorr.shape

(458913, 133)

In [13]:
# Lets remove columns with variance less than or equal to 0.05. Keep only columns with high variance.
from sklearn.feature_selection import VarianceThreshold
from itertools import compress
def fs_variance(df, threshold:float=0.05):
    """
    Return a list of selected variables based on the threshold.
    """
    # The list of columns in the data frame
    features = list(df.columns)

    # Initialize and fit the method
    vt = VarianceThreshold(threshold = threshold)
    _ = vt.fit(df)

    # Get which column names which pass the threshold
    feat_select = list(compress(features, vt.get_support()))

    return feat_select
columns_to_keep=fs_variance(train_drop_highcorr)
# We are left with 85 columns (excluding target), which passed the threshold.
train_final=train[columns_to_keep]
len(columns_to_keep)

85

In [14]:
train_final1=train_final.join(train['target'])
x_train=train_final1.drop(['target'],axis=1)
y_train=train_final1['target']

In [16]:
# Split train data into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train, test_size=0.25, random_state=26)

In [36]:
trans = ['D_68', 'B_30', 'B_38', 'D_114', 'D_117', 'D_120', 'D_126']
for item in trans:
    x_train_split[item] = pd.to_numeric(x_train_split[item])


from xgboost import XGBClassifier
model=XGBClassifier(n_estimators=200,max_depth=3,learning_rate=0.3, subsample=0.5)
model.fit(x_train_split,y_train_split)
#Test the model
y_predict=model.predict(x_test_split)
print('XGBoost Classifier Accuracy: {:.3f}'.format(accuracy_score(y_test_split, y_predict)))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBoost Classifier Accuracy: 0.894


In [33]:
print('\nXGBoost Classifier Precision: {:.3f}'.format(precision_score (y_test_split, y_predict)))
print('\nXGBoost Classifier Recall: {:.3f}'.format(recall_score (y_test_split, y_predict)))


XGBoost Classifier Precision: 0.796

XGBoost Classifier Recall: 0.792


In [24]:
# Make a list of columns that we want to load for test data. Remove one-hot encoded names and target (since these columns not in the test data)
columns_to_load=list(columns_to_keep)
columns_to_load=columns_to_load+['D_63','D_64','customer_ID','S_2']
columns_to_load.remove('D_63_CO')
columns_to_load.remove('D_63_CR')
columns_to_load.remove('D_63_CL')
columns_to_load.remove('D_64_O')
columns_to_load.remove('D_64_R')
columns_to_load.remove('D_64_U')

In [26]:
#Read in the test_data
test_data = pd.read_feather(path3 ,columns=columns_to_load)

In [27]:
# There are multiple transactions. Lets take only the latest transaction from each customer.
test=test_data.groupby('customer_ID').tail(1)
test=test.set_index(['customer_ID'])

#Drop date column since it is no longer relevant
test.drop(['S_2'],axis=1,inplace=True)

In [28]:
#Perform one-hot encoding for D_63 and D_64
#Drop columns D_63 and D_64 subsequently
test_D63 = pd.get_dummies(test[['D_63']])
test = pd.concat([test, test_D63], axis=1)
test = test.drop(['D_63'], axis=1)

test_D64 = pd.get_dummies(test[['D_64']])
test = pd.concat([test, test_D64], axis=1)
test = test.drop(['D_64'], axis=1)

In [29]:
#Keep columns that we want.
test_final=test[columns_to_keep]

In [30]:
#Predict probabilities of default
y_test_predict=model.predict_proba(test_final)

In [31]:
#Retrieve the probability of default
y_predict_final=y_test_predict[:,1]

# Merge the prediction and customer_ID into submission dataframe
submission = pd.DataFrame({"customer_ID":test_final.index,"prediction":y_predict_final})

submission.to_csv('submission.csv', index=False)