In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file available under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [2]:
# Load the full training statement history.
raw_train_dataset = pd.read_feather('../input/amexfeather/train_data.ftr')

# Reduce to one row per customer: tail(1) keeps the last row of each customer_ID group
# in file order (presumably the most recent statement -- TODO confirm the file is sorted by S_2).
latest_train_statements = raw_train_dataset.groupby('customer_ID').tail(1)
# Index by customer_ID and sort so rows are in a deterministic order.
train_dataset = latest_train_statements.set_index('customer_ID', drop=True).sort_index()

# Free the raw history; only the per-customer snapshot is used from here on.
del raw_train_dataset, latest_train_statements
gc.collect()

0

In [3]:
# Preview the first five customers of the per-customer training snapshot.
train_dataset.head()

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2018-03-13,0.93457,0.009117,0.009384,1.007812,0.006104,0.13501,0.001604,0.007175,,...,,,0.007187,0.004234,0.005085,,0.00581,0.00297,0.00853,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,2018-03-25,0.880371,0.178101,0.034698,1.003906,0.006912,0.165527,0.00555,0.00507,,...,,,0.002981,0.007481,0.007874,,0.003284,0.00317,0.008514,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,2018-03-12,0.880859,0.009705,0.004284,0.8125,0.006451,,0.003796,0.007195,,...,,,0.007381,0.006622,0.000965,,0.002201,0.000834,0.003445,0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,2018-03-29,0.621582,0.001082,0.012566,1.005859,0.007828,0.287842,0.004532,0.009941,,...,,,0.002705,0.006184,0.001899,,0.008186,0.005558,0.002983,0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,2018-03-30,0.87207,0.005573,0.007679,0.815918,0.001247,,0.000231,0.005527,,...,,,0.002974,0.004162,0.005764,,0.008156,0.006943,0.000905,0


In [4]:
# Collect the names of all columns stored with the pandas 'category' dtype.
categorical_cols = [
    name
    for name, dtype in train_dataset.dtypes.items()
    if dtype == 'category'
]
categorical_cols

['D_63',
 'D_64',
 'D_66',
 'D_68',
 'B_30',
 'B_38',
 'D_114',
 'D_116',
 'D_117',
 'D_120',
 'D_126']

In [5]:
# Every column that is neither categorical nor the label is treated as numerical.
# NOTE: at this point that still includes the statement-date column S_2.
non_numerical_cols = set(categorical_cols) | {"target"}
numerical_cols = [name for name in train_dataset.columns if name not in non_numerical_cols]

# The "total" below counts every column of train_dataset, the target column included.
print(f'Total number of features: {len(train_dataset.columns)}')
print(f'Total number of categorical features: {len(categorical_cols)}')
print(f'Total number of continuos features: {len(numerical_cols)}')

Total number of features: 190
Total number of categorical features: 11
Total number of continuos features: 178


In [6]:
# Start the list of columns to drop with the statement-date column S_2.
# Later cells append further columns to this list and use it to drop columns
# from both the train and the test frames.
cols_to_drop = ['S_2']

In [7]:
# Find columns where more than 75% of the training values are missing and add them
# to the list of columns to be dropped.
# The cell is safe to re-run: a column is appended only if it is not already listed,
# so repeated execution cannot create duplicate entries in cols_to_drop.
MISSING_FRACTION_THRESHOLD = 0.75

# Fraction of NaNs per column, computed in one vectorised pass.
missing_fraction = train_dataset.isna().mean()
sparse_cols = missing_fraction[missing_fraction > MISSING_FRACTION_THRESHOLD].index

cols_to_drop.extend([col for col in sparse_cols if col not in cols_to_drop])
print(len(cols_to_drop), cols_to_drop)

24 ['S_2', 'D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']


In [8]:
# Remove the date column and the mostly-missing columns collected in cols_to_drop,
# then display the resulting shape of the training frame.
train_dataset = train_dataset.drop(columns=cols_to_drop)
train_dataset.shape

(458913, 166)

In [9]:
# Split the training columns that still contain NaNs into numerical and categorical groups.
cols_with_missing_train = train_dataset.columns[train_dataset.isna().any()]

cols_with_missing_num_vals = [col for col in cols_with_missing_train if col in numerical_cols]
# Anything not listed in numerical_cols is handled as categorical (imputed with the mode later).
cols_with_missing_cat_vals = [col for col in cols_with_missing_train if col not in numerical_cols]

# The second line reports the categorical group; it was previously mislabelled as "numerical".
print("Columns with missing numerical values", len(cols_with_missing_num_vals), cols_with_missing_num_vals)
print("Columns with missing categorical values", len(cols_with_missing_cat_vals), cols_with_missing_cat_vals)

Columns with mising numerical values 78 ['P_2', 'B_2', 'S_3', 'D_41', 'B_3', 'D_43', 'D_44', 'D_45', 'D_46', 'D_48', 'B_6', 'B_8', 'D_50', 'D_52', 'P_3', 'D_53', 'D_54', 'S_7', 'D_55', 'D_56', 'B_13', 'S_9', 'D_59', 'D_61', 'B_15', 'D_62', 'B_16', 'B_17', 'B_19', 'B_20', 'D_69', 'B_22', 'D_70', 'D_72', 'D_74', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'D_80', 'B_27', 'D_81', 'D_82', 'D_83', 'D_84', 'B_33', 'D_89', 'D_91', 'S_22', 'S_23', 'S_24', 'S_25', 'D_103', 'D_104', 'D_105', 'D_107', 'R_27', 'D_109', 'D_112', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_128', 'D_129', 'D_130', 'D_131', 'D_139', 'D_141', 'D_143', 'D_145']
Columns with mising numerical values 7 ['D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120']


In [10]:
# Numerical gaps: fill each column with its own median over the training rows.
for column in cols_with_missing_num_vals:
    median_value = train_dataset[column].median()
    train_dataset[column] = train_dataset[column].fillna(median_value)

# Categorical gaps: fill each column with its most frequent training value
# (the first entry returned by mode()).
for column in cols_with_missing_cat_vals:
    most_frequent_value = train_dataset[column].mode().iloc[0]
    train_dataset[column] = train_dataset[column].fillna(most_frequent_value)

In [11]:
# Per-column count of remaining NaNs in the training frame (expected to be 0 after imputation).
train_dataset.isna().sum()

P_2       0
D_39      0
B_1       0
B_2       0
R_1       0
         ..
D_141     0
D_143     0
D_144     0
D_145     0
target    0
Length: 166, dtype: int64

In [12]:
# Load the full test statement history.
raw_test_dataset = pd.read_feather('../input/amexfeather/test_data.ftr')

# Reduce to one row per customer: tail(1) keeps the last row of each customer_ID group
# in file order (presumably the most recent statement -- TODO confirm the file is sorted by S_2).
latest_test_statements = raw_test_dataset.groupby('customer_ID').tail(1)
# Index by customer_ID and sort so rows are in a deterministic order.
test_dataset = latest_test_statements.set_index('customer_ID', drop=True).sort_index()

# Free the raw history; only the per-customer snapshot is used from here on.
del raw_test_dataset, latest_test_statements
gc.collect()

69

In [13]:
# Drop from the test frame exactly the columns that were removed from the training frame
# (the list was derived from training-set missingness), then display the resulting shape.
test_dataset = test_dataset.drop(columns=cols_to_drop)
test_dataset.shape

(924621, 165)

In [14]:
# Per-column count of NaNs in the test frame before any imputation.
test_dataset.isna().sum()

P_2      4784
D_39        0
B_1         0
B_2        43
R_1         0
         ... 
D_140       0
D_141    5050
D_143    5050
D_144       0
D_145    5050
Length: 165, dtype: int64

In [15]:
# Split the test columns that contain NaNs into numerical and categorical groups.
cols_with_missing_test = test_dataset.columns[test_dataset.isna().any()]

cols_with_missing_num_vals_test = [col for col in cols_with_missing_test if col in numerical_cols]
# Anything not listed in numerical_cols is handled as categorical.
cols_with_missing_cat_vals_test = [col for col in cols_with_missing_test if col not in numerical_cols]

# The second line reports the categorical group; it was previously mislabelled as "numerical".
print("Columns with missing numerical values", len(cols_with_missing_num_vals_test), cols_with_missing_num_vals_test)
print("Columns with missing categorical values", len(cols_with_missing_cat_vals_test), cols_with_missing_cat_vals_test)

Columns with mising numerical values 85 ['P_2', 'B_2', 'S_3', 'D_41', 'B_3', 'D_43', 'D_44', 'D_45', 'D_46', 'D_48', 'B_6', 'B_8', 'D_50', 'D_52', 'P_3', 'D_53', 'D_54', 'S_7', 'D_55', 'D_56', 'B_13', 'S_9', 'D_59', 'D_61', 'B_15', 'D_62', 'B_16', 'B_17', 'B_19', 'B_20', 'S_12', 'D_69', 'B_22', 'D_70', 'D_72', 'D_74', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'D_80', 'B_27', 'D_81', 'D_82', 'S_17', 'D_83', 'D_84', 'D_86', 'B_33', 'D_89', 'D_91', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_103', 'D_104', 'D_105', 'D_107', 'B_37', 'R_27', 'D_109', 'D_112', 'B_40', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_128', 'D_129', 'B_41', 'D_130', 'D_131', 'D_139', 'D_141', 'D_143', 'D_145']
Columns with mising numerical values 7 ['D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120']


In [17]:
# Impute the test set. The notebook previously relied on a since-deleted cell for this step,
# so on a fresh "Restart & Run All" the test frame would reach the model un-imputed.
# Fill values come from the TRAINING frame so train and test are transformed identically
# and no statistic is estimated from the test data.

# Numerical gaps: use the training median of the same column.
for col in cols_with_missing_num_vals_test:
    test_dataset[col] = test_dataset[col].fillna(train_dataset[col].median())

# Categorical gaps: use the most frequent training value of the same column.
for col in cols_with_missing_cat_vals_test:
    fill_value = train_dataset[col].mode()[0]
    # fillna on a categorical column fails for a value outside its categories,
    # so register the training mode as a category first when necessary.
    if (isinstance(test_dataset[col].dtype, pd.CategoricalDtype)
            and fill_value not in test_dataset[col].cat.categories):
        test_dataset[col] = test_dataset[col].cat.add_categories([fill_value])
    test_dataset[col] = test_dataset[col].fillna(fill_value)

# Per-column count of remaining NaNs in the test frame (expected to be 0 now).
test_dataset.isna().sum()

P_2      0
D_39     0
B_1      0
B_2      0
R_1      0
        ..
D_140    0
D_141    0
D_143    0
D_144    0
D_145    0
Length: 165, dtype: int64

In [18]:
train_dataset_without_target = train_dataset.drop(["target"], axis=1)

# Two features are considered redundant when the magnitude of their Pearson correlation
# exceeds this value.
CORRELATION_THRESHOLD = 0.9

# Correlation is only defined for numeric data. Selecting the numeric columns explicitly
# keeps the cell working on pandas versions where DataFrame.corr() no longer silently
# skips non-numeric (e.g. categorical) columns.
numeric_features = train_dataset_without_target.select_dtypes(include=["number", "bool"])
cor_matrix = numeric_features.corr()
cor_values = cor_matrix.to_numpy()

col_core = set()
for i in range(len(cor_matrix.columns)):
    # Only look at the lower triangle (j < i) so each pair is inspected once and the
    # later column of a correlated pair is the one marked for removal.
    for j in range(i):
        # abs(): a strong negative correlation is just as redundant as a strong positive one.
        if abs(cor_values[i, j]) > CORRELATION_THRESHOLD:
            col_core.add(cor_matrix.columns[i])
col_core

{'B_11',
 'B_13',
 'B_15',
 'B_23',
 'B_33',
 'B_37',
 'D_104',
 'D_119',
 'D_141',
 'D_143',
 'D_74',
 'D_75',
 'D_77',
 'S_24',
 'S_7'}

In [19]:
# Shapes before removing the highly correlated features.
for frame in (train_dataset, test_dataset):
    print(frame.shape)

# drop the columns with a high correlation with other features
redundant_cols = list(col_core)
train_dataset = train_dataset.drop(columns=redundant_cols)
test_dataset = test_dataset.drop(columns=redundant_cols)

# Shapes after the drop.
for frame in (train_dataset, test_dataset):
    print(frame.shape)

(458913, 166)
(924621, 165)
(458913, 151)
(924621, 150)


In [20]:
# Separate the label column from the feature matrix.
trainY = train_dataset["target"]
trainX = train_dataset.drop(columns="target")

In [21]:
# Sanity check: shapes of the full frames and of the feature/label split.
for shaped in (train_dataset, test_dataset, trainX, trainY):
    print(shaped.shape)

(458913, 151)
(924621, 150)
(458913, 150)
(458913,)


In [22]:
# Keep only the categorical columns that were not dropped earlier (D_66 was dropped for
# having too many missing values). Rebuilding the list from cols_to_drop, rather than
# calling .remove('D_66'), keeps this cell re-runnable (a second .remove() raises ValueError)
# and correct if a different set of columns crosses the missing-value threshold.
categorical_cols = [col for col in categorical_cols if col not in cols_to_drop]

In [23]:
# Cast every categorical column to plain strings in both the training features
# and the test frame before one-hot encoding.
for frame in (trainX, test_dataset):
    for col in categorical_cols:
        frame[col] = frame[col].astype(str)

In [24]:
# One-hot encode the categorical columns of the training features.
trainX = pd.get_dummies(trainX, columns=categorical_cols)

# Encode the test frame the same way, then align it to the training layout:
# reindex puts the columns in the training order, adds any dummy column the test set
# lacks (filled with 0) and discards columns that do not exist in trainX.
test_dataset = (
    pd.get_dummies(test_dataset, columns=categorical_cols)
    .reindex(columns=trainX.columns, fill_value=0)
)

In [25]:
# Sanity check after encoding: test_dataset and trainX must have the same number of columns.
for shaped in (train_dataset, test_dataset, trainX, trainY):
    print(shaped.shape)

(458913, 151)
(924621, 181)
(458913, 181)
(458913,)


In [26]:
# from sklearn.model_selection import train_test_split

In [27]:
# X_train, X_test, y_train, y_test = train_test_split( trainX, trainY, test_size=0.3, random_state=69)

In [30]:
# from imblearn.over_sampling import SMOTE

# # oversample to handle class imbalance
# oversample = SMOTE()
# X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
# X_train = scaling.transform(X_train)
# X_test = scaling.transform(X_test)

In [None]:
# import xgboost as xgb

In [None]:
# xgbClassificationModel = xgb.XGBClassifier(objective = 'reg:logistic', max_depth = 5, seed = 0, n_estimators=1000,eta = 0.05)

In [None]:
# xgbClassificationModel.fit(X_train, y_train)

In [None]:
# def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

#     def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         four_pct_cutoff = int(0.04 * df['weight'].sum())
#         df['weight_cumsum'] = df['weight'].cumsum()
#         df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
#         return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
#     def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
#         total_pos = (df['target'] * df['weight']).sum()
#         df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
#         df['lorentz'] = df['cum_pos_found'] / total_pos
#         df['gini'] = (df['lorentz'] - df['random']) * df['weight']
#         return df['gini'].sum()

#     def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         y_true_pred = y_true.rename(columns={'target': 'prediction'})
#         return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

#     g = normalized_weighted_gini(y_true, y_pred)
#     d = top_four_percent_captured(y_true, y_pred)

#     return 0.5 * (g + d)

In [None]:
# predictions_xgb = xgbClassificationModel.predict_proba(X_test)

In [None]:
# y_true = y_test.to_frame(name = 'target')
# y_true = y_true.reset_index(drop=True)
# y_pred = pd.DataFrame(predictions_xgb[:,1], columns = ['prediction'])

In [None]:
# amex_metric(y_true, y_pred)

In [28]:
from imblearn.over_sampling import SMOTE

# oversample to handle class imbalance
# random_state pins the synthetic samples SMOTE generates, so re-running the notebook
# reproduces the same resampled training set (and therefore the same model).
oversample = SMOTE(random_state=0)
X_train, y_train = oversample.fit_resample(trainX, trainY)

In [29]:
# Use the prepared test frame as the model input. This is an alias, not a copy:
# X_test and test_dataset refer to the same object until X_test is reassigned.
X_test = test_dataset

In [31]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature to the range [-1, 1]. The scaler is fitted on the (oversampled)
# training matrix only and then applied unchanged to the test matrix.
scaling = MinMaxScaler(feature_range=(-1, 1))
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

NameError: name 'X_train' is not defined

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted tree classifier for the binary `target` label.
# - objective: 'binary:logistic' is the objective for binary classification; the previous
#   'reg:logistic' is the regression variant and is not meant for XGBClassifier.
# - learning_rate / random_state are the scikit-learn-API names of the native `eta` / `seed`
#   parameters that were used before; the values are unchanged.
xgbClassificationModel = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=5,
    n_estimators=1000,
    learning_rate=0.05,
    random_state=0,
)

In [None]:
# Train the XGBoost classifier on the training matrix X_train and labels y_train.
xgbClassificationModel.fit(X_train, y_train)

In [None]:
# Class probabilities for the test set from the fitted XGBoost model; column 1 is the
# probability of the positive class (target == 1).
# The previous code called a private method on `svmClassificationModel`, a name that is
# never defined in this notebook, so the cell failed with NameError.
predictions_xgb = xgbClassificationModel.predict_proba(X_test)

In [None]:
# Number of predictions, i.e. the length of the second column of predictions_xgb.
len(predictions_xgb[:,1])

In [None]:
# Positive-class probability for every test customer, labelled with the customer_ID index
# of test_dataset (the row order X_test was built from). A new name is used instead of
# overwriting predictions_xgb so this cell can be re-run without an indexing error.
default_probability = pd.Series(
    predictions_xgb[:, 1], index=test_dataset.index, name='prediction'
)

sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')

# Align predictions to the submission file by customer_ID rather than by row position,
# so a difference in row order can never pair a prediction with the wrong customer.
output = pd.DataFrame({
    'customer_ID': sample_dataset.customer_ID,
    'prediction': default_probability.reindex(sample_dataset.customer_ID).to_numpy(),
})
# A NaN here would mean the sample submission lists a customer that test_dataset lacks.
assert output['prediction'].notna().all(), "missing predictions for some customer_IDs"

output.to_csv('Submission XGB v3.csv', index=False)