In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [2]:
# Read the complete training table from the feather file.
raw_train_dataset = pd.read_feather('../input/amexfeather/train_data.ftr')

# Reduce to a single row per customer: take the last row of each customer_ID
# group (in file order), index the result by customer_ID and sort by that index.
# NOTE(review): "last row" equals "latest statement" only if the file is already
# ordered chronologically within each customer - confirm.
train_dataset = (
    raw_train_dataset
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

# The raw table is no longer needed; release it before continuing.
del raw_train_dataset
gc.collect()

In [3]:
# Preview the first five customers of the reduced training table.
train_dataset.head(n=5)

In [4]:
# Collect the names of all columns stored with the pandas 'category' dtype.
categorical_cols = [
    name
    for name, column_dtype in train_dataset.dtypes.items()
    if column_dtype == 'category'
]
categorical_cols

In [5]:
# Every column that is neither categorical nor the label is treated as numerical.
non_numerical_names = set(categorical_cols) | {"target"}
numerical_cols = [
    col for col in train_dataset.columns if col not in non_numerical_names
]

# Summary counts; the first figure counts every column, including the target.
print(f'Total number of features: {len(train_dataset.columns)}')
print(f'Total number of categorical features: {len(categorical_cols)}')
print(f'Total number of continuos features: {len(numerical_cols)}')

In [6]:
# Start the list of columns to be dropped with the date column 'S_2'.
# Later cells add more column names to this list before the drop is applied
# to the train and test tables.
cols_to_drop = ['S_2']

In [7]:
# Fraction of missing values per column of the training table.
missing_fraction = train_dataset.isna().sum() / train_dataset.shape[0]

# Columns where more than 75% of the values are missing are scheduled for removal.
mostly_missing_cols = missing_fraction[missing_fraction > 0.75].index.tolist()

# Only add names that are not in the list yet, so re-running this cell does not
# accumulate duplicate entries in cols_to_drop.
cols_to_drop.extend(col for col in mostly_missing_cols if col not in cols_to_drop)
print(len(cols_to_drop), cols_to_drop)

In [8]:
# Shape before the drop. It must be printed explicitly: a bare expression that
# is not the last line of a cell produces no output.
print(train_dataset.shape)

# Drop the date column and the columns where a majority of the values are missing.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
train_dataset = train_dataset.drop(columns=cols_to_drop, errors='ignore')

# Shape after the drop (displayed as the cell output).
train_dataset.shape

In [9]:
# Find the train columns that still contain missing values and split them into
# numerical columns (members of numerical_cols) and all remaining columns, which
# are treated as categorical.
missing_counts = train_dataset.isna().sum()
cols_with_missing = missing_counts[missing_counts > 0].index

cols_with_missing_num_vals = [col for col in cols_with_missing if col in numerical_cols]
cols_with_missing_cat_vals = [col for col in cols_with_missing if col not in numerical_cols]

# The second label previously also said "numerical", mislabelling the categorical list.
print("Columns with missing numerical values", len(cols_with_missing_num_vals), cols_with_missing_num_vals)
print("Columns with missing categorical values", len(cols_with_missing_cat_vals), cols_with_missing_cat_vals)

In [10]:
# Work out one replacement value per incomplete train column:
# the median for numerical columns, the most frequent value for categorical ones.
train_fill_values = {
    col: train_dataset[col].median() for col in cols_with_missing_num_vals
}
train_fill_values.update(
    {col: train_dataset[col].mode()[0] for col in cols_with_missing_cat_vals}
)

# Replace the missing entries column by column.
for col, replacement in train_fill_values.items():
    train_dataset[col] = train_dataset[col].fillna(replacement)

In [11]:
# Verify the imputation: list only the columns that still contain missing values.
# An empty result means every column is complete. The full per-column listing is
# truncated by pandas' display and therefore cannot confirm this.
train_dataset.isna().sum().loc[lambda counts: counts > 0]

In [12]:
# Read the complete test table from the feather file.
raw_test_dataset = pd.read_feather('../input/amexfeather/test_data.ftr')

# Reduce to a single row per customer: take the last row of each customer_ID
# group (in file order), index the result by customer_ID and sort by that index.
# NOTE(review): "last row" equals "latest statement" only if the file is already
# ordered chronologically within each customer - confirm.
test_dataset = (
    raw_test_dataset
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID', drop=True)
    .sort_index()
)

# The raw table is no longer needed; release it before continuing.
del raw_test_dataset
gc.collect()

In [13]:
# Shape before the drop. It must be printed explicitly: a bare expression that
# is not the last line of a cell produces no output.
print(test_dataset.shape)

# Remove the same columns that were selected for removal on the training table.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
test_dataset = test_dataset.drop(columns=cols_to_drop, errors='ignore')

# Shape after the drop (displayed as the cell output).
test_dataset.shape

In [14]:
# List only the test columns that contain missing values, with their counts.
# The full per-column listing is truncated by pandas' display, which hides most
# of the affected columns.
test_dataset.isna().sum().loc[lambda counts: counts > 0]

In [15]:
# Find the test columns that contain missing values and split them into
# numerical columns (members of numerical_cols) and all remaining columns, which
# are treated as categorical.
missing_counts_test = test_dataset.isna().sum()
cols_with_missing_test = missing_counts_test[missing_counts_test > 0].index

cols_with_missing_num_vals_test = [col for col in cols_with_missing_test if col in numerical_cols]
cols_with_missing_cat_vals_test = [col for col in cols_with_missing_test if col not in numerical_cols]

# The second label previously also said "numerical", mislabelling the categorical list.
print("Columns with missing numerical values", len(cols_with_missing_num_vals_test), cols_with_missing_num_vals_test)
print("Columns with missing categorical values", len(cols_with_missing_cat_vals_test), cols_with_missing_cat_vals_test)

In [16]:
# Replacement values are taken from the TRAIN table so the test set is imputed
# with statistics learned from training data only: the train median for
# numerical columns, the train mode for categorical ones.
test_fill_values = {
    col: train_dataset[col].median() for col in cols_with_missing_num_vals_test
}
test_fill_values.update(
    {col: train_dataset[col].mode()[0] for col in cols_with_missing_cat_vals_test}
)

# Replace the missing entries of the test table column by column.
for col, replacement in test_fill_values.items():
    test_dataset[col] = test_dataset[col].fillna(replacement)

In [17]:
# Verify the imputation: list only the test columns that still contain missing
# values. An empty result means every column is complete. The full per-column
# listing is truncated by pandas' display and therefore cannot confirm this.
test_dataset.isna().sum().loc[lambda counts: counts > 0]

In [18]:
train_dataset_without_target = train_dataset.drop(["target"], axis=1)

# Correlation is only defined for numeric data. Selecting the numeric/boolean
# columns explicitly keeps this working on pandas versions where DataFrame.corr()
# no longer silently skips non-numeric (e.g. categorical) columns.
numeric_features = train_dataset_without_target.select_dtypes(include=["number", "bool"])

# Absolute correlation matrix: a strong negative correlation makes a feature
# just as redundant as a strong positive one.
cor_matrix = numeric_features.corr().abs()

# Look only at the strictly lower triangle, i.e. at pairs (i, j) with j < i, so
# that of two highly correlated features the one appearing LATER in the column
# order is flagged and the earlier one is kept.
is_below_diagonal = np.tril(np.ones(cor_matrix.shape, dtype=bool), k=-1)
has_correlated_predecessor = cor_matrix.where(is_below_diagonal).gt(0.9).any(axis=1)

# Names of the features whose absolute correlation with an earlier feature exceeds 0.9.
col_core = set(has_correlated_predecessor[has_correlated_predecessor].index)
col_core

In [19]:
# Shapes before removing the highly correlated features.
print(train_dataset.shape)
print(test_dataset.shape)

# Drop the columns with a high correlation with other features from both tables.
# A sorted list gives a deterministic argument; errors='ignore' keeps the cell
# re-runnable once the columns have already been removed.
correlated_cols = sorted(col_core)
train_dataset = train_dataset.drop(columns=correlated_cols, errors='ignore')
test_dataset = test_dataset.drop(columns=correlated_cols, errors='ignore')

# Shapes after the removal.
print(train_dataset.shape)
print(test_dataset.shape)

In [20]:
# Separate the label column from the feature columns.
trainY = train_dataset["target"]
trainX = train_dataset.drop(columns="target")

In [21]:
# Report the dimensions of the full tables and of the feature/label split.
for table in (train_dataset, test_dataset, trainX, trainY):
    print(table.shape)

In [22]:
# Keep only the categorical columns that are still present in the feature table.
# This replaces the hard-coded removal of 'D_66' (presumably dropped earlier for
# having too many missing values - verify): it also covers any other categorical
# column removed upstream, and it can be re-run without raising ValueError.
categorical_cols = [col for col in categorical_cols if col in trainX.columns]

In [23]:
# Convert every categorical column to plain strings in both feature tables.
string_dtypes = {col: str for col in categorical_cols}
trainX = trainX.astype(string_dtypes)
test_dataset = test_dataset.astype(string_dtypes)

In [24]:
# One-hot encode the categorical columns of the training features.
trainX = pd.get_dummies(trainX, columns=categorical_cols)

# Encode the test features the same way, then force the exact train column
# layout: indicator columns missing from the test set are added as all-zero
# columns, and columns that do not exist in the train layout are discarded.
test_dataset = (
    pd.get_dummies(test_dataset, columns=categorical_cols)
    .reindex(columns=trainX.columns, fill_value=0)
)

In [25]:
# Report the dimensions again now that the features are one-hot encoded.
for table in (train_dataset, test_dataset, trainX, trainY):
    print(table.shape)

In [26]:
# from sklearn.model_selection import train_test_split

In [27]:
# X_train, X_test, y_train, y_test = train_test_split( trainX, trainY, test_size=0.3, random_state=69)

In [28]:
# from imblearn.over_sampling import SMOTE

# # oversample to handle class imbalance
# oversample = SMOTE()
# X_train, y_train = oversample.fit_resample(X_train, y_train)

In [29]:
# from sklearn.preprocessing import MinMaxScaler
# scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
# X_train = scaling.transform(X_train)
# X_test = scaling.transform(X_test)

In [30]:
# import xgboost as xgb

In [31]:
# xgbClassificationModel = xgb.XGBClassifier(objective = 'reg:logistic', max_depth = 5, seed = 0, n_estimators=1000,eta = 0.05)

In [32]:
# xgbClassificationModel.fit(X_train, y_train)

In [33]:
# def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

#     def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         four_pct_cutoff = int(0.04 * df['weight'].sum())
#         df['weight_cumsum'] = df['weight'].cumsum()
#         df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
#         return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
#     def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
#         total_pos = (df['target'] * df['weight']).sum()
#         df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
#         df['lorentz'] = df['cum_pos_found'] / total_pos
#         df['gini'] = (df['lorentz'] - df['random']) * df['weight']
#         return df['gini'].sum()

#     def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         y_true_pred = y_true.rename(columns={'target': 'prediction'})
#         return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

#     g = normalized_weighted_gini(y_true, y_pred)
#     d = top_four_percent_captured(y_true, y_pred)

#     return 0.5 * (g + d)

In [34]:
# predictions_xgb = xgbClassificationModel.predict_proba(X_test)

In [35]:
# y_true = y_test.to_frame(name = 'target')
# y_true = y_true.reset_index(drop=True)
# y_pred = pd.DataFrame(predictions_xgb[:,1], columns = ['prediction'])

In [36]:
# amex_metric(y_true, y_pred)

In [37]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class to handle class imbalance.
# SMOTE generates synthetic samples at random, so a fixed random_state is needed
# for the resampled training set (and everything trained on it) to be
# reproducible across runs.
oversample = SMOTE(random_state=0)
X_train, y_train = oversample.fit_resample(trainX, trainY)

In [38]:
# Alias, not a copy: X_test refers to the same DataFrame object as test_dataset.
X_test = test_dataset

In [39]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the range [-1, 1]. The scaler learns its per-feature
# minimum/maximum from the training matrix only and the same mapping is then
# applied to the test matrix.
scaling = MinMaxScaler(feature_range=(-1, 1))
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

In [40]:
import xgboost as xgb

In [41]:
# Gradient-boosted tree classifier for the binary default target.
# - 'binary:logistic' is the objective intended for binary classification
#   (the previous 'reg:logistic' is a regression objective).
# - learning_rate / random_state are the scikit-learn style parameter names for
#   the native 'eta' / 'seed' settings used before; the values are unchanged.
xgbClassificationModel = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=5,
    n_estimators=1000,
    learning_rate=0.05,
    random_state=0,
)

In [42]:
# Train the classifier on the oversampled and rescaled training features.
xgbClassificationModel.fit(X_train, y_train)

In [43]:
# Per-class predicted probabilities for the test features; column 1 is the one
# written to the submission file later on.
predictions_xgb = xgbClassificationModel.predict_proba(X_test)

In [44]:
# Number of test rows that received a prediction.
predictions_xgb.shape[0]

In [45]:
# Probability of the positive class (column 1), labelled with the customer_ID of
# the test row it was computed for. A new name is used instead of overwriting
# predictions_xgb, so re-running this cell does not try to slice a 1-D array.
default_probability = pd.Series(
    predictions_xgb[:, 1],
    index=test_dataset.index.astype(str),
    name='prediction',
)

sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')

# Align the predictions with the sample submission by customer_ID instead of by
# row position, so each customer receives its own score even if the two tables
# are ordered differently.
aligned_prediction = default_probability.reindex(sample_dataset['customer_ID'].astype(str))
if aligned_prediction.isna().any():
    raise ValueError(
        f"{int(aligned_prediction.isna().sum())} customer_ID values from the sample "
        "submission have no prediction"
    )

output = pd.DataFrame({
    'customer_ID': sample_dataset.customer_ID,
    'prediction': aligned_prediction.to_numpy(),
})
output.to_csv('Submission XGB v3.csv', index=False)