In [1]:
# Dependencies
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load Data
train_data = pd.read_feather('/kaggle/input/amexfeather/train_data.ftr')
print("The intial dataset shape:", train_data.shape)

The intial dataset shape: (5531451, 191)


In [3]:
train_data.head(5)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,,,0.002426,0.003706,0.003819,,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,,,0.003956,0.003166,0.005032,,0.009575,0.005493,0.009216,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954102,0.091492,0.021652,1.009766,0.006817,0.123962,0.007599,0.009422,...,,,0.003269,0.007328,0.000427,,0.003429,0.006985,0.002604,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960449,0.002455,0.013687,1.00293,0.001372,0.117188,0.000685,0.005531,...,,,0.006119,0.004517,0.003201,,0.008423,0.006527,0.009598,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947266,0.002483,0.01519,1.000977,0.007607,0.11731,0.004654,0.009308,...,,,0.003672,0.004944,0.008888,,0.00167,0.008125,0.009827,0


In [4]:
train_data = train_data.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()
train_data.shape

(458913, 190)

# DATA PREPROCESSING

**Removing columns with high number of missing data**

In [5]:
null_data=pd.DataFrame((train_data.isnull().sum()/len(train_data))*100, columns=['Null %'])
cols_with_high_null_data = list(null_data[null_data["Null %"]>80].index)
train_data = train_data.drop(cols_with_high_null_data, axis =1)
print("Columns removed from the dataset:", cols_with_high_null_data)
print("Shape without high null value containing columns :", train_data.shape)

Columns removed from the dataset: ['D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']
Shape without high null value containing columns : (458913, 167)


**Removing non-numeric columns**

In [6]:
train_data = train_data.drop(['S_2'], axis =1)
print('Shape without non-numeric columns : ', train_data.shape)

Shape without non-numeric columns :  (458913, 166)


**Categorical Features**

In [7]:
# Run after removing columns to remove D_66
categories=[]
for categorical_column in train_data.select_dtypes(include=['category']).columns:
    categories.append(categorical_column)
print("Identified categorical features : ", categories)

Identified categorical features :  ['D_63', 'D_64', 'D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']


In [8]:
len(categories)

10

In [9]:
for col in categories:
    train_data[col] =  train_data[col].fillna(train_data[col].mode()[0])

**Fill null values**

In [10]:
null_columns = train_data.columns[train_data.isna().any()].tolist()
for col in null_columns:
     train_data[col] = train_data[col].fillna(train_data[col].median())

**Categorical Enconding**

In [None]:
train_data.shape

In [11]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

train_data[categories] = enc.fit_transform(train_data[categories])

In [12]:
train_data.shape

(458913, 166)

In [13]:
test_data = pd.read_feather('/kaggle/input/amexfeather/test_data.ftr')
test_data = test_data.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()

In [14]:
import gc
gc.collect()

23

In [15]:
null_data_1=pd.DataFrame((test_data.isnull().sum()/len(test_data))*100, columns=['Null %'])
cols_with_high_null_data_1 = list(null_data_1[null_data_1["Null %"]>80].index)
test_data = test_data.drop(cols_with_high_null_data_1, axis =1)
print("Columns removed from the dataset:", cols_with_high_null_data_1)
print("Shape without high null value containing columns :", test_data.shape)

Columns removed from the dataset: ['D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']
Shape without high null value containing columns : (924621, 167)


In [16]:
test_data = test_data.drop(['S_2'], axis =1)

In [17]:
categories=[]
for categorical_column in test_data.select_dtypes(include=['category']).columns:
    categories.append(categorical_column)
print("Identified categorical features : ", categories)

Identified categorical features :  ['D_63', 'D_64', 'D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']


In [18]:
len(categories)

10

In [19]:
for col in categories:
    test_data[col] =  test_data[col].fillna(test_data[col].mode()[0])

In [20]:
null_columns = test_data.columns[test_data.isna().any()].tolist()
for col in null_columns:
     test_data[col] = test_data[col].fillna(test_data[col].median())

In [21]:
test_data[categories] = enc.transform(test_data[categories])

In [None]:
test_data.shape

In [22]:
X = train_data[[col for col in train_data.columns if col not in ['target']]]
y = train_data['target']
# print("X shape :",X.shape)
# print("y shape :", y.shape)

# Train Model

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [24]:
columns = [col for col in train_data.columns if col not in ['target']]

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
predictions_2= rf_model.predict_proba(test_data[columns])


In [None]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_4[:, 1]})
output.to_csv('submission_RFC.csv', index=False)

**KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
predictions_3= knn.predict_proba(test_data[columns])

In [None]:
print(" k = {} , score = {} ".format(3,knn.score(x_test,y_test)))

In [None]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_3[:, 1]})
output.to_csv('submission_KNN.csv', index=False)

**SVM**

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_classification
# X, y = make_classification(n_features=4, random_state=0)
clf = make_pipeline(StandardScaler(),LinearSVC(random_state=0, tol=1e-5))
clf.fit(x_train, y_train)
predictions_4=SVM.predict(test_data[columns])



In [None]:
print("Accuracy:",metrics.accuracy_score(test_data[columns], predictions_4))

In [None]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_4[:, 1]})
output.to_csv('submission_SVM.csv', index=False)

**XGB Boost**

In [25]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(x_train, y_train)
predictions_5= model.predict_proba(test_data[columns])

In [61]:
y_pred2 = model.predict(x_test)
print("Accuracy of Model::",accuracy_score(y_test,y_pred2))

Accuracy of Model:: 0.8995456674983384


In [55]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_5[:, 1]})
output.to_csv('submission_XGB.csv', index=False)

In [None]:
# def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

#     def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         four_pct_cutoff = int(0.04 * df['weight'].sum())
#         df['weight_cumsum'] = df['weight'].cumsum()
#         df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
#         return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
#     def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = (pd.concat([y_true, y_pred], axis='columns')
#               .sort_values('prediction', ascending=False))
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
#         total_pos = (df['target'] * df['weight']).sum()
#         df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
#         df['lorentz'] = df['cum_pos_found'] / total_pos
#         df['gini'] = (df['lorentz'] - df['random']) * df['weight']
#         return df['gini'].sum()

#     def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         y_true_pred = y_true.rename(columns={'target': 'prediction'})
#         return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

#     g = normalized_weighted_gini(y_true, y_pred)
#     d = top_four_percent_captured(y_true, y_pred)

#     return 0.5 * (g + d)

In [None]:
# y_predict_final = test_prediction[:,1]
# test = test.reset_index()

# submission = pd.DataFrame({"customer_ID":test.customer_ID,"prediction":y_predict_final})

# submission.to_csv('submission.csv', index=False)