In [1]:
# Dependencies
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load Data
train_data = pd.read_feather('/kaggle/input/amexfeather/train_data.ftr')
print("The intial dataset shape:", train_data.shape)

The intial dataset shape: (5531451, 191)


In [None]:
train_data.head(5)

In [3]:
train_data = train_data.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()
train_data.shape

(458913, 190)

# DATA PREPROCESSING

**Removing columns with high number of missing data**

In [4]:
null_data=pd.DataFrame((train_data.isnull().sum()/len(train_data))*100, columns=['Null %'])
cols_with_high_null_data = list(null_data[null_data["Null %"]>80].index)
train_data = train_data.drop(cols_with_high_null_data, axis =1)
print("Columns removed from the dataset:", cols_with_high_null_data)
print("Shape without high null value containing columns :", train_data.shape)

Columns removed from the dataset: ['D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']
Shape without high null value containing columns : (458913, 167)


**Removing non-numeric columns**

In [5]:
train_data = train_data.drop(['S_2'], axis =1)
print('Shape without non-numeric columns : ', train_data.shape)

Shape without non-numeric columns :  (458913, 166)


**Categorical Features**

In [6]:
# Run after removing columns to remove D_66
categories=[]
for categorical_column in train_data.select_dtypes(include=['category']).columns:
    categories.append(categorical_column)
print("Identified categorical features : ", categories)

Identified categorical features :  ['D_63', 'D_64', 'D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']


In [7]:
len(categories)

10

In [8]:
for col in categories:
    train_data[col] =  train_data[col].fillna(train_data[col].mode()[0])

**Fill null values**

In [9]:
null_columns = train_data.columns[train_data.isna().any()].tolist()
for col in null_columns:
     train_data[col] = train_data[col].fillna(train_data[col].median())

**Categorical Enconding**

In [10]:
train_data.shape

(458913, 166)

In [11]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

train_data[categories] = enc.fit_transform(train_data[categories])

In [None]:
train_data.shape

In [12]:
test_data = pd.read_feather('/kaggle/input/amexfeather/test_data.ftr')
test_data = test_data.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()

In [13]:
import gc
gc.collect()

23

In [14]:
null_data_1=pd.DataFrame((test_data.isnull().sum()/len(test_data))*100, columns=['Null %'])
cols_with_high_null_data_1 = list(null_data_1[null_data_1["Null %"]>80].index)
test_data = test_data.drop(cols_with_high_null_data_1, axis =1)
print("Columns removed from the dataset:", cols_with_high_null_data_1)
print("Shape without high null value containing columns :", test_data.shape)

Columns removed from the dataset: ['D_42', 'D_49', 'D_66', 'D_73', 'D_76', 'R_9', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']
Shape without high null value containing columns : (924621, 167)


In [15]:
test_data = test_data.drop(['S_2'], axis =1)

In [16]:
categories=[]
for categorical_column in test_data.select_dtypes(include=['category']).columns:
    categories.append(categorical_column)
print("Identified categorical features : ", categories)

Identified categorical features :  ['D_63', 'D_64', 'D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']


In [None]:
len(categories)

In [17]:
for col in categories:
    test_data[col] =  test_data[col].fillna(test_data[col].mode()[0])

In [18]:
null_columns = test_data.columns[test_data.isna().any()].tolist()
for col in null_columns:
     test_data[col] = test_data[col].fillna(test_data[col].median())

In [19]:
test_data[categories] = enc.transform(test_data[categories])

In [None]:
test_data.shape

In [20]:
X = train_data[[col for col in train_data.columns if col not in ['target']]]
y = train_data['target']
# print("X shape :",X.shape)
# print("y shape :", y.shape)

# Train Model

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [22]:
columns = [col for col in train_data.columns if col not in ['target']]

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
predictions_2= rf_model.predict_proba(test_data[columns])


In [None]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_4[:, 1]})
output.to_csv('submission_RFC.csv', index=False)

**KNN**

In [23]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
predictions_3= knn.predict_proba(test_data[columns])

KeyboardInterrupt: 

In [None]:
print(" k = {} , score = {} ".format(3,knn.score(x_test,y_test)))

In [None]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_3[:, 1]})
output.to_csv('submission_KNN.csv', index=False)

**SVM**

In [None]:
from sklearn import svm
SVM = svm.SVC(kernel='linear',C=1.3,degree=8,cache_size=300) 
SVM.fit(x_train, y_train)
predictions_4=SVM.predict(test_data[columns])



In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions_4))

In [None]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_4[:, 1]})
output.to_csv('submission_SVM.csv', index=False)

**XGB Boost**

In [None]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(x_train, y_train)
predictions_5= model.predict_proba(test_data[columns])

In [None]:
sample_dataset = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
output = pd.DataFrame({'customer_ID': sample_dataset.customer_ID, 'prediction': predictions_5[:, 1]})
output.to_csv('submission_XGB.csv', index=False)

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
# y_predict_final = test_prediction[:,1]
# test = test.reset_index()

# submission = pd.DataFrame({"customer_ID":test.customer_ID,"prediction":y_predict_final})

# submission.to_csv('submission.csv', index=False)