In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [2]:
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import seaborn as sb
import random
import gc
# NOTE(review): the value returned here is neither assigned nor the last
# expression of the cell, so this call appears to have no lasting effect —
# confirm, then either assign the colormap or remove the line.
sb.color_palette("Spectral", as_cmap=True)



# Directory searched by load_data for the feature file (`<TRAIN_FILE>.ftr`).
TRAIN_DATA_PATH = "/kaggle/input/parquet-files-amexdefault-prediction/"
# File stems (without extension) of the feature table and the label table.
TRAIN_FILE = "train_data"
TRAIN_LABELS = "train_labels"



In [3]:
#Convert .csv to feather and save
# The labels ship as CSV; they are rewritten as Feather under /kaggle/working/
# because load_data reads the labels from that location.
CSV_DATA_PATH = "/kaggle/input/amex-default-prediction/"
df = pd.read_csv(os.path.join(CSV_DATA_PATH,TRAIN_LABELS+".csv"))
path=os.path.join("/kaggle/working/",TRAIN_LABELS+".ftr")
# Remove a stale copy through the computed path. The previous hard-coded file
# name would silently stop matching `path` as soon as TRAIN_LABELS changed.
if Path(path).exists():
    os.remove(path)
print(f"Location of Feather File {path}")
df.to_feather(path)

In [4]:
def load_data(fpath,fname,flabels):
    """Read the feature table and the label table from Feather files.

    Parameters
    ----------
    fpath : str
        Directory that contains ``<fname>.ftr``.
    fname : str
        File stem (no extension) of the feature table.
    flabels : str
        File stem (no extension) of the label table, looked up under
        ``/kaggle/working/``.

    Returns
    -------
    tuple
        ``(features, labels)`` as two DataFrames.

    Raises
    ------
    FileNotFoundError
        If either file is missing. Returning ``None`` here would only defer
        the failure to the caller's tuple unpacking, with a less useful error.
    """
    file_path = Path(os.path.join(fpath,fname+".ftr"))
    file_labels = Path(os.path.join("/kaggle/working/",flabels+".ftr"))

    missing = [str(p) for p in (file_path, file_labels) if not p.exists()]
    if missing:
        print("No Such File")
        raise FileNotFoundError(f"Missing input file(s): {', '.join(missing)}")

    print(f"{file_path}, and {file_labels} are available")
    df1 = pd.read_feather(file_path,use_threads=True)
    df2 = pd.read_feather(file_labels,use_threads=True)
    return df1,df2
 

In [5]:
%%time
# Unpack the (features, labels) pair returned by load_data, then print a
# column/dtype summary of the feature frame.
train_x,train_y = load_data(TRAIN_DATA_PATH,TRAIN_FILE,TRAIN_LABELS )
train_x.info()

In [6]:
# Columns handled as categorical in the cells below.
categorical_var = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
# Everything that must not be treated as a numeric feature: the merge key,
# the S_2 column (parsed as a datetime later) and the categorical columns.
discard = ["customer_ID", "S_2", *categorical_var]
numeric_cols = list(set(train_x.columns) - set(discard))

In [7]:
%%time

# Echo, in frame order, every column that appears on the categorical list.
for col in train_x.columns:
    if col not in categorical_var:
        continue
    print(col)

#num_cols = train_x._get_numeric_data().columns
#cat_var = list(set(train_x.columns)-set(num_cols))
#cat_var

In [8]:
%%time
from scipy.stats import chi2_contingency
# Chi-square test of independence between categorical columns.
# H0: Categorical variables are not correlated
# H1: Categorical variables are highly correlated
# NOTE(review): only columns that are direct neighbours in the frame are
# compared, not every categorical pair — confirm this is intended.
drop_catvar=[]
cols = train_x.columns.to_list()
for col1, col2 in zip(cols, cols[1:]):
    if col1 not in categorical_var or col2 not in categorical_var:
        continue

    result = pd.crosstab(index=train_x[col1],columns=train_x[col2])
    chi2 = chi2_contingency(result)  # element [1] is the p-value

    if chi2[1] >= 0.05:
        print(f"{col1} and {col2} are not correlated, p-value: {chi2[1]}")
    else:
        print(f"{col1} and {col2} are correlated, p-value: {chi2[1]}")
        # Drop only when H0 is rejected. The append used to sit outside the
        # if/else, which dropped col2 after every test whatever its outcome.
        drop_catvar.append(col2)

# Keep the original list order so later cells iterate deterministically.
categorical_var = [col for col in categorical_var if col not in drop_catvar]

In [9]:
# Attach the label columns to every feature row via the shared customer_ID key.
df_temp = train_x.merge(train_y, on=["customer_ID"])

In [10]:
#Two-Way Table - Conditional Probability for Categorical Variable
# Both axes are read from the merged frame so that target and category values
# always belong to the same row. Taking the category from train_x only worked
# as long as the merge happened to preserve train_x's row index.
for col in categorical_var:
    result = pd.crosstab(index=df_temp['target'],columns=df_temp[col],normalize="index",margins=True,dropna=True)
    print(result)
    print("*"*20)
# The merged frame is only needed for these tables; release it.
del df_temp
gc.collect()

In [11]:
# Show the first five values of D_63.
print(train_x["D_63"].head(5))

In [12]:
# One-hot encode the remaining categorical columns in a single call.
# get_dummies(columns=...) drops the encoded columns and appends the
# "<column>_<level>" indicators after the untouched ones — the same layout the
# per-column join/drop loop produced, without copying the frame once per column.
encode_cols = [col for col in train_x.columns if col in categorical_var]
if encode_cols:  # nothing to do when the cell is re-run after encoding
    train_x = pd.get_dummies(train_x, columns=encode_cols)
train_x.head()

In [13]:
%%time
# Parse S_2 as datetimes, then coerce every numeric feature to a numeric dtype.
train_x["S_2"]=pd.to_datetime(train_x["S_2"])

for col in numeric_cols:
    try:
        train_x[col]=pd.to_numeric(train_x[col])
    except (ValueError, TypeError) as err:
        # Catch only conversion failures and say which column failed and why;
        # a bare `except` also hid unrelated errors such as KeyboardInterrupt.
        print(f"Casting Error in {col}: {err}")

In [14]:
#Missing Value
# Percentage of missing entries per column, rounded to two decimals.
missing_pct = (train_x.isna().sum() * 100 / len(train_x)).round(2)
col_name = missing_pct.index.to_list()
col_value = missing_pct.astype(float).to_list()

# Two-column summary ordered from the sparsest column downwards.
missing_val_df = pd.DataFrame(
    {"Col": col_name, "Missing Value in Percentage": col_value}
).sort_values("Missing Value in Percentage", ascending=False)


In [15]:
import matplotlib.pyplot as plt
# Bar chart of the per-column missing-value percentage.
plt.figure(figsize=(80,50))
# The legend text is attached to the bars themselves. Passing the string
# "AMEX" to plt.legend() is read as a sequence of labels, so the legend showed
# only its first character.
plt.bar(missing_val_df["Col"],missing_val_df["Missing Value in Percentage"],width=0.8, color='green', label="AMEX")
plt.xticks(rotation=90)
plt.xlabel("Column Name", fontsize=18)
plt.ylabel("Percentage",fontsize=18)
plt.title("Missing Value",fontsize=20)
plt.legend(fontsize=18)
plt.show()
# The summary table is not used after this chart; release it.
del missing_val_df
gc.collect()

In [16]:
# Keep only the columns that contain no missing value at all.
train_x = train_x.dropna(axis="columns")
train_x.head()

In [17]:
#Number of columns
# Indicator columns created by the one-hot step are named "<categorical>_<level>".
# Matching on the name plus "_" avoids catching an unrelated column whose
# name merely begins with the same characters.
catcols = [
    col
    for col in train_x.columns
    if any(col.startswith(f"{cat_var}_") for cat_var in categorical_var)
]

# Keep frame order rather than converting through a set: set iteration order
# for strings changes between kernel sessions, which made every later cell
# that walks numeric_cols (plots, adjacent pairs) non-reproducible.
excluded = set(discard) | set(catcols)
numeric_cols = [col for col in train_x.columns if col not in excluded]
len(numeric_cols)

In [18]:
#Detection of Class Imbalance
# Label frame without the customer key; only the remaining label column(s) stay.
target = train_y.drop(columns="customer_ID")
target.tail()

In [23]:
def class_imbalance(df):
    """Print and plot the share of class 1 and class 0 in ``df["target"]``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``target`` column; only rows equal to 0 or 1 are counted.

    Raises
    ------
    ValueError
        If ``df`` holds no row with target 0 or 1 (the percentages would be
        undefined).
    """
    n_yes = int((df["target"] == 1).sum())
    n_no = int((df["target"] == 0).sum())
    total = n_yes + n_no
    if total == 0:
        raise ValueError("class_imbalance needs at least one row with target 0 or 1")

    pyes = n_yes*100/total
    pno = 100-pyes
    print("Percentage of Class 1:",round(pyes,2),"%")
    print("Percentage of Class 0:",round(pno,2),"%")
    # Decide on the integer counts; comparing the two float percentages for
    # inequality is sensitive to rounding.
    if n_yes != n_no:
        print("Class Imbalance Exsists\n")
    else:
        print("No Class Imbalance Exsists\n")

    plt.figure(figsize=(10,10))
    xlab = ["1","0"]
    xpos =np.arange(len(xlab))
    ylab=[pyes/100,pno/100]
    plt.bar(xpos,ylab,width=0.4,alpha=0.7, color="green")
    plt.xticks(xpos,xlab)
    plt.title("Class Imblance")
    # No legend: the bars carry no label, so plt.legend() drew nothing and
    # only emitted a "no artists with labels" warning.
    plt.show()

In [24]:
# Class shares of the label frame before any resampling.
class_imbalance(target)

In [25]:
#Down sampling: shrink the larger class to the size of the smaller one (1:1)
def down_sampling(trainx,trainy,random_state=None):
    """Balance the two target classes by sampling the larger one down.

    Parameters
    ----------
    trainx : DataFrame
        Feature rows; must contain ``customer_ID``.
    trainy : DataFrame
        Labels; must contain ``customer_ID`` and ``target``.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` so the draw can be repeated.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(features, target)`` — the balanced feature frame (without
        ``target``) and the matching ``target`` Series, class 0 rows first.
    """
    df_merge = pd.merge(trainx,trainy,on="customer_ID")
    df_merge_1 = df_merge[df_merge['target']==1]
    df_merge_0 = df_merge[df_merge['target']==0]

    # Sample whichever class is larger. Always sampling class 0 raised a
    # ValueError whenever class 1 happened to be the majority.
    n_keep = min(len(df_merge_0), len(df_merge_1))
    if len(df_merge_0) > n_keep:
        df_merge_0 = df_merge_0.sample(n=n_keep, random_state=random_state)
    if len(df_merge_1) > n_keep:
        df_merge_1 = df_merge_1.sample(n=n_keep, random_state=random_state)

    df_dsample = pd.concat([df_merge_0,df_merge_1])
    train_y_dsample =df_dsample["target"]
    train_x_dsample = df_dsample.drop("target",axis=1)
    return train_x_dsample,train_y_dsample

In [26]:
# Balanced copies used from here on: `trainx`/`trainy` (no underscore) are the
# down-sampled data, `train_x`/`train_y` remain the full frames.
trainx,trainy = down_sampling(train_x,train_y)

In [27]:
# class_imbalance expects a frame with a "target" column, so wrap the sampled
# labels before re-checking the class shares.
trainy_df = pd.DataFrame(dict(target=trainy.to_list()))
class_imbalance(trainy_df)

In [28]:
%%time
# Lower-triangle heatmap of the pairwise correlations of the numeric features.
plt.figure(figsize=(30,30))
col_name = train_x.columns.to_list()
corr = trainx[numeric_cols].corr()
# Build the mask from positions, not from the correlation values.
# np.triu(corr) left every upper-triangle cell whose correlation is exactly 0
# unmasked, because 0 is falsy.
matrix_mask=np.triu(np.ones_like(corr, dtype=bool))
sb.heatmap(corr,annot=True,fmt="0.1g",cmap="viridis",mask=matrix_mask)
plt.show()

In [29]:
#update numeric columns
# Greedy pruning: walk the columns in order and drop a column when it is
# strongly correlated (|r| >= 0.9) with at least one column that is still kept.
# Partners are looked up by label in `corr`, so the result no longer depends on
# numeric_cols lining up position-by-position with the rows of `corr`, and a
# column is not dropped because of a partner that has already been removed.
drop_numeric_cols=[]
dropped = set()
strong = corr.abs() >= 0.9  # NaN correlations compare False and are ignored
for col in corr.columns:
    partners = [
        other
        for other in corr.index[strong[col].to_numpy()]
        if other != col and other not in dropped
    ]
    for other in partners:
        print(f"{col} and {other} are highly correlated...")
    if partners:
        drop_numeric_cols.append(col)
        dropped.add(col)

# Filter in place of a set difference to keep the column order stable.
numeric_cols = [col for col in numeric_cols if col not in dropped]
print(f"Dropping columns : {drop_numeric_cols}")
del drop_numeric_cols, dropped, strong
gc.collect()


In [30]:
%%time
# Grid of per-feature distributions, five panels per row, on a row sample.
n_per_row = 5
# Round up so a partial last row still gets panels; squeeze=False keeps `axes`
# two-dimensional even when there is a single row.
n_rows = max(1, int(np.ceil(len(numeric_cols) / n_per_row)))
fig,axes=plt.subplots(n_rows,n_per_row,squeeze=False)
fig.set_figheight(10)
fig.set_figwidth(10)
# Cap the sample size so small frames do not make sample() fail.
df = trainx.sample(min(1000, len(trainx)))

# Pair every column with its own panel. The manual row/column counters
# advanced to the next row without plotting, which skipped every sixth column.
panels = axes.ravel()
for ax, col in zip(panels, numeric_cols):
    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # histplot(..., kde=True, stat="density") is the documented replacement.
    g=sb.distplot(df[col],hist=True,kde=True,ax=ax)
    g.set(title=col)
    g.set(xlabel=None)
    g.set(ylabel=None)

# Hide the panels of the last row that have no column to show.
for ax in panels[len(numeric_cols):]:
    ax.set_visible(False)

fig.subplots_adjust(hspace=1.5)
fig.subplots_adjust(wspace=1)
plt.show()
del df
gc.collect()

In [31]:
%%time
# KDE joint plots for neighbouring (S*, R*) column pairs on a row sample.
# Cap the sample size so small frames do not make sample() fail.
df= trainx.sample(min(1000, len(trainx)))
# NOTE(review): only pairs that are adjacent in numeric_cols are considered,
# so the set of plots depends on the column order — confirm this is intended.
for col1,col2 in zip(numeric_cols,numeric_cols[1:]):
    if col1.startswith("S") and col2.startswith("R"):
        # jointplot creates its own figure; size it through `height`. A
        # preceding plt.figure() call only left an extra empty figure behind.
        sb.jointplot(x=col1,y=col2,data=df,kind="kde",height=5)
        plt.show()
del df
gc.collect()

In [32]:
#Detection of Outliers
from sklearn.ensemble import IsolationForest
def detect_outliers(train_data,col):
    """Flag outliers of one column with an IsolationForest and plot them.

    Parameters
    ----------
    train_data : DataFrame
        Frame that contains ``col``.
    col : str
        Name of the column to analyse.

    Returns
    -------
    numpy.ndarray
        The forest's prediction per row: ``1`` for inliers, ``-1`` for outliers.
    """
    # Build the (n_samples, 1) matrix once, straight from the column, instead
    # of converting it to a Python list twice.
    values = train_data[col].to_numpy(dtype="float64").reshape(-1,1)
    cf = IsolationForest(random_state=224,n_jobs=-1).fit(values)
    predict = cf.predict(values)

    # A scatter plot accepts one colour per point. A line plot takes a single
    # colour, so inliers and outliers could not be told apart in the figure.
    point_colors = np.where(predict == 1, "blue", "black").tolist()
    fig, ax = plt.subplots(figsize=(5,5))
    ax.scatter(train_data.index, values.ravel(), c=point_colors, marker=".", alpha=0.6)
    ax.set_title(f"Outlier Detection of {col} ")
    plt.show()
    plt.close(fig)
    return predict

In [33]:
#Imputation: replace extreme values of a column with the column median
def data_impute(data,col):
    """Replace values beyond 3*IQR from the quartiles with the median, in place.

    Parameters
    ----------
    data : DataFrame
        Frame that is modified in place.
    col : str
        Name of the numeric column to treat.

    Returns
    -------
    None
        ``data[col]`` is overwritten; nothing is returned.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    IQR = q3-q1
    # With a zero (or undefined) IQR both fences collapse onto the quartiles,
    # and the inclusive comparisons below would overwrite every value with the
    # median, turning the column into a constant. Leave such columns untouched.
    if not IQR > 0:
        return
    rng = 3*IQR
    # One median, taken before any replacement, is used for both tails; it
    # used to be recomputed after the upper tail had already been modified.
    median = data[col].median()
    is_outlier = (data[col] >= q3+rng) | (data[col] <= q1-rng)
    data[col]=np.where(is_outlier,median,data[col])

In [34]:
#Impute Numerical Values:
# data_impute modifies the frame it receives, so the down-sampled `trainx` is
# updated column by column.
for col in numeric_cols:
    data_impute(trainx,col)

In [35]:
#Feature Engineering
# Derive the calendar parts from the sampled frame's own S_2 column. Reading
# them from the full `train_x` frame only produced the right rows as long as
# the index labels of both frames happened to line up.
trainx["Days"]=trainx["S_2"].dt.day
trainx["Month"]=trainx["S_2"].dt.month
trainx["Year"]=trainx["S_2"].dt.year

In [36]:
# Keep the customer keys in a plain list, then remove the key column from the
# feature frame (in place) and preview the result.
customer_id = trainx["customer_ID"].to_list()
trainx.drop(columns=["customer_ID"], inplace=True)
trainx.head(5)

In [37]:
#Scale Numerical Values
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Standardise every column outside `discard`, fit a PCA with all components
# and plot the cumulative explained-variance ratio.
num_cols = list(set(trainx.columns)-set(discard))
scaler = StandardScaler()
trainx_scaled = scaler.fit_transform(trainx[num_cols])

pca = PCA()
comp = pca.fit(trainx_scaled)
cumulative_variance = np.cumsum(comp.explained_variance_ratio_)

plt.plot(cumulative_variance, color="green")
plt.grid(axis="both")
plt.xlabel("PRINCIPAL COMPONENTS")
plt.ylabel("VARIANCE")
sb.despine()
plt.show()