Firstly, we need to import all necessary libraries.


#Necessary imports



In [180]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from  sklearn.model_selection import GridSearchCV 
from sklearn.impute import SimpleImputer

Then, we load the file

In [181]:
df = pd.read_csv("creditCard.csv")
labels  = pd.read_csv("creditCardLabel.csv")


And check the first 5 rows

In [182]:
df.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2


We can already see that the 1st row will cause trouble for our models, that's why we can proceed with deletion

In [183]:
df_transformation = df.copy()
df_transformation.drop(df.iloc[0].name, inplace=True)

Let's investigate unique values of each columns to see if there is any irrelevant information

In [184]:
for i in df.columns:
    for column in df.columns:
        unique_values = df[column].unique()
        print(f"Unique values for column '{column}': {unique_values}")


Unique values for column 'Ind_ID': [5008827 5009744 5009746 ... 5115992 5118219 5053790]
Unique values for column 'GENDER': ['M' 'F' nan]
Unique values for column 'Car_Owner': ['Y' 'N']
Unique values for column 'Propert_Owner': ['Y' 'N']
Unique values for column 'CHILDREN': [ 0  1  2  4  3 14]
Unique values for column 'Annual_income': [ 180000.   315000.        nan  450000.    90000.   472500.   270000.
  126000.   202500.   157500.   112500.   540000.   292500.   135000.
   76500.   215100.   225000.    67500.   171000.   103500.    99000.
  391500.    65250.    72900.   360000.   256500.   675000.   247500.
   85500.   121500.   130500.   211500.    81000.    72000.   148500.
  162000.   195750.   585000.   216000.   306000.   108000.    63000.
   45000.   337500.   131400.   117000.   445500.   234000.  1575000.
  144000.    67050.    73350.   193500.   900000.    94500.   198000.
   54000.   166500.   167400.   153000.   423000.   243000.   283500.
  252000.   495000.   612000.    

As we can see, ```["EDUCATION"]``` columns consist of both Academic Degree and Higher Education, it will add unnecesary feature when we'd perform encoding of categorical variables. We should take care of it.

In [185]:
df_transformation[["EDUCATION"]] = df_transformation[["EDUCATION"]].replace("Academic degree", "Higher Education") 

Let's investigate the ```[Car_owner]``` and ```[Propert_Owner]``` columns

In [186]:
uniqVal = pd.concat([df['Car_Owner'], df['Propert_Owner'],]).unique()

print(uniqVal)

['Y' 'N']


As we can see, they are boolean columns, so we should make them boolean instead of string

In [187]:
df_transformation['Car_Owner'] = df['Car_Owner'].map({'Y': 1, 'N': 0})
df_transformation['Propert_Owner'] = df['Propert_Owner'].map({'Y': 1, 'N': 0})


Columns ```[Employed_years]``` and ```[Birthday_count]``` appear to be quite hard to read, let's change that.

In [188]:
# Convert Employed_days to years
df_transformation['Employed_years'] = df_transformation['Employed_days'] / 365

# Convert Birthday_count to years
df_transformation['Age'] = df_transformation['Birthday_count'] / -365

# Round the values to two decimal places
df_transformation['Employed_years'] = df_transformation['Employed_years'].round(2)
df_transformation['Age'] = df_transformation['Age'].round(2)

# Drop the original columns
df_transformation.drop(['Employed_days', 'Birthday_count'], axis=1, inplace=True)




Additionally, we can change ```[Age]``` column further, by rounding, so we can apply methods related to discrete mathematics and not continuous

In [189]:

# Create a new column 'Age Bracket' based on the 'Age' column
df_transformation['Age'].round(2) 


1       37.14
2         NaN
3       37.14
4       37.14
5       37.14
        ...  
1543    32.76
1544    28.02
1545    36.09
1546    41.90
1547    45.48
Name: Age, Length: 1547, dtype: float64

Perhaps, we only need to know if person is employed or not, regardless of the time employed

In [190]:
df_transformation['Is_Employed'] = (df_transformation['Employed_years'] < 0).astype(int)


Becuase column ```[GENDER]``` also consist of letters, I've decided to change the name of the column to ```[isMale]```, and make it binary

In [191]:
# Convert Employed_days to years
df_transformation['isMale'] = df_transformation['GENDER'].apply(lambda x: 1 if x == 'M' else 0)
df_transformation.drop(["GENDER"],axis=1,inplace=True)

In [192]:
df_transformation

Unnamed: 0,Ind_ID,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,Employed_years,Age,Is_Employed,isMale
1,5009744,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1.61,37.14,1,0
2,5009746,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1.61,,1,0
3,5009749,1,0,0,,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1.61,37.14,1,0
4,5009752,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1.61,37.14,1,0
5,5009753,1,0,0,315000.0,Pensioner,Higher education,Married,House / apartment,1,1,1,0,,2,-1.61,37.14,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,5028645,0,1,0,,Commercial associate,Higher education,Married,House / apartment,1,0,0,0,Managers,2,-5.98,32.76,1,0
1544,5023655,0,0,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,1,0,0,0,Accountants,1,-3.31,28.02,1,0
1545,5115992,1,1,2,180000.0,Working,Higher education,Married,House / apartment,1,0,0,0,Managers,4,-6.79,36.09,1,1
1546,5118219,1,0,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,1,1,1,0,Drivers,2,-1.77,41.90,1,1


We should also make column ```Age``` discrete instead of continious.

In [193]:
df_transformation["Age"] = df_transformation["Age"].round(0)
df_transformation['Employed_years'] = df_transformation['Employed_years'].astype(int)
df_transformation

Unnamed: 0,Ind_ID,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,Employed_years,Age,Is_Employed,isMale
1,5009744,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
2,5009746,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,,1,0
3,5009749,1,0,0,,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
4,5009752,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
5,5009753,1,0,0,315000.0,Pensioner,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,5028645,0,1,0,,Commercial associate,Higher education,Married,House / apartment,1,0,0,0,Managers,2,-5,33.0,1,0
1544,5023655,0,0,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,1,0,0,0,Accountants,1,-3,28.0,1,0
1545,5115992,1,1,2,180000.0,Working,Higher education,Married,House / apartment,1,0,0,0,Managers,4,-6,36.0,1,1
1546,5118219,1,0,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,1,1,1,0,Drivers,2,-1,42.0,1,1


In [194]:
df_transformation_imputation = df_transformation.copy()
df_transformation_imputation

Unnamed: 0,Ind_ID,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,Employed_years,Age,Is_Employed,isMale
1,5009744,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
2,5009746,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,,1,0
3,5009749,1,0,0,,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
4,5009752,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
5,5009753,1,0,0,315000.0,Pensioner,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,5028645,0,1,0,,Commercial associate,Higher education,Married,House / apartment,1,0,0,0,Managers,2,-5,33.0,1,0
1544,5023655,0,0,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,1,0,0,0,Accountants,1,-3,28.0,1,0
1545,5115992,1,1,2,180000.0,Working,Higher education,Married,House / apartment,1,0,0,0,Managers,4,-6,36.0,1,1
1546,5118219,1,0,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,1,1,1,0,Drivers,2,-1,42.0,1,1


One way of managing missing values is by using imputation techniques. For now, we will use ```SimpleImputer``` from ```scikit-learn``` library.

In [195]:
# Import the SimpleImputer class
# Create an instance of SimpleImputer with the "most_frequent" strategy
imputer = SimpleImputer(missing_values=np.nan,strategy="most_frequent")

# Use the imputer to fill missing values in the "Type_Occupation" column
imputer.fit(df_transformation_imputation)
df_transformation_imputation = pd.DataFrame(imputer.transform(df_transformation_imputation))
df_transformation_imputation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,5009744,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,Laborers,2,-1,37.0,1,0
1,5009746,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,Laborers,2,-1,42.0,1,0
2,5009749,1,0,0,135000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,Laborers,2,-1,37.0,1,0
3,5009752,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,Laborers,2,-1,37.0,1,0
4,5009753,1,0,0,315000.0,Pensioner,Higher education,Married,House / apartment,1,1,1,0,Laborers,2,-1,37.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,5028645,0,1,0,135000.0,Commercial associate,Higher education,Married,House / apartment,1,0,0,0,Managers,2,-5,33.0,1,0
1543,5023655,0,0,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,1,0,0,0,Accountants,1,-3,28.0,1,0
1544,5115992,1,1,2,180000.0,Working,Higher education,Married,House / apartment,1,0,0,0,Managers,4,-6,36.0,1,1
1545,5118219,1,0,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,1,1,1,0,Drivers,2,-1,42.0,1,1


Another way of working with missing data is just ignoring it, this will be the next approach.

In [196]:
df_transformation_ignoringNA = df_transformation.copy()
df_transformation_ignoringNA

Unnamed: 0,Ind_ID,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,Employed_years,Age,Is_Employed,isMale
1,5009744,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
2,5009746,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,,1,0
3,5009749,1,0,0,,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
4,5009752,1,0,0,315000.0,Commercial associate,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
5,5009753,1,0,0,315000.0,Pensioner,Higher education,Married,House / apartment,1,1,1,0,,2,-1,37.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,5028645,0,1,0,,Commercial associate,Higher education,Married,House / apartment,1,0,0,0,Managers,2,-5,33.0,1,0
1544,5023655,0,0,0,225000.0,Commercial associate,Incomplete higher,Single / not married,House / apartment,1,0,0,0,Accountants,1,-3,28.0,1,0
1545,5115992,1,1,2,180000.0,Working,Higher education,Married,House / apartment,1,0,0,0,Managers,4,-6,36.0,1,1
1546,5118219,1,0,0,270000.0,Working,Secondary / secondary special,Civil marriage,House / apartment,1,1,1,0,Drivers,2,-1,42.0,1,1


What we should do next is changing the categorical variables into numerical, in order to do so we will use LabelEncoding (for now), so the NaN values will also be granted a label, that's one approach we will use

In [197]:
# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate over each column in the DataFrame
for column in df_transformation.columns:
    # Check if the column is of object type (categorical)
    if df_transformation[column].dtype == 'object':
        # Use label encoder to transform the column
        df_transformation[column] = label_encoder.fit_transform(df_transformation[column])



According to data source, non-positive value of column ```Employed Days``` indicated the years the person is working. However, it is more logical, that this column should be positive when person is working and negative when not.

In [198]:
df_transformation["Employed_years"] = df_transformation["Employed_years"] * -1


In [199]:
df_transformation

Unnamed: 0,Ind_ID,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,Employed_years,Age,Is_Employed,isMale
1,5009744,1,0,0,315000.0,0,1,1,1,1,1,1,0,18,2,1,37.0,1,0
2,5009746,1,0,0,315000.0,0,1,1,1,1,1,1,0,18,2,1,,1,0
3,5009749,1,0,0,,0,1,1,1,1,1,1,0,18,2,1,37.0,1,0
4,5009752,1,0,0,315000.0,0,1,1,1,1,1,1,0,18,2,1,37.0,1,0
5,5009753,1,0,0,315000.0,1,1,1,1,1,1,1,0,18,2,1,37.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,5028645,0,1,0,,0,1,1,1,1,0,0,0,10,2,5,33.0,1,0
1544,5023655,0,0,0,225000.0,0,2,3,1,1,0,0,0,0,1,3,28.0,1,0
1545,5115992,1,1,2,180000.0,3,1,1,1,1,0,0,0,10,4,6,36.0,1,1
1546,5118219,1,0,0,270000.0,3,4,0,1,1,1,1,0,4,2,1,42.0,1,1


### Hypothesis testing


Now we can proceed with some hypothesis testing or other statistical test in order to find out something more.

Shapiro-Wilk Test

In [200]:
for column in df_transformation.select_dtypes(include=np.number).columns:
    stat, p = shapiro(df_transformation[column])
    print('Column:', column)
    print('Test Statistic:', stat)
    if(p > 0.05):
        print('Probably Gaussian')
    else:
        print('Probably not Gaussian')


Column: Ind_ID
Test Statistic: 0.9490448356105866
Probably not Gaussian
Column: Car_Owner
Test Statistic: 0.6226974818552935
Probably not Gaussian
Column: Propert_Owner
Test Statistic: 0.6019987030301852
Probably not Gaussian
Column: CHILDREN
Test Statistic: 0.5506461588590481
Probably not Gaussian
Column: Annual_income
Test Statistic: nan
Probably not Gaussian
Column: Type_Income
Test Statistic: 0.742107350724974
Probably not Gaussian
Column: EDUCATION
Test Statistic: 0.6185074818915145
Probably not Gaussian
Column: Marital_status
Test Statistic: 0.7105914736188281
Probably not Gaussian
Column: Housing_type
Test Statistic: 0.33994336455900376
Probably not Gaussian
Column: Mobile_phone
Test Statistic: 1.0
Probably Gaussian
Column: Work_Phone
Test Statistic: 0.4986439868343323
Probably not Gaussian
Column: Phone
Test Statistic: 0.581621744266775
Probably not Gaussian
Column: EMAIL_ID
Test Statistic: 0.3269528126775406
Probably not Gaussian
Column: Type_Occupation
Test Statistic: 0.88020

  res = hypotest_fun_out(*samples, **kwds)


In [201]:
df_transformation.dropna(inplace = True)

In [202]:
df_transformation.isna().sum()

Ind_ID             0
Car_Owner          0
Propert_Owner      0
CHILDREN           0
Annual_income      0
Type_Income        0
EDUCATION          0
Marital_status     0
Housing_type       0
Mobile_phone       0
Work_Phone         0
Phone              0
EMAIL_ID           0
Type_Occupation    0
Family_Members     0
Employed_years     0
Age                0
Is_Employed        0
isMale             0
dtype: int64

### EDA using visualizations


### MODEL 1 - HistGradientBoosting 

In [208]:
df_with_labels = pd.merge(df_transformation_imputation, labels, on='Ind_ID', how='left')
df_with_labels


KeyError: 'Ind_ID'

In [209]:
X = df_with_labels.drop("label",axis = 1)
X

NameError: name 'df_with_labels' is not defined

In [None]:
y = df_with_labels['label']
y

: 

In [None]:
model1_ignored_na = HistGradientBoostingClassifier()
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

model1_ignored_na.fit(X_train,y_train)


: 

In [None]:
model1_ignored_na.score(X_test,y_test)

: 

In [None]:
y_pred = model1_ignored_na.predict(X_test)

: 

In [None]:
report_raw = classification_report(y_test, y_pred)
report_table_raw = pd.DataFrame([x.split() for x in report_raw.split('\n')[2:-5]], columns=['class', 'precision', 'recall', 'f1-score', 'support'])
print(report_table_raw)


: 

In [None]:
model1_ignored_na_PARAMS = {
    "learning_rate": [0.1, 0.01, 0.001],
    "max_iter": [100, 200, 300],
    "max_leaf_nodes": [31, 41, 51],
    "min_samples_leaf": [20, 30, 40]
}

: 

In [None]:
model1_ignored_na_grid = GridSearchCV(model1_ignored_na, model1_ignored_na_PARAMS, cv=5, n_jobs=-1)
model1_ignored_na_grid.fit(X_train, y_train)
model1_ignored_na_grid.best_params_

: 

In [None]:
model1_ignored_na_grid.score(X_test, y_test)
report_GSCV = classification_report(y_test, model1_ignored_na_grid.predict(X_test))
report_table_GSCV = pd.DataFrame([x.split() for x in report_GSCV.split('\n')[2:-5]], columns=['class', 'precision', 'recall', 'f1-score', 'support'])
print(report_GSCV)


: 

In [None]:
print(model1_ignored_na_grid.score(X_test, y_test))

: 