### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
import matplotlib.pyplot as plt
import random
import seaborn as sns

# Part 1: Data Preprocessing

## Task 1.1: Exploration and Initial Feature Selection

Reading the dataset.

In [None]:
# low_memory = False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column can never end up holding a mix of parsed
# numbers and raw strings (the cleaning cells below rely on text columns being
# uniformly text).
dataset = pd.read_csv('train.csv', low_memory = False)
print(f'Data shape: {dataset.shape}\n')
print(f'Data features: \n{dataset.dtypes}')

Dropping irrelevant columns. We can infer by common sense that these features don't affect the credit score.

In [None]:
# Columns judged irrelevant to the credit score are discarded up front
irrelevant_columns = ['ID','Customer_ID','Name','SSN','Type_of_Loan']
dataset = dataset.drop(columns = irrelevant_columns)

## Task 1.2: Cleaning and Transformation

Viewing unique values for each column.

In [None]:
def unique_vals_and_type(data: pd.DataFrame, feature: str, cap: int):
  """Print the dtype of one column together with a preview of its distinct values.

  Helps in deciding how a feature has to be cleaned or engineered.

  Parameters
  ----------
  data : frame that holds the column to inspect.
  feature : name of the column.
  cap : maximum number of distinct values to print (in order of first appearance).
  """
  # Read from the frame that was passed in, not from a notebook-level global,
  # so the helper reports on whichever frame the caller hands over.
  column = data[feature]
  preview = column.unique().tolist()[0:cap]
  print(f'The unique values of \'{feature}\' feature of type {column.dtype}: {preview}')

# Preview up to 25 distinct values of every remaining column
for column_name in dataset.columns:
  unique_vals_and_type(dataset, column_name, 25)

Defining some helper functions to use in analysis and cleaning.

In [None]:
def remove_underscores_numeric(data: pd.DataFrame, feature: str, remove_neg: bool = False):
  """Turn a text column of underscore-polluted numbers into a numeric column, in place.

  Parameters
  ----------
  data : frame that holds the column; the column is overwritten.
  feature : name of the column to clean.
  remove_neg : when True, negative values are replaced with NaN after conversion.

  Notes
  -----
  Cells that still cannot be parsed as a number after stripping underscores
  become NaN. Columns that are already numeric skip the text conversion and
  only go through the optional negative-value removal.
  """
  column = data[feature]
  if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
    # Cast every cell to text first: an object column may mix strings with real
    # numbers, and `.str` methods return NaN for the non-string cells, which
    # would silently discard valid values. Missing cells become text that
    # to_numeric coerces straight back to NaN.
    as_text = column.astype(str).str.replace('_', '', regex = False)
    column = pd.to_numeric(as_text, errors = 'coerce')
  if remove_neg:
    # mask() builds a new Series (upcast to float when needed) instead of
    # writing NaN into a possibly integer column through .loc.
    column = column.mask(column < 0)
  data[feature] = column

def count_outliers(data: pd.DataFrame, feature: str, k: int):
  """Return how many values of a column lie more than k standard deviations from its mean.

  Parameters
  ----------
  data : frame that holds the column.
  feature : name of the column to inspect.
  k : number of standard deviations that defines the outlier band.
  """
  values = data[feature]
  center, spread = values.mean(), values.std()
  lower_bound = center - k*spread
  upper_bound = center + k*spread
  below = values < lower_bound
  above = values > upper_bound
  return np.sum(below | above)

def clip_outliers(data: pd.DataFrame, feature: str, k: int):
  """Clip a column, in place, to the band of k standard deviations around its mean.

  Parameters
  ----------
  data : frame that holds the column; the column is overwritten.
  feature : name of the column to clip.
  k : number of standard deviations that defines the allowed band.
  """
  values = data[feature]
  center = values.mean()
  half_width = k*values.std()
  data[feature] = values.clip(lower = center - half_width, upper = center + half_width)

Convert numerical features from 'object' type to their original type, replacing missing data in each with NA values.

In [None]:
# Columns to normalise to numeric, paired with whether negative readings are
# treated as invalid for that column and must become NaN.
# Each column is listed exactly once; cleaning a column a second time is a no-op.
numeric_features_to_clean = [
  ('Age', True),
  ('Num_of_Loan', True),
  ('Num_of_Delayed_Payment', False),
  ('Num_Bank_Accounts', True),
  ('Annual_Income', False),
  ('Changed_Credit_Limit', False),
  ('Outstanding_Debt', False),
  ('Amount_invested_monthly', False),
  ('Monthly_Balance', False),
]
for feature_name, drop_negatives in numeric_features_to_clean:
  remove_underscores_numeric(dataset, feature_name, remove_neg = drop_negatives)

Replacing erroneous values in text data with NA values.

In [None]:
# For each text column, the token that is treated as an erroneous entry
placeholder_by_column = {
  'Occupation': '_______',
  'Credit_Mix': '_',
  'Payment_of_Min_Amount': 'NM',
  'Payment_Behaviour': '!@9#%8',
}
for column_name, placeholder in placeholder_by_column.items():
  dataset[column_name] = dataset[column_name].replace(to_replace = placeholder, value = np.nan)

Extracting data from text columns which have embedded values in a certain form.

In [None]:
# Credit_History_Age: parse "<Y> Years and <M> Months" into one fractional number of years.
# Rows that do not match the pattern yield NaN.
history_pattern = r'(?P<Years>[\d]?[\d]) Years and (?P<Months>[\d]?[\d]) Months'
history_parts = dataset['Credit_History_Age'].str.extract(history_pattern, expand = True)
history_years = history_parts['Years'].astype('float64')
history_months = history_parts['Months'].astype('float64')
dataset['Credit_History_Age'] = history_years + (history_months / 12)

# Payment_Behaviour: split into a spending level and a payment size, which
# replace the original feature as two new columns appended at the end.
behaviour_pattern = r'(?P<Spending_Behavior>High|Low)_spent_(?P<Payment_Volume>Small|Medium|Large)_value_payments'
behaviour_parts = dataset['Payment_Behaviour'].str.extract(behaviour_pattern, expand = True)
without_behaviour = dataset.drop(columns = ['Payment_Behaviour'])
dataset = pd.concat([without_behaviour, behaviour_parts], axis = 1)

Checking unique values for each categorical column after cleaning.

In [None]:
# List the columns that are still of 'object' type and preview their distinct values
object_columns = dataset.select_dtypes(include = 'object').columns
for column_name in object_columns:
  unique_vals_and_type(dataset, column_name, 25)

To decide whether the Occupation and Month columns should be kept, we plot a grouped bar chart of occupation vs. credit score and another one of month vs. credit score.

In [None]:
# Grouped bar chart: number of rows for every (Occupation, Credit_Score) pair
counts = dataset.groupby(['Occupation','Credit_Score'], dropna = True).size().unstack()
fig, ax = plt.subplots(figsize = (10, 5))
counts.plot.bar(stacked = False, ax = ax)
ax.set_xlabel('Occupation')
ax.set_ylabel('Count')
ax.legend(title = 'Credit_Score')
plt.show()

In [None]:
# Grouped bar chart: number of rows for every (Month, Credit_Score) pair
counts = dataset.groupby(['Month','Credit_Score'], dropna = True).size().unstack()
fig, ax = plt.subplots(figsize = (10, 5))
counts.plot.bar(stacked = False, ax = ax)
ax.set_xlabel('Month')
ax.set_ylabel('Count')
ax.legend(title = 'Credit_Score')
plt.show()

The distributions of credit score for each occupation and for each month are nearly the same, so the occupation & month values won't be significant for our model.

In [None]:
# 'Occupation' and 'Month' are removed from the working frame
uninformative_columns = ['Occupation','Month']
dataset = dataset.drop(columns = uninformative_columns)

## Task 1.3: Encoding and Imputation

We'll encode each categorical variable by a numerical ordinal encoding.  
For this notebook, we will map the values before the train-test split, but it will be easy to construct a custom mapper class to take in a dataframe similar to the original and transform its values to the specified mapping before classification.

In [None]:
# Split the columns by dtype for later use: 'object' columns are treated as
# categorical, every other column as numeric
TEXT_DTYPES = ['object']
cat_features = dataset.select_dtypes(include = TEXT_DTYPES)
num_features = dataset.select_dtypes(exclude = TEXT_DTYPES)
cat_features.head()

In [None]:
# Encoding the categorical variables in a separate dataframe.
# The category lists are positional: the i-th list gives the low-to-high order for
# the i-th column of cat_features.
# NOTE(review): presumably Credit_Mix, Payment_of_Min_Amount, Credit_Score,
# Spending_Behavior, Payment_Volume - verify against cat_features.columns.
encoder = OrdinalEncoder(
  categories = [['Bad','Standard','Good'],['No','Yes'],['Poor','Standard','Good'],['Low','High'],['Small','Medium','Large']],
  handle_unknown = 'use_encoded_value',
  unknown_value = np.nan
)
# Reuse the source index so the encoded rows stay aligned with `dataset` when the
# two frames are later concatenated column-wise, whatever index `dataset` carries.
encoded_cat_features = pd.DataFrame(
  data = encoder.fit_transform(cat_features),
  columns = cat_features.columns,
  index = cat_features.index
)
encoded_cat_features.head()

In [None]:
# Swap the raw categorical columns for their encoded counterparts:
# non-categorical columns first, encoded columns appended after them
non_categorical = dataset.drop(columns = cat_features.columns)
encoded_data = pd.concat([non_categorical, encoded_cat_features], axis = 1)
encoded_data.head()

Describe the dataset to see summaries of each feature.

In [None]:
# Summary statistics of the cleaned, not-yet-encoded frame
dataset.describe()

Analyzing the original data for NA value and outlier percentage. We will process the encoded data accordingly after that.

In [None]:
def count_na_and_outliers(dataset: pd.DataFrame):
  """Summarise outlier and NA counts for every column of a frame.

  Outliers are values more than 3 standard deviations from the column mean;
  they are only counted for non-'object' columns, and 'object' columns get NaN
  in both outlier fields. Percentages are relative to the number of rows.

  Returns a DataFrame with one row per input column and the columns
  'Feature', 'Outliers', 'Outlier%', 'NA' and 'NA%'.
  """
  total_rows = dataset.shape[0]
  summary = {'Feature': [], 'Outliers': [], 'Outlier%': [], 'NA': [], 'NA%': []}

  for name in dataset.columns:
    if dataset[name].dtype == 'object':
      # Outliers are undefined for text columns
      outlier_count, outlier_share = np.nan, np.nan
    else:
      outlier_count = count_outliers(dataset, name, 3)
      outlier_share = outlier_count/total_rows * 100
    na_count = dataset[name].isna().sum()

    summary['Feature'].append(name)
    summary['Outliers'].append(outlier_count)
    summary['Outlier%'].append(outlier_share)
    summary['NA'].append(na_count)
    summary['NA%'].append(na_count/total_rows * 100)

  return pd.DataFrame(summary)

# Report NA and outlier counts for the cleaned frame (before encoding and imputation)
count_na_and_outliers(dataset)

All outliers seem to be within the 5% range of the whole dataset, so we will leave them as they might belong to the original distribution. On the other hand, NA values will cause problems, and we would incur a noticeable loss of information if we just deleted every row or column that contains NA values, so we will perform data imputation for these values.  
We will remove entries with NA Age or Num_Bank_Accounts as these are not many and won't have a significant effect on the model, then, we will use simple imputation by most frequent value for categorical data and iterative imputation for numerical data.

In [None]:
# Removing rows with NA Age or Num_Bank_Accounts.
# The copy is taken AFTER the row filter so that the column assignments below
# write into an independent frame rather than into the result of a row selection.
imputed_data = encoded_data.dropna(subset = ['Age','Num_Bank_Accounts']).copy()

# Performing data imputation for categorical data (most frequent value per column)
cat_imputer = SimpleImputer(strategy = 'most_frequent')
cat_list = ['Credit_Mix','Payment_of_Min_Amount','Spending_Behavior','Payment_Volume']
imputed_data[cat_list] = cat_imputer.fit_transform(imputed_data[cat_list])

# Performing data imputation for numerical data (iterative imputation, seeded for reproducibility)
num_imputer = IterativeImputer(max_iter = 10, random_state = 42)
num_list = ['Monthly_Inhand_Salary','Num_of_Loan','Num_of_Delayed_Payment','Changed_Credit_Limit','Num_Credit_Inquiries','Credit_History_Age','Amount_invested_monthly','Monthly_Balance']
imputed_data[num_list] = num_imputer.fit_transform(imputed_data[num_list])

# Analyzing NA and Outlier counts for the new data
count_na_and_outliers(imputed_data)

In [None]:
# Summary statistics after row removal and imputation
imputed_data.describe()

# Part 2: Training

## Task 2.1: Splitting The Data

## Task 2.2: Normalizing The Training Set

## Task 2.3: Model Training

# Part 3: Testing & Evaluation

## Task 3.1: Running The Models

## Task 3.2: Evaluation Results 

## Task 3.3: Evaluation Results' Plotting