### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN

# Part 1: Data Exploration

In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so no column ends up with mixed types.
dataset = pd.read_csv('train.csv', low_memory=False)

  dataset = pd.read_csv('train.csv')


In [3]:
# Report the frame's dimensions first, then the dtype of every column.
frame_shape = dataset.shape
column_dtypes = dataset.dtypes
print(f'Data shape: {frame_shape}\n')
print(f'Data features: \n{column_dtypes}')

Data shape: (100000, 28)

Data features: 
ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance       

#### Dropping irrelevant columns

In [4]:
# Drop the identifier columns. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError for columns that are already gone.
dataset = dataset.drop(columns=['ID', 'Customer_ID', 'Name', 'SSN'], errors='ignore')

#### View unique values for each column

In [5]:
def unique_vals_and_type(data: pd.DataFrame, feature: str, cap: int):
  """Print a column's dtype together with up to `cap` of its unique values.

  A quick inspection aid for deciding how a feature needs to be engineered.

  Args:
    data: Frame that holds the column.
    feature: Name of the column to inspect.
    cap: Maximum number of unique values to include in the printout.
  """
  # Read the values from the `data` argument (not a global frame) so the
  # reported dtype and the listed values always describe the same DataFrame.
  unique_values = data[feature].unique().tolist()[:cap]
  print(f'The unique values of \'{feature}\' feature of type {data[feature].dtype}: {unique_values}')

# Preview every remaining column: its dtype plus at most 25 distinct values.
for feature in dataset.columns:
  unique_vals_and_type(data=dataset, feature=feature, cap=25)

The unique values of 'Month' feature of type object: ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August']
The unique values of 'Age' feature of type object: ['23', '-500', '28_', '28', '34', '54', '55', '21', '31', '33', '34_', '7580', '30', '30_', '24', '24_', '44', '45', '40', '41', '32', '33_', '35', '35_', '36']
The unique values of 'Occupation' feature of type object: ['Scientist', '_______', 'Teacher', 'Engineer', 'Entrepreneur', 'Developer', 'Lawyer', 'Media_Manager', 'Doctor', 'Journalist', 'Manager', 'Accountant', 'Musician', 'Mechanic', 'Writer', 'Architect']
The unique values of 'Annual_Income' feature of type object: ['19114.12', '34847.84', '34847.84_', '143162.64', '30689.89', '30689.89_', '35547.71_', '35547.71', '73928.46', '131313.4', '10909427.0', '34081.38_', '34081.38', '114838.41', '114838.41_', '31370.8', '33751.27', '88640.24', '88640.24_', '54392.16', '54392.16_', '8701.545', '8701.545_', '25546.26', '25546.26_']
The unique values of 'Month

#### Transforming Features & Filtering Entries

In [6]:
# Encode the Month feature to numerical values
# Month name -> calendar position (1-12). Built once at definition time instead
# of on every call, since encode_month is applied to each row of the frame.
_MONTH_NUMBERS = {
  'January': 1, 'February': 2, 'March': 3, 'April': 4,
  'May': 5, 'June': 6, 'July': 7, 'August': 8,
  'September': 9, 'October': 10, 'November': 11, 'December': 12,
}

def encode_month(month):
  """Return the calendar number (1-12) for an English month name.

  Args:
    month: Full, capitalised month name such as 'January'.

  Returns:
    The month's number, or None when `month` is not a recognised name.
  """
  return _MONTH_NUMBERS.get(month)
# Translate each month name to its number, then store the column as integers.
month_numbers = dataset['Month'].apply(encode_month)
dataset['Month'] = month_numbers.astype(int)

In [7]:
def remove_underscores_int(data: pd.DataFrame, feature: str):
  """Strip underscores from a text column of `data` and convert it to int.

  The column is replaced on `data` itself; nothing is returned. Columns whose
  dtype is not `object` are left untouched.

  Args:
    data: Frame to modify.
    feature: Name of the column to clean.

  Raises:
    ValueError: Propagated from `astype(int)` if a value is still not integer
      text after the underscores are removed.
  """
  # NOTE(review): text columns stored with a dedicated string dtype rather than
  # `object` are skipped by this guard -- confirm against the pandas version in use.
  if data[feature].dtype != 'object':
    return
  # Assign the cleaned column back instead of calling replace(inplace=True) on
  # data[feature]: that chained in-place call acts on an intermediate Series
  # and does not reach `data` when pandas Copy-on-Write is active.
  data[feature] = data[feature].replace(r'_', '', regex=True).astype(int)

remove_underscores_int(dataset, 'Age')
# Keep only the rows whose age falls inside the inclusive range 15..110
before = len(dataset)
age = dataset['Age']
age_in_range = (age >= 15) & (age <= 110)
dataset = dataset[age_in_range]
print(f'Number of entries whose age is out of the range [15,110] before filtering = {before - len(dataset)}')

Number of entries whose age is out of the range [15,110] before filtering = 3949


In [8]:
# Drop the rows whose Occupation is the '_______' placeholder (missing value)
before = len(dataset)
has_occupation = dataset['Occupation'] != '_______'
dataset = dataset[has_occupation]
print(f'Number of removed entries with missing Occupation = {before - len(dataset)}')

Number of removed entries with missing Occupation = 6805


In [9]:
def remove_underscores_float(data: pd.DataFrame, feature: str):
  """Strip underscores from a text column of `data` and convert it to float.

  The column is replaced on `data` itself; nothing is returned. Columns whose
  dtype is not `object` are left untouched.

  Args:
    data: Frame to modify.
    feature: Name of the column to clean.

  Raises:
    ValueError: Propagated from `astype(float)` if a value is still not numeric
      text after the underscores are removed.
  """
  # NOTE(review): text columns stored with a dedicated string dtype rather than
  # `object` are skipped by this guard -- confirm against the pandas version in use.
  if data[feature].dtype != 'object':
    return
  # Assign the cleaned column back instead of calling replace(inplace=True) on
  # data[feature]: that chained in-place call acts on an intermediate Series
  # and does not reach `data` when pandas Copy-on-Write is active.
  data[feature] = data[feature].replace(r'_', '', regex=True).astype(float)

remove_underscores_float(dataset, 'Annual_Income')
# Limit annual income to mean +/- 3 standard deviations; values beyond either
# bound are clipped to that bound, and the number of affected rows is reported.
avg = dataset['Annual_Income'].mean()
sdev = dataset['Annual_Income'].std()
lower_bound = avg-3*sdev
upper_bound = avg+3*sdev
income = dataset['Annual_Income']
is_outlier = (income > upper_bound) | (income < lower_bound)
outliers = len(dataset[is_outlier])
dataset['Annual_Income'] = income.clip(lower = lower_bound, upper = upper_bound)
print(f'Number of clipped outliers for annual income = {outliers}')

Number of clipped outliers for annual income = 740
