In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import sys
sys.path.append('../')
from Function import *

In [2]:
# Load the raw credit dataset. The path uses forward slashes so it resolves on
# every OS (Windows accepts them too) and no longer depends on backslash
# sequences such as "\d" being left alone by the string-literal parser.
data = pd.read_csv('../data/credit_customers.csv')
# Strip stray whitespace around the header names.
data.columns = data.columns.str.strip()
# Rename the "class" label column to "Target".
data = data.rename(columns={"class": "Target"})

## Handling Outliers

In [3]:
# Run the `trimming` helper on the "credit_amount" column with 2.5 and keep its result as `data`.
# NOTE(review): `trimming` is not defined in this notebook (presumably it comes from the
# star-imported Function module); it presumably removes outlier rows with 2.5 as the
# cutoff factor — confirm against its definition.
data = trimming(data, "credit_amount",2.5)

## Categorical Cols

In [4]:
# Split the frame by dtype: the numeric columns on one side, and every other
# column (whatever is left once the numeric ones are dropped) on the other.
num_data = data._get_numeric_data()
cat_data = data.drop(columns=num_data.columns)

In [5]:
# Partition the non-numeric columns into two groups:
#   1. "Range" columns, which are encoded through explicit integer mappings.
range_cols = [
    'checking_status',
    'employment',
    'savings_status',
    'foreign_worker',
    'own_telephone',
    'Target',
]
#   2. Every remaining non-numeric column, in its original order.
ocat_cols = [name for name in cat_data.columns if name not in range_cols]

## Deal with Range cols (Label Encoder)

In [6]:
# Label-encode the "range" columns with hand-written integer mappings.
# `range_data` is a copy of the original string values and is never modified,
# so it is used both for the "before" report and as the source of the mapping.
range_data = cat_data[range_cols]
print('--------------------Unique Values--------------------------')
print('\n')
for col in range_cols:
    print(f'{col} : {range_data[col].unique()}')
print('\n')
print('--------------------Process-----------------------------')
# NOTE(review): savings_mapping never uses code 3 and tele_mapping encodes
# 'yes' as 2 rather than 1. The values are kept as-is because downstream
# artifacts may depend on them — confirm whether these are intentional.
checking_mapping = {'<0': 1, '0<=X<200': 2, 'no checking': 0, '>=200': 3}
employment_mapping = {'>=7': 4, '1<=X<4': 2, '4<=X<7': 3, 'unemployed': 0, '<1': 1}
savings_mapping = {'no known savings': 0, '<100': 1, '500<=X<1000': 4, '>=1000': 5, '100<=X<500': 2}
foreign_mapping = {'no': 0, 'yes': 1}
tele_mapping = {'none': 0, 'yes': 2}
target_mapping = {'good':0, 'bad': 1}
# One mapping per entry of range_cols, in the same order.
mapping_list = [checking_mapping,employment_mapping,savings_mapping,foreign_mapping,tele_mapping,target_mapping]

# Guard the positional pairing: a missing or extra mapping would otherwise
# silently shift every later column onto the wrong dictionary.
if len(mapping_list) != len(range_cols):
    raise ValueError(
        f'expected {len(range_cols)} mappings (one per range column), got {len(mapping_list)}'
    )

for col, mapping in zip(range_cols, mapping_list):
    # Map from the untouched string copy instead of data[col]: re-running this
    # cell then yields the same result rather than turning every value into NaN.
    encoded = range_data[col].map(mapping)
    # .map() returns NaN for labels absent from the mapping; report them instead
    # of letting them leak into the output. Values that were already missing in
    # the source are left as NaN, as before.
    unmapped = range_data.loc[encoded.isna() & range_data[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f'{col}: no integer code defined for {list(unmapped)}')
    data[col] = encoded
for col in range_cols:
    print(f'{col} : {data[col].unique()}')

--------------------Unique Values--------------------------


checking_status : ['<0' '0<=X<200' 'no checking' '>=200']
employment : ['>=7' '1<=X<4' '4<=X<7' 'unemployed' '<1']
savings_status : ['no known savings' '<100' '500<=X<1000' '>=1000' '100<=X<500']
foreign_worker : ['yes' 'no']
own_telephone : ['yes' 'none']
Target : ['good' 'bad']


--------------------Process-----------------------------
checking_status : [1 2 0 3]
employment : [4 2 3 0 1]
savings_status : [0 1 4 5 2]
foreign_worker : [1 0]
own_telephone : [2 0]
Target : [0 1]


## Deal with other cols (One-Hot Encoder)

In [8]:
# One-hot encode the categorical columns that were not integer-mapped.
ocat_data = cat_data[ocat_cols]
nominal_frame = data[ocat_cols]
onehot = OneHotEncoder()
# Learn the categories first, then transform the same frame.
onehot.fit(nominal_frame)
encoded_data = onehot.transform(nominal_frame)




--------------------Process-----------------------------


# Final Data

In [10]:
# Assemble the final table column-wise from its three parts. Each part gets a
# fresh 0..n-1 index so the rows line up positionally in the concat.
# NOTE(review): the one-hot block carries integer column labels (0, 1, ...)
# instead of feature names, and to_csv also writes the index as an extra
# column — confirm both are what downstream readers expect.
onehot_frame = pd.DataFrame(encoded_data.toarray())
numeric_frame = num_data.reset_index(drop=True)
range_frame = data[range_cols].reset_index(drop=True)
final_data = pd.concat([onehot_frame, numeric_frame, range_frame], axis=1)
final_data.to_csv('../processed_data/data_lab+one.csv')