In [1]:
# data manipulation 
import numpy as np
import pandas as pd
from numpy import mean
from numpy import std
# modeling utilities
from sklearn import metrics
from sklearn import preprocessing
from sklearn import  linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap


In [2]:
# Load the dataset; the relative path is resolved against the current working directory.
credit_db = pd.read_csv('clean_dataset.csv')

In [3]:
# Ordinal-encode the three categorical columns with ONE fit.
# OrdinalEncoder treats every input column independently, so the resulting
# codes are the same as fitting column by column, but fitting once keeps the
# category mapping of all three columns in `encoder.categories_` (refitting
# the same encoder per column would only retain the last column's mapping,
# making `inverse_transform` impossible for the others).
# NOTE(review): ordinal codes impose an arbitrary order on these columns —
# presumably they are nominal labels; confirm this is acceptable for the model.
categorical_columns = ["Industry", "Ethnicity", "Citizen"]
encoder = OrdinalEncoder()
credit_db[categorical_columns] = encoder.fit_transform(credit_db[categorical_columns])

In [4]:
# Preview the first rows of the frame after encoding.
credit_db.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,7.0,4.0,1.25,1,1,1,0,0.0,202,0,1
1,0,58.67,4.46,1,1,9.0,1.0,3.04,1,1,6,0,0.0,43,560,1
2,0,24.5,0.5,1,1,9.0,1.0,1.5,1,0,0,0,0.0,280,824,1
3,1,27.83,1.54,1,1,7.0,4.0,3.75,1,1,5,1,0.0,100,3,1
4,1,20.17,5.625,1,1,7.0,4.0,1.71,1,0,0,0,1.0,120,0,1


### Control Variable

In [5]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
# X / y are the training portion used by the cross-validation cells below.
X, X_test, y, y_test = train_test_split(
    credit_db.iloc[:, :-1],  # features: every column except the last
    credit_db.iloc[:, -1],   # target: the last column
    test_size=0.33,
    random_state=42,
)

In [6]:
# Baseline: plain logistic regression scored with 10-fold stratified CV repeated 3 times.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
model = LogisticRegression()
n_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)

In [7]:
# Summarise the cross-validation accuracy as "mean (std)" over all folds.
accuracy_stats = (mean(n_scores), std(n_scores))
print('Accuracy: %.3f (%.3f)' % accuracy_stats)

Accuracy: 0.819 (0.065)


### Principal Component Analysis (PCA)
Principal Component Analysis (PCA) computes the principal components of a dataset and uses a specified number of them to perform a change of basis on the data, ignoring the rest. Projecting each data point onto the retained components yields a lower-dimensional representation of the original data.

In [8]:
# Accumulators filled by pca_eval(); index k holds the result for k + 1 components.
# They are reset here so re-running this cell does not append duplicates.
n_score_mean = []
n_score_std = []

# NOTE(review): this is an alias of credit_db (not a copy) and is not used in
# this cell — confirm whether a later cell relies on it.
pcd_db = credit_db

def pca_eval(target):
    """Cross-validate a PCA -> LogisticRegression pipeline.

    The pipeline is scored on the module-level training data ``X`` / ``y``
    with 10-fold stratified cross-validation repeated 3 times (fixed seed).
    The mean and standard deviation of the fold accuracies are appended to
    the module-level lists ``n_score_mean`` and ``n_score_std``.

    Parameters
    ----------
    target
        Passed straight to ``PCA(n_components=...)``; the sweep below uses
        integer component counts.

    Returns
    -------
    tuple
        ``(mean_accuracy, std_accuracy)`` — the same two values that were
        appended to the accumulator lists.
    """
    # NOTE(review): PCA is applied to the features as-is, with no scaling step;
    # PCA is variance-based, so confirm that unscaled inputs are intended.
    steps = [('pca', PCA(n_components=target)), ('m', LogisticRegression())]
    model = Pipeline(steps=steps)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    score_mean, score_std = mean(n_scores), std(n_scores)
    n_score_mean.append(score_mean)
    n_score_std.append(score_std)
    return score_mean, score_std


# Sweep every component count from 1 up to the number of feature columns,
# derived from X instead of being hard-coded.
for i in range(1, X.shape[1] + 1):
    pca_eval(i)


In [13]:
# Report the sweep from the largest component count down to one.
# The count comes from the results themselves, so this cell stays correct if
# the sweep range changes; entry k - 1 holds the scores for k components.
total_components = len(n_score_mean)
print('Inital Number of Components = %d' % total_components)
print('Total Components : Accuracy Mean (STD)')
for kept in range(total_components, 0, -1):
    print('\t%.0f\t : \t%.3f (%.3f)' % (kept, n_score_mean[kept - 1], n_score_std[kept - 1]))

Inital Number of Components : 15
Total Components : Accuracy Mean (STD)
	15	 : 	0.842 (0.059)
	14	 : 	0.850 (0.049)
	13	 : 	0.846 (0.058)
	12	 : 	0.837 (0.052)
	11	 : 	0.811 (0.043)
	10	 : 	0.801 (0.049)
	9	 : 	0.792 (0.052)
	8	 : 	0.781 (0.056)
	7	 : 	0.779 (0.054)
	6	 : 	0.768 (0.057)
	5	 : 	0.769 (0.061)
	4	 : 	0.750 (0.064)
	3	 : 	0.670 (0.064)
	2	 : 	0.668 (0.057)
	1	 : 	0.664 (0.059)


Interestingly enough, at the time of our experiments, removing no components leads to an accuracy of 84.2%, removing one leads to 85.0%, and removing two leads to 84.6%.
With this data it is safe to infer that, by removing one component from the dataframe, PCA manages to increase the accuracy of the logistic regression model between the components and the target value. (i.e.