In [1]:
import numpy as np
import pandas as pd
import sklearn
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Credit Approvals Data Set 

__Source:__ [https://archive.ics.uci.edu/ml/datasets/Credit+Approval]

All attribute names and values have been changed to meaningless symbols to protect the confidentiality of the data

In [2]:
#importing and cleaning data
credit_raw = pd.read_csv('credit_approvals.csv', header=None)
credit_raw[1] = credit_raw[1].map(lambda x: None if x is '?' else float(x))
credit_raw[15] = credit_raw[15].map(lambda x: 1 if x is '+' else 0)
for col in credit_raw:
    credit_raw[col] = credit_raw[col].map(lambda x: None if x is '?' else x)
credit_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,1


In [3]:
#creating df for features
#paring down attributes '5' and '6' to top 3-4 values
credit_raw_abbrev = credit_raw.loc[
    ((credit_raw[5] == 'c') | (credit_raw[5] == 'q')  
     | (credit_raw[5] == 'w')  | (credit_raw[5] == 'i')) 
    & ((credit_raw[6] == 'v') | (credit_raw[6] == 'h') 
       | (credit_raw[6] == 'bb'))]
credit_raw_abbrev.dropna()
features = pd.get_dummies(credit_raw_abbrev[5])
features['0'] = np.where((credit_raw_abbrev[0]=='a'), 1, 0)
features['8'] = np.where((credit_raw_abbrev[8]=='t'), 1, 0)
features['cube_root_2'] = (credit_raw_abbrev[2].dropna())**(1/3)
features['cube_root_7'] = (credit_raw_abbrev[7].dropna())**(1/3)
features['15'] = credit_raw_abbrev[15].dropna()
features.head()

Unnamed: 0,c,i,q,w,0,8,cube_root_2,cube_root_7,15
0,0,0,0,1,0,1,0.0,1.077217,1
1,0,0,1,0,1,1,1.646057,1.448631,1
2,0,0,1,0,1,1,0.793701,1.144714,1
3,0,0,0,1,0,1,1.1548,1.553616,1
4,0,0,0,1,0,1,1.778447,1.195819,1


In [4]:
print('Whole Data Set Value Counts\n', 100*features['15'].value_counts()/len(features))

#splitting data into train and test groups 75%, 25%
np.random.seed(222)
train = features.sample(frac=0.75, replace=False)
test = features.sample(frac=0.25, replace=False)

#ensuring similar proportions
print('Train Data Set Value Counts\n', 100*train['15'].value_counts()/len(train))
print('Test Data Set Value Counts\n', 100*test['15'].value_counts()/len(test))

Whole Data Set Value Counts
 0    52.694611
1    47.305389
Name: 15, dtype: float64
Train Data Set Value Counts
 0    52.4
1    47.6
Name: 15, dtype: float64
Test Data Set Value Counts
 1    53.571429
0    46.428571
Name: 15, dtype: float64


Target and Train both have similar distributions to the res of the sample

### Modeling
#### Attributes 0 and 2

In [5]:
#modeling attributes 0 and 2
data_train = train[['cube_root_2'] + ['0']]
target_train = train['15']

data_test = test[['cube_root_2'] + ['0']]
target_test = test['15']

In [6]:
#binary data so use bernoulli classifier
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(data_train, target_train)
y_pred_test = bnb.predict(data_test)

print('Number of mislabeled points out of a total {} points: {}'.format(
     data_test.shape[0],
     (target_test != y_pred_test).sum()
))
#percent mislabeled
print('Percent of Test Mislabeled: ', (100 * ((target_test != y_pred_test).sum()) /data_test.shape[0]), '%')

Number of mislabeled points out of a total 84 points: 43
Percent of Test Mislabeled:  51.19047619047619 %


#### Attributes 0, 2, and 7

In [7]:
#modeling attributes 0, 2, 7
data_train = train[['cube_root_2'] + ['0'] + ['cube_root_7']]
target_train = train['15']

data_test = test[['cube_root_2'] + ['0'] + ['cube_root_7']]
target_test = test['15']

In [8]:
#binary data so use bernoulli classifier
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(data_train, target_train)
y_pred_test = bnb.predict(data_test)

print('Number of mislabeled points out of a total {} points: {}'.format(
     data_test.shape[0],
     (target_test != y_pred_test).sum()
))
#percent mislabeled
print('Percent of Test Mislabeled: ', (100 * ((target_test != y_pred_test).sum()) /data_test.shape[0]), '%')

Number of mislabeled points out of a total 84 points: 43
Percent of Test Mislabeled:  51.19047619047619 %


#### Attributes 0, 2, 7, 8

In [9]:
#modeling attributes 0, 2, 7, 8
data_train = train[['cube_root_2'] + ['0'] + ['cube_root_7'] + ['8']]
target_train = train['15']

data_test = test[['cube_root_2'] + ['0'] + ['cube_root_7'] + ['8']]
target_test = test['15']

In [10]:
#binary data so use bernoulli classifier
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(data_train, target_train)
y_pred_test = bnb.predict(data_test)

print('Number of mislabeled points out of a total {} points: {}'.format(
     data_test.shape[0],
     (target_test != y_pred_test).sum()
))
#percent mislabeled
print('Percent of Test Mislabeled: ', (100 * ((target_test != y_pred_test).sum()) /data_test.shape[0]), '%')

Number of mislabeled points out of a total 84 points: 10
Percent of Test Mislabeled:  11.904761904761905 %


#### Attributes 0, 2, 5, 7, 8

In [11]:
#modeling attributes 0, 2, 5, 7, 8
data_train = train[['cube_root_2'] + ['0'] + ['cube_root_7'] + ['8'] + ['c'] + ['i'] + ['q'] + ['w']]
target_train = train['15']

data_test = test[['cube_root_2'] + ['0'] + ['cube_root_7'] + ['8'] + ['c'] + ['i'] + ['q'] + ['w']]
target_test = test['15']

In [12]:
#binary data so use bernoulli classifier
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(data_train, target_train)
y_pred_test = bnb.predict(data_test)

print('Number of mislabeled points out of a total {} points: {}'.format(
     data_test.shape[0],
     (target_test != y_pred_test).sum()
))
#percent mislabeled
print('Percent of Test Mislabeled: ', (100 * ((target_test != y_pred_test).sum()) /data_test.shape[0]), '%')

Number of mislabeled points out of a total 84 points: 11
Percent of Test Mislabeled:  13.095238095238095 %


The lowest percent of test points mislabeled came with attributes 0, 2, 7, and 8.  With these attributes, only 11.9% of points were mislabeled.