### This script file is used to analysis Boston.csv dataset.

<b>Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression using various subsets of the predictors. Describe your findings. In particular, report the features, confusion matrix, AUC that appears to provide the best results on a held out data.</b>

In [2]:
import pandas as pd
import numpy as np
from statistics import median
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc

In [3]:
df = pd.read_csv('data/Boston.csv')
df = df.drop('index', axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
crim       506 non-null float64
zn         506 non-null float64
indus      506 non-null float64
chas       506 non-null int64
nox        506 non-null float64
rm         506 non-null float64
age        506 non-null float64
dis        506 non-null float64
rad        506 non-null int64
tax        506 non-null int64
ptratio    506 non-null float64
black      506 non-null float64
lstat      506 non-null float64
medv       506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.4 KB


In [4]:
description = df.describe()
print(description)

             crim          zn       indus        chas         nox          rm  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.677082   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              age         dis         rad         tax     ptratio       black  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std     28.148861    2.1057

In [5]:
# full set predictors
predictors = list(df.columns)
predictors.remove('crim')
print(predictors)

['zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']


In [6]:
# Convert 'crim' column into binary representation
df['crim_bi'] = np.zeros(len(df.crim))
df.loc[df['crim'] <= median(np.array(df['crim'])), 'crim_bi'] = 1
df.loc[df['crim'] > median(np.array(df['crim'])), 'crim_bi'] = 0

In [7]:
def caculate_auc(features):
    X_full = np.array(df[features])
    Y_full = np.array(df['crim_bi'])
    
    logit = LogisticRegression()
    res_logit = logit.fit(X_full, Y_full)
    Y_pred_full = res_logit.predict(X_full)
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_full, Y_pred_full)
    return auc(false_positive_rate, true_positive_rate)

In [8]:
# first round iteration
for p in predictors:
    auc_res = caculate_auc([p])
    print([p], auc_res)

['zn'] 0.705533596838
['indus'] 0.772727272727
['chas'] 0.517786561265
['nox'] 0.800395256917
['rm'] 0.565217391304
['age'] 0.794466403162
['dis'] 0.770750988142
['rad'] 0.758893280632
['tax'] 0.754940711462
['ptratio'] 0.654150197628
['black'] 0.664031620553
['lstat'] 0.709486166008
['medv'] 0.656126482213


Pick the highest value => nox = 0.800395256917

In [9]:
predictors.remove('nox')
print(predictors)

['zn', 'indus', 'chas', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']


In [10]:
# second round iteration
for p in predictors:
    pred_sec = ['nox']
    pred_sec.append(p)
    auc_res = caculate_auc(pred_sec)
    print(pred_sec, auc_res)

['nox', 'zn'] 0.790513833992
['nox', 'indus'] 0.788537549407
['nox', 'chas'] 0.828063241107
['nox', 'rm'] 0.833992094862
['nox', 'age'] 0.812252964427
['nox', 'dis'] 0.780632411067
['nox', 'rad'] 0.832015810277
['nox', 'tax'] 0.810276679842
['nox', 'ptratio'] 0.800395256917
['nox', 'black'] 0.839920948617
['nox', 'lstat'] 0.810276679842
['nox', 'medv'] 0.822134387352


Pick the highest value => nox, black = 0.839920948617

In [11]:
predictors.remove('black')
print(predictors)

['zn', 'indus', 'chas', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat', 'medv']


In [12]:
# third round iteration
for p in predictors:
    pred_third = ['nox', 'black']
    pred_third.append(p)
    auc_res = caculate_auc(pred_third)
    print(pred_third, auc_res)

['nox', 'black', 'zn'] 0.851778656126
['nox', 'black', 'indus'] 0.814229249012
['nox', 'black', 'chas'] 0.835968379447
['nox', 'black', 'rm'] 0.841897233202
['nox', 'black', 'age'] 0.818181818182
['nox', 'black', 'dis'] 0.792490118577
['nox', 'black', 'rad'] 0.849802371542
['nox', 'black', 'tax'] 0.822134387352
['nox', 'black', 'ptratio'] 0.828063241107
['nox', 'black', 'lstat'] 0.826086956522
['nox', 'black', 'medv'] 0.837944664032


Pick the highest value => nox, black, zn = 0.851778656126

In [13]:
predictors.remove('zn')
print(predictors)

['indus', 'chas', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat', 'medv']


In [14]:
# forth round iteration
for p in predictors:
    pred_forth = ['nox', 'black', 'zn']
    pred_forth.append(p)
    auc_res = caculate_auc(pred_forth)
    print(pred_forth, auc_res)

['nox', 'black', 'zn', 'indus'] 0.814229249012
['nox', 'black', 'zn', 'chas'] 0.841897233202
['nox', 'black', 'zn', 'rm'] 0.843873517787
['nox', 'black', 'zn', 'age'] 0.830039525692
['nox', 'black', 'zn', 'dis'] 0.796442687747
['nox', 'black', 'zn', 'rad'] 0.839920948617
['nox', 'black', 'zn', 'tax'] 0.802371541502
['nox', 'black', 'zn', 'ptratio'] 0.835968379447
['nox', 'black', 'zn', 'lstat'] 0.826086956522
['nox', 'black', 'zn', 'medv'] 0.853754940711


Pick the highest value => nox, black, zn, medv = 0.853754940711

In [15]:
predictors.remove('medv')
print(predictors)

['indus', 'chas', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']


In [16]:
# fifth round iteration
for p in predictors:
    pred_fifth = ['nox', 'black', 'zn', 'medv']
    pred_fifth.append(p)
    auc_res = caculate_auc(pred_fifth)
    print(pred_fifth, auc_res)

['nox', 'black', 'zn', 'medv', 'indus'] 0.820158102767
['nox', 'black', 'zn', 'medv', 'chas'] 0.851778656126
['nox', 'black', 'zn', 'medv', 'rm'] 0.841897233202
['nox', 'black', 'zn', 'medv', 'age'] 0.837944664032
['nox', 'black', 'zn', 'medv', 'dis'] 0.802371541502
['nox', 'black', 'zn', 'medv', 'rad'] 0.839920948617
['nox', 'black', 'zn', 'medv', 'tax'] 0.826086956522
['nox', 'black', 'zn', 'medv', 'ptratio'] 0.849802371542
['nox', 'black', 'zn', 'medv', 'lstat'] 0.828063241107


None of the result is greater than the maximum in the forth iteration result.

So we can conclude the best predictor combination is ['nox', 'black', 'zn', 'medv']

In [17]:
predictors_op = list(['nox','black','zn','medv'])
X_full_op = np.array(df[predictors_op])
Y_full_op = np.array(df['crim'])

Y_full_op[Y_full_op <= median(Y_full_op)] = 1
Y_full_op[Y_full_op != 1] = 0

logit = LogisticRegression()
res_logit = logit.fit(X_full_op, Y_full_op)
Y_pred_full_op = res_logit.predict(X_full_op)

# Confusion matrix
confusion_matrix(Y_full_op, Y_pred_full_op)

array([[219,  34],
       [ 40, 213]])

In [18]:
# Accuracy, precision and recall
print("Accuracy full:", np.round(accuracy_score(Y_full_op, Y_pred_full_op), 3))
print("Precision full:", np.round(precision_score(Y_full_op, Y_pred_full_op), 3))
print("Recall full:", np.round(recall_score(Y_full_op, Y_pred_full_op), 3))

Accuracy full: 0.854
Precision full: 0.862
Recall full: 0.842
