# Boston Crime Data

Use sci-kit learn: 
RandomForestRegressor & Support Vector Regression. 

Dimension Reduction using Principal Component Analysis

In [3]:
from sklearn.datasets import load_boston
boston = load_boston()

In [4]:
type(boston)

sklearn.datasets.base.Bunch

In [5]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [6]:
type(boston['data'])

numpy.ndarray

In [7]:
boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')

In [8]:
print(boston['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [9]:
import pandas as pd

In [10]:
df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
df.max(axis=0)

CRIM        88.9762
ZN         100.0000
INDUS       27.7400
CHAS         1.0000
NOX          0.8710
RM           8.7800
AGE        100.0000
DIS         12.1265
RAD         24.0000
TAX        711.0000
PTRATIO     22.0000
B          396.9000
LSTAT       37.9700
dtype: float64

In [11]:
hdf = df.head(10)
hdf.style.background_gradient()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1


In [12]:
def top20(col):
    is_top20 = col > col.quantile(.8)
    return ['font-weight: bold' if v else '' for v in is_top20]

In [13]:
hdf.style.apply(top20)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1


# Now apply some Machine Learning

In [14]:
#split data
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(boston['data'], boston['target'], test_size=0.3)

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
reg = RandomForestRegressor()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

0.87030544179091329

In [18]:
reg.score?

In [19]:
#number of features
reg.n_features_

13

In [24]:
#number of features consistant
X_train.shape

(354, 13)

In [25]:
row = X_train[17]
row.shape

(13,)

In [26]:
row.reshape(-1, 13)

array([[   8.26725,    0.     ,   18.1    ,    1.     ,    0.668  ,
           5.875  ,   89.6    ,    1.1296 ,   24.     ,  666.     ,
          20.2    ,  347.88   ,    8.88   ]])

In [29]:
#prediction value
reg.predict(row.reshape(-1, 13))

array([ 40.37])

In [30]:
#actual value
y_train[17]

50.0

In [31]:
#Support Vector Regression
from sklearn.svm import SVR

In [32]:
reg = SVR()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

-0.0019768721924964261

Not good!

In [33]:
#scale data
from sklearn import preprocessing
Xs = preprocessing.scale(boston['data'])

In [34]:
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, boston['target'], test_size=0.3)

In [35]:
df = pd.DataFrame(Xs, columns=boston['feature_names'])
df.max(axis=0)

CRIM       9.941735
ZN         3.804234
INDUS      2.422565
CHAS       3.668398
NOX        2.732346
RM         3.555044
AGE        1.117494
DIS        3.960518
RAD        1.661245
TAX        1.798194
PTRATIO    1.638828
B          0.441052
LSTAT      3.548771
dtype: float64

In [36]:
reg = SVR()
reg.fit(Xs_train, ys_train)
reg.score(Xs_test, ys_test)

0.55526597756760854

Much improved score after scaling

# Dimension Reduction

In [37]:
#Principal Component Analysis
from sklearn.decomposition import PCA

In [38]:
#Reduce dimensions 13 ->5
pca = PCA(n_components=5)
pca.fit(boston['data'])

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [39]:
Xp = pca.transform(boston['data'])
Xp.shape

(506, 5)

In [40]:
reg = RandomForestRegressor()
Xp_train, Xp_test, yp_train, yp_test = train_test_split(Xp, boston['target'], test_size=0.3)
reg.fit(Xp_train, yp_train)
reg.score(Xp_test, yp_test)

0.53161065377201178

Random Forest not as good performance with reduced dimensions, in this case

In [41]:
#Standardize features by removing the mean and scaling to unit variance
#Pipeline of transforms with a final estimator. Sequentially apply a list of transforms and a final estimator.

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [42]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('svr', SVR()),
])

In [43]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.49125501859540871

In [44]:
pipe.steps

[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('pca',
  PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('svr',
  SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))]

In [45]:
pipe.get_params()

{'pca': PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False),
 'pca__copy': True,
 'pca__iterated_power': 'auto',
 'pca__n_components': 5,
 'pca__random_state': None,
 'pca__svd_solver': 'auto',
 'pca__tol': 0.0,
 'pca__whiten': False,
 'scale': StandardScaler(copy=True, with_mean=True, with_std=True),
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'steps': [('scale', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('pca',
   PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False)),
  ('svr',
   SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))],
 'svr': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
   kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 'svr__C': 1.0,
 'sv

In [46]:
pipe.set_params(svr__C=0.9)

Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svr', SVR(C=0.9, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

# Store model

In [47]:
import pickle

In [48]:
#dump model as model.pickle
with open('model.pickle', 'wb') as out:
    pickle.dump(pipe, out)

In [49]:
#if want to use model again
with open('model.pickle', 'rb') as fp:
    pipe1 = pickle.load(fp)

In [50]:
pipe1.steps

[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('pca',
  PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('svr',
  SVR(C=0.9, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))]

In [51]:
#get same score as before
pipe1.score(X_test, y_test)

0.49125501859540871