In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [3]:
pwd

'/Users/samiksha/Downloads'

### 1. Load the file into a dataframe

In [4]:
DATA_DIR = '/Users/samiksha/Downloads'
FILE_NAME = 'Housing.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
datHousing = pd.read_csv(data_path)

### Inspect the top rows

In [5]:
datHousing.head()

Unnamed: 0,CRIM,ZN,INDUS,RIVER,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,MEDV
0,3.32105,0.0,19.58,Yes,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,13.4
1,1.12658,0.0,19.58,Yes,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,15.3
2,1.41385,0.0,19.58,Yes,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,17.0
3,3.53501,0.0,19.58,Yes,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,15.6
4,1.27346,0.0,19.58,Yes,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,27.0


### Convert Headers to Lower Case

In [6]:
datHousing.rename(columns=lambda x:x.lower(),inplace=True)

## 2. Creating Dummy Variable

In [7]:
# Create a dummy variable for river
datHousing['rivercode']=datHousing.river.map({'No':0, 'Yes':1})
#datHousing.drop(['river'], inplace = True, axis = 1)

In [8]:
# Check if the variable is added in the table
datHousing.head()

Unnamed: 0,crim,zn,indus,river,nox,rm,age,dis,rad,tax,pratio,lstat,medv,rivercode
0,3.32105,0.0,19.58,Yes,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,13.4,1
1,1.12658,0.0,19.58,Yes,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,15.3,1
2,1.41385,0.0,19.58,Yes,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,17.0,1
3,3.53501,0.0,19.58,Yes,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,15.6,1
4,1.27346,0.0,19.58,Yes,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,27.0,1


In [9]:
# Check mean for rivercode
datHousing['rivercode'].mean()

0.0691699604743083

## 3. Train-test split

In [10]:
X = datHousing.drop(['river','medv'], axis=1)
y = datHousing['medv']

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

### Standarization: centering and scaling

In [12]:
numerical_features = ['crim', 'zn', 'indus', 'nox','rm','age','dis','rad','tax','pratio','lstat','rivercode']

In [13]:
# 1. Import the class you will use
from sklearn.preprocessing import StandardScaler
# 2. Create an instance of the class
scaler = StandardScaler()
# 3. Use the fit method of the instance
scaler.fit(X_train[numerical_features])
# 4. Use the transform method to perform the transformation
X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [14]:
X_train[numerical_features].head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,rivercode
203,-0.392853,-0.49813,-1.02067,-0.351216,0.359798,-1.218603,-0.359967,-0.51376,-0.667895,-0.913236,-1.016351,-0.262265
441,0.707342,-0.49813,1.041113,1.61775,0.160365,1.052647,-0.854604,1.69679,1.550259,0.7997,0.931489,-0.262265
172,-0.360307,-0.49813,1.59728,0.624706,0.228742,1.095167,-0.954394,-0.630104,0.177402,1.275515,0.254755,-0.262265
95,-0.383434,0.560606,-0.862184,-0.839178,-0.786944,-0.045773,1.537147,-0.164726,-0.739835,0.561792,0.057089,-0.262265
54,-0.317761,-0.49813,-0.420478,-0.111516,-0.806888,0.071155,-0.052117,-0.630104,-0.60195,1.180352,-0.199603,-0.262265


In [15]:
X_train[numerical_features].describe().round(3)

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,rivercode
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0
std,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001
min,-0.4,-0.498,-1.547,-1.421,-3.892,-2.289,-1.268,-0.979,-1.321,-2.816,-1.354,-0.262
25%,-0.392,-0.498,-0.862,-0.882,-0.578,-0.878,-0.827,-0.63,-0.758,-0.533,-0.818,-0.262
50%,-0.375,-0.498,-0.359,-0.231,-0.094,0.284,-0.25,-0.514,-0.464,0.253,-0.19,-0.262
75%,-0.003,0.248,1.041,0.625,0.487,0.92,0.652,1.697,1.55,0.8,0.601,-0.262
max,9.736,3.737,2.456,2.739,3.542,1.152,3.809,1.697,1.82,1.656,3.328,3.813


### Multiple Linear Regression

In [16]:
# 1. Import the Estimator class you will use
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the class
ml_reg = LinearRegression()
# 3. Use the fit method of the instance
ml_reg.fit(X_train, y_train)
# 4. Use the predict method to get the predictions
y_pred_ml_reg = ml_reg.predict(X_train)

In [17]:
pd.Series(ml_reg.coef_, index=X_train.columns).sort_values(ascending=False).round(2)

rm           2.72
rad          2.23
zn           1.04
rivercode    0.33
age          0.00
indus       -0.02
crim        -0.92
pratio      -1.92
nox         -1.97
tax         -2.30
dis         -2.94
lstat       -3.58
dtype: float64

In [18]:
# re-training the model with all features again.
ml_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
from sklearn.metrics import mean_squared_error, r2_score
mse_ml_reg = mean_squared_error(y_true=y_train, y_pred=y_pred_ml_reg)
print('{:0.2f}M'.format(mse_ml_reg/1e6))

0.00M


In [20]:
print('variance score: %.2f' % r2_score (y_train, y_pred_ml_reg))

variance score: 0.76


In [21]:
from sklearn.metrics import mean_squared_error, r2_score
print (('mean squared error: %.2f') % mean_squared_error(y_train, y_pred_ml_reg)) 

mean squared error: 19.33


## 4. Create new dataset without target variable

In [22]:
datHousingSub = datHousing.drop(['river','medv'], axis=1)

In [23]:
datHousingSub.head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,rivercode
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,1
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,1
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,1
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,1
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,1


## 5. 7. Fit 2 cluster model

In [24]:
# Using scikit-learn to perform K-Means clustering
from sklearn.cluster import KMeans
    
# Specify the number of clusters (2) and fit the data datHousingSub
kmeans = KMeans(n_clusters=2, random_state=0).fit(datHousingSub)
# 7 instead of datHousingSub write datHousing

In [25]:
centroids = (kmeans.cluster_centers_)
print(centroids)

[[3.88774444e-01 1.55826558e+01 8.42089431e+00 5.11847425e-01
  6.38800542e+00 6.06322493e+01 4.44127154e+00 4.45528455e+00
  3.11926829e+02 1.78092141e+01 1.04174526e+01 7.31707317e-02]
 [1.22991617e+01 3.01980663e-14 1.84518248e+01 6.70102190e-01
  6.00621168e+00 8.99678832e+01 2.05447007e+00 2.32700730e+01
  6.67642336e+02 2.01963504e+01 1.86745255e+01 5.83941606e-02]]


* I would name the first cluster as 'not so safe to live' because of high non-retail business and high crime rate in the town it is not a safe place to live. It also has the lowest proportion of residential land.
* The second cluster is 'safe to live' as there is low non-retail business and low crime rate in the town. And it also has the highest proportion of residential lands.

In [26]:
labels = (kmeans.labels_)
print(labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [27]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score

print(silhouette_score(datHousingSub, kmeans.labels_))

0.7717962604192585


## 6. 3 cluster model

In [28]:
# Using scikit-learn to perform K-Means clustering
from sklearn.cluster import KMeans
    
# Specify the number of clusters (3) and fit the data datHousingSub
kmeans = KMeans(n_clusters=3, random_state=0).fit(datHousingSub)

In [29]:
centroids = (kmeans.cluster_centers_)
print(centroids)

[[2.44205703e-01 1.73764259e+01 6.70262357e+00 4.84713688e-01
  6.47416350e+00 5.61661597e+01 4.83579772e+00 4.32699620e+00
  2.75212928e+02 1.78733840e+01 9.55292776e+00 7.60456274e-02]
 [1.22991617e+01 3.01980663e-14 1.84518248e+01 6.70102190e-01
  6.00621168e+00 8.99678832e+01 2.05447007e+00 2.32700730e+01
  6.67642336e+02 2.01963504e+01 1.86745255e+01 5.83941606e-02]
 [7.47468585e-01 1.11320755e+01 1.26841509e+01 5.79169811e-01
  6.17423585e+00 7.17132075e+01 3.46240000e+00 4.77358491e+00
  4.03018868e+02 1.76500000e+01 1.25624528e+01 6.60377358e-02]]


In [30]:
labels = (kmeans.labels_)
print(labels)

[2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 2 2
 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0
 0 0 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0 2 2 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [31]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score

print(silhouette_score(datHousingSub, kmeans.labels_))

0.6217529916045139


* I dont think having the third cluster is helpful because, the silhouette score is less compared to the two cluster model which means that three cluster model is not any better but worse than the two cluster model.

## 7. Convert list of labels to a data frame

In [1]:
datHousing.drop(['river'], inplace = True, axis = 1)
# Using scikit-learn to perform K-Means clustering
from sklearn.cluster import KMeans
    
# Specify the number of clusters (2) and fit the data datHousing
kmeans = KMeans(n_clusters=2, random_state=0).fit(datHousing)

NameError: name 'datHousing' is not defined

In [2]:
centroids = (kmeans.cluster_centers_)
print(centroids)

NameError: name 'kmeans' is not defined

In [3]:
df_labels = pd.DataFrame(labels)
df_labels.rename(columns={0:'Cluster'}, inplace=True)

NameError: name 'pd' is not defined

### Merge Cluster column into dummy-converted data frame

In [37]:
dat_housing_Clust = pd.concat([datHousing, df_labels], axis=1)

In [38]:
dat_housing_Clust.head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,medv,rivercode,Cluster
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,13.4,1,2
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,15.3,1,2
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,17.0,1,2
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,15.6,1,2
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,27.0,1,2


## 8 Separate cluster into two data sets for further analysis

In [104]:
datHousingC1 = dat_housing_Clust.loc[dat_housing_Clust['Cluster']==1]

In [105]:
datHousingC1.head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,medv,rivercode,Cluster
27,8.98296,0.0,18.1,0.77,6.212,97.4,2.1222,24,666,20.2,17.6,17.8,1,1
28,3.8497,0.0,18.1,0.77,6.395,91.0,2.5052,24,666,20.2,13.27,21.7,1,1
29,5.20177,0.0,18.1,0.77,6.127,83.4,2.7227,24,666,20.2,11.48,22.7,1,1
30,4.22239,0.0,18.1,0.77,5.803,89.0,1.9047,24,666,20.2,14.64,16.8,1,1
31,3.47428,0.0,18.1,0.718,8.78,82.9,1.9047,24,666,20.2,5.29,21.9,1,1


## 9. Separate cluster into two data sets for further analysis

In [106]:
datHousingC2 = dat_housing_Clust.loc[dat_housing_Clust['Cluster']==2]

In [107]:
datHousingC2.head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,medv,rivercode,Cluster
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,13.4,1,2
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,15.3,1,2
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,17.0,1,2
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,15.6,1,2
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,27.0,1,2


## 10

In [108]:
X = datHousingC1.drop(['medv'], axis=1)
y = datHousingC1['medv']

In [109]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
numerical_features = ['crim', 'zn', 'indus', 'nox','rm','age','dis','rad','tax','pratio','lstat','rivercode']

Regression

In [110]:
# 1. Import the class you will use
from sklearn.preprocessing import StandardScaler
# 2. Create an instance of the class
scaler = StandardScaler()
# 3. Use the fit method of the instance
scaler.fit(X_train[numerical_features])
# 4. Use the transform method to perform the transformation
X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [111]:
X_train[numerical_features].head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,rivercode
416,-0.114957,0.0,-0.219265,0.208383,1.183172,0.046905,-0.366447,0.219265,-0.219265,0.219265,1.022529,-0.219265
389,-0.33517,0.0,-0.219265,0.547153,-0.835294,0.688957,-0.518206,0.219265,-0.219265,0.219265,0.297695,-0.219265
444,0.046612,0.0,-0.219265,1.19243,-0.162472,0.506646,-0.240091,0.219265,-0.219265,0.219265,0.729074,-0.219265
474,-0.343047,0.0,-0.219265,-1.324149,-0.781642,0.411527,0.646885,0.219265,-0.219265,0.219265,-0.099936,-0.219265
418,5.032308,0.0,-0.219265,0.208383,-0.013117,0.776149,-0.394507,0.219265,-0.219265,0.219265,0.263948,-0.219265


In [112]:
X_train[numerical_features].describe().round(3)

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,rivercode
count,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0,109.0
mean,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,1.005,0.0,1.005,1.005,1.005,1.005,1.005,1.005,1.005,1.005,1.005,1.005
min,-0.996,0.0,-0.219,-2.163,-3.487,-3.956,-1.512,-4.561,-0.219,-4.561,-2.327,-0.219
25%,-0.546,0.0,-0.219,-0.921,-0.492,-0.183,-0.719,0.219,-0.219,0.219,-0.68,-0.219
50%,-0.289,0.0,-0.219,0.208,0.199,0.404,-0.217,0.219,-0.219,0.219,-0.112,-0.219
75%,0.138,0.0,-0.219,0.757,0.635,0.689,0.471,0.219,-0.219,0.219,0.653,-0.219
max,5.032,0.0,4.561,1.676,2.069,0.776,3.417,0.219,4.561,0.219,2.81,4.561


In [113]:
# 1. Import the Estimator class you will use
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the class
ml_reg = LinearRegression()
# 3. Use the fit method of the instance
ml_reg.fit(X_train, y_train)
# 4. Use the predict method to get the predictions
y_pred_ml_reg = ml_reg.predict(X_train)

In [114]:
pd.Series(ml_reg.coef_, index=X_train.columns).sort_values(ascending=False).round(2)

rivercode    2.38
age          0.60
rad          0.35
pratio       0.35
Cluster      0.00
zn          -0.00
tax         -0.35
indus       -0.35
rm          -1.15
dis         -1.48
crim        -1.91
nox         -2.24
lstat       -4.96
dtype: float64

In [115]:
# re-training the model with all features again.
ml_reg.fit(X_train, y_train)
print('variance score: %.2f' % r2_score (y_train, y_pred_ml_reg))

variance score: 0.69


In [116]:
from sklearn.metrics import mean_squared_error, r2_score
print (('mean squared error: %.2f') % mean_squared_error(y_train, y_pred_ml_reg)) 

mean squared error: 19.34


## 11

In [117]:
X = datHousingC2.drop(['medv'], axis=1)
y = datHousingC2['medv']

In [118]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
numerical_features = ['crim', 'zn', 'indus', 'nox','rm','age','dis','rad','tax','pratio','lstat','rivercode']

Regression

In [125]:
# 1. Import the class you will use
from sklearn.preprocessing import StandardScaler
# 2. Create an instance of the class
scaler = StandardScaler()
# 3. Use the fit method of the instance
scaler.fit(X_train[numerical_features])
# 4. Use the transform method to perform the transformation
X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [126]:
X_train[numerical_features].head()

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,rivercode
147,-0.628104,-0.493496,-0.336775,-0.206841,-0.426675,0.773731,-0.522307,1.465581,1.19855,0.059352,0.584802,-0.223607
179,1.75319,-0.493496,1.006315,2.018211,-0.056691,1.017808,-0.934691,0.222058,0.014101,-1.167252,2.424705,-0.223607
137,-0.516779,-0.493496,-0.540273,-0.392262,0.412182,0.515902,-0.362836,0.222058,-0.761918,1.285956,-0.301019,-0.223607
184,0.990327,-0.493496,1.006315,2.018211,-0.070331,0.92499,-0.8469,0.222058,0.014101,-1.167252,0.249841,-0.223607
151,-0.619354,-0.493496,-0.336775,-0.206841,0.021739,0.072437,-0.356039,1.465581,1.19855,0.059352,-0.077182,-0.223607


In [127]:
X_train[numerical_features].describe().round(3)

Unnamed: 0,crim,zn,indus,nox,rm,age,dis,rad,tax,pratio,lstat,rivercode
count,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0
mean,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0
std,1.006,1.006,1.006,1.006,1.006,1.006,1.006,1.006,1.006,1.006,1.006,1.006
min,-0.744,-0.493,-1.535,-1.216,-2.11,-2.214,-0.938,-3.509,-2.355,-1.167,-1.714,-0.224
25%,-0.666,-0.493,-0.89,-0.962,-0.529,-0.906,-0.688,-1.021,-0.476,-1.167,-0.811,-0.224
50%,-0.552,-0.493,-0.337,-0.207,-0.064,0.42,-0.469,0.222,0.014,0.059,-0.03,-0.224
75%,0.678,-0.493,1.006,0.322,0.497,0.88,0.455,0.222,0.535,0.781,0.488,-0.224
max,3.567,3.095,1.331,2.018,3.771,1.018,3.166,1.466,2.71,1.405,3.474,4.472


In [128]:
# 1. Import the Estimator class you will use
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the class
ml_reg = LinearRegression()
# 3. Use the fit method of the instance
ml_reg.fit(X_train, y_train)
# 4. Use the predict method to get the predictions
y_pred_ml_reg = ml_reg.predict(X_train)

In [129]:
# re-training the model with all features again.
ml_reg.fit(X_train, y_train)
print('variance score: %.2f' % r2_score (y_train, y_pred_ml_reg))

variance score: 0.82


In [130]:
from sklearn.metrics import mean_squared_error, r2_score
print (('mean squared error: %.2f') % mean_squared_error(y_train, y_pred_ml_reg)) 

mean squared error: 10.27
