In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


In [4]:
from sklearn import datasets

In [5]:
california = datasets.fetch_california_housing()

In [6]:
california

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [7]:
california.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [8]:
california.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [9]:
california.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [10]:
california.target_names

['MedHouseVal']

In [11]:
california.DESCR

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 20640\n\n    :Number of Attributes: 8 numeric, predictive attributes and the target\n\n    :Attribute Information:\n        - MedInc        median income in block\n        - HouseAge      median house age in block\n        - AveRooms      average number of rooms\n        - AveBedrms     average number of bedrooms\n        - Population    block population\n        - AveOccup      average house occupancy\n        - Latitude      house block latitude\n        - Longitude     house block longitude\n\n    :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttp://lib.stat.cmu.edu/datasets/\n\nThe target variable is the median house value for California districts.\n\nThis dataset was derived from the 1990 U.S. census, using one row per census\nblock group. A block group is the smallest geographical unit

In [12]:
dir(california)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [13]:
cal_input=pd.DataFrame(california.data,columns=california.feature_names)
cal_target=pd.DataFrame(california.target,columns=["MedHouseVal"])

In [14]:
cal_input

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [15]:
cal_target

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


In [16]:
#check for null values
print(cal_input.isnull().sum())
print()
print(cal_target.isnull().sum())

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

MedHouseVal    0
dtype: int64


In [17]:
#check correlation
pd.concat([cal_input,cal_target],axis=1).corr()
#most features have weak correlation with targets 

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [18]:
#statistical description of dataset
pd.concat([cal_input,cal_target],axis=1).describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [19]:
#remove outliers
from scipy.stats import zscore
cal_combined=pd.concat([cal_input,cal_target],axis=1)
z_score=zscore(cal_combined)
print(cal_combined.shape)
cal_combined_final=cal_combined.loc[abs(z_score<3).all(axis=1)]
print(cal_combined_final.shape)

(20640, 9)
(19794, 9)


In [20]:
#separating input, outputfrom cal_combined_final
df_x=cal_combined_final.drop(columns=["MedHouseVal"])
y=cal_combined_final[["MedHouseVal"]]

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
def maxr2_score(regr,df_x,y):
    max_r_score=0
    for r_state in range(42,101):
        x_train, x_test, y_train, y_test = train_test_split(df_x, y, random_state = r_state,test_size=0.20)
        regr.fit(x_train,y_train)
        y_pred = regr.predict(x_test)
        r2_scr=r2_score(y_test,y_pred)
        print("r2_score corresponding to random state: ",r_state," is: ",r2_scr)
        if r2_scr>max_r_score:
            max_r_score=r2_scr
            final_r_state=r_state
        
    print()
    print()
    print("max r2 score corresponding to ",final_r_state," is ",max_r_score)
    return final_r_state

In [22]:
#Linear regression and check r2 score for different random states
from sklearn.linear_model import LinearRegression
lreg=LinearRegression()
r_state=maxr2_score(lreg,df_x,y)

r2_score corresponding to random state:  42  is:  0.6162271478185639
r2_score corresponding to random state:  43  is:  0.6278148395219303
r2_score corresponding to random state:  44  is:  0.6344448221540232
r2_score corresponding to random state:  45  is:  0.6475766350631864
r2_score corresponding to random state:  46  is:  0.6375619780691225
r2_score corresponding to random state:  47  is:  0.6164362623468982
r2_score corresponding to random state:  48  is:  0.6566174856191146
r2_score corresponding to random state:  49  is:  0.6295463453883521
r2_score corresponding to random state:  50  is:  0.6316988005126118
r2_score corresponding to random state:  51  is:  0.6269243741994341
r2_score corresponding to random state:  52  is:  0.6199746571822637
r2_score corresponding to random state:  53  is:  0.6209280632175735
r2_score corresponding to random state:  54  is:  0.6209397967077755
r2_score corresponding to random state:  55  is:  0.640433039388741
r2_score corresponding to random st

In [23]:
from sklearn.model_selection import cross_val_score
print("Mean r2 score for Linear Regression: ",cross_val_score(lreg,df_x,y,cv=5,scoring="r2").mean())
print("Standard Deviation in r2 score for Linear Regression: ",cross_val_score(lreg,df_x,y,cv=5,scoring="r2").std())

Mean r2 score for Linear Regression:  0.5765823611801845
Standard Deviation in r2 score for Linear Regression:  0.04964785750838827


In [24]:
#GridSearch CV for KNeighbors
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
neighbors={"n_neighbors":range(1,30)}
knr=KNeighborsRegressor()
gknr = GridSearchCV(knr, neighbors, cv=10)
gknr.fit(df_x,y)
gknr.best_params_

{'n_neighbors': 12}

In [25]:
#Calculating r2 score for KNN model based on n_neighbors value
knr=KNeighborsRegressor(n_neighbors=12)
r_state=maxr2_score(knr,df_x,y)

r2_score corresponding to random state:  42  is:  0.12361779293163788
r2_score corresponding to random state:  43  is:  0.10830794494065665
r2_score corresponding to random state:  44  is:  0.10524712495288346
r2_score corresponding to random state:  45  is:  0.12557881636505897
r2_score corresponding to random state:  46  is:  0.12180820936754633
r2_score corresponding to random state:  47  is:  0.11422858535749958
r2_score corresponding to random state:  48  is:  0.12046073583466399
r2_score corresponding to random state:  49  is:  0.11136112395268194
r2_score corresponding to random state:  50  is:  0.11951143464812575
r2_score corresponding to random state:  51  is:  0.13240744609172161
r2_score corresponding to random state:  52  is:  0.12461092400425422
r2_score corresponding to random state:  53  is:  0.11428777005142698
r2_score corresponding to random state:  54  is:  0.11017582784284674
r2_score corresponding to random state:  55  is:  0.11995695595025146
r2_score correspondi

In [26]:
#Calculate mean r2 score and std deviation in r2 score for KNN Regresssion
from sklearn.model_selection import cross_val_score
print("Mean r2 score for KNN Regression: ",cross_val_score(knr,df_x,y,cv=5,scoring="r2").mean())
print("standard deviation in r2 score for KNR Regression: ",cross_val_score(knr,df_x,y,cv=5,scoring="r2").std())

Mean r2 score for KNN Regression:  -0.015147010992789433
standard deviation in r2 score for KNR Regression:  0.0604824017733203


In [27]:
#Lasso Regression
from sklearn.linear_model import Lasso
lsreg=Lasso()
parameters={"alpha":[0.001,0.01,0.1,1]}
clf = GridSearchCV(lsreg, parameters, cv=10)
clf.fit(df_x,y)
clf.best_params_

{'alpha': 0.001}

In [28]:
#Check r2 score for Lasso
lsreg=Lasso(alpha=0.001)
r_state=maxr2_score(lsreg,df_x,y)

r2_score corresponding to random state:  42  is:  0.6163926711546639
r2_score corresponding to random state:  43  is:  0.6277358313379254
r2_score corresponding to random state:  44  is:  0.6341163009784441
r2_score corresponding to random state:  45  is:  0.6473778250764135
r2_score corresponding to random state:  46  is:  0.6375538913108725
r2_score corresponding to random state:  47  is:  0.6162265117211276
r2_score corresponding to random state:  48  is:  0.6564879517807596
r2_score corresponding to random state:  49  is:  0.6295070800860797
r2_score corresponding to random state:  50  is:  0.6316568579363079
r2_score corresponding to random state:  51  is:  0.6266208197525136
r2_score corresponding to random state:  52  is:  0.6204311251792815
r2_score corresponding to random state:  53  is:  0.6205897531947091
r2_score corresponding to random state:  54  is:  0.6217155412221103
r2_score corresponding to random state:  55  is:  0.6403032648874172
r2_score corresponding to random s

In [29]:
#Calculate mean r2 score and std deviation in r2 score for Lasso Regresssion
from sklearn.model_selection import cross_val_score
print("Mean r2 score for Lasso Regression: ",cross_val_score(lsreg,df_x,y,cv=5,scoring="r2").mean())
print("standard deviation in r2 score for Lasso Regression: ",cross_val_score(lsreg,df_x,y,cv=5,scoring="r2").std())

Mean r2 score for Lasso Regression:  0.5760323290988051
standard deviation in r2 score for Lasso Regression:  0.050361447612704835


In [30]:
#Let's save KNN regression with random state corresponding to r2 score 74
x_train, x_test, y_train, y_test = train_test_split(df_x,y,random_state = 74, test_size = 0.30)
knr=KNeighborsRegressor()
knr.fit(x_train,y_train)
y_pred=knr.predict(x_test)

In [31]:
#Calculate RMSE and r2 score
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print("RMSE is: ",np.sqrt(mean_squared_error(y_test,y_pred)))
print("r2_score is: ",r2_score(y_test,y_pred))

RMSE is:  1.0582591783166642
r2_score is:  0.10390337141271055


In [None]:
from sklearn.externals import joblib
joblib.dump(knr, 'California Housing_knr.pkl')