# California Housing Prices

In [91]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import warnings
# Suppress every warning for the rest of the session; note this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [92]:
# Read the housing table from a CSV expected in the working directory,
# then preview the first five rows to sanity-check the parsed columns.
housing_csv = 'californiahousing.csv'
df = pd.read_csv(housing_csv)
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [93]:
# Summarise the frame in one call: (rows, columns), per-column dtypes,
# and the number of missing values in each column, one item per line.
print(df.shape, df.dtypes, df.isnull().sum(), sep='\n')

(20640, 10)
longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [94]:
# List the column labels available for choosing features and the target.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [95]:
# Build the feature matrix (predictors, not the target).
# total_bedrooms (which has missing values) and the non-numeric
# ocean_proximity column are not part of the feature list.
feature_cols = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'population',
    'households',
    'median_income',
]
x = df[feature_cols]
x.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income
0,-122.23,37.88,41.0,880.0,322.0,126.0,8.3252
1,-122.22,37.86,21.0,7099.0,2401.0,1138.0,8.3014
2,-122.24,37.85,52.0,1467.0,496.0,177.0,7.2574
3,-122.25,37.85,52.0,1274.0,558.0,219.0,5.6431
4,-122.25,37.85,52.0,1627.0,565.0,259.0,3.8462


In [96]:
# Confirm the feature matrix is a DataFrame and report its (rows, columns).
print(type(x), x.shape, sep='\n')

<class 'pandas.core.frame.DataFrame'>
(20640, 7)


In [97]:
# Target vector: select the median_house_value column as a Series.
y = df['median_house_value']
# Preview the first few target values.
y.head()

0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: median_house_value, dtype: float64

In [98]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=12,
)

In [99]:
# Report the dimensions of each piece of the train/test split, one per line.
print('Observation - Your output should appear at this point as:')
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep='\n')

Observation - Your output should appear at this point as:
(14448, 7)
(6192, 7)
(14448,)
(6192,)


In [100]:
# Linear regression estimator created with scikit-learn's default settings.
lm = LinearRegression()

In [101]:
# Fit the regression on the training split only; the test split stays unseen.
lm.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [102]:
# Fitted coefficients, one per column of feature_cols and in the same order.
lm.coef_

array([-4.26423865e+04, -4.29062784e+04,  1.10598656e+03, -7.02381716e-01,
       -4.83110976e+01,  1.52988063e+02,  3.81850928e+04])

In [103]:
# Fitted intercept (constant term) of the linear model.
lm.intercept_

-3548605.497506155

In [104]:
# R^2 on the training split: this measures fit to data the model has already
# seen, not how well it generalises to x_test.
lm.score(x_train,y_train)

0.6355548811684777

# Model Prediction:
Predict the value

In [105]:
# Illustrative K-fold splitter with 3 folds.
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
# Display the splitter's repr to show its settings.
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [106]:
# Demonstrate how the 3-fold splitter partitions a toy 7-element sequence
# into train / test index arrays.
toy_samples = [1, 2, 3, 4, 5, 6, 7]
for train_index, test_index in kf.split(toy_samples):
    print(train_index, test_index)

[3 4 5 6] [0 1 2]
[0 1 2 5 6] [3 4]
[0 1 2 3 4] [5 6]


In [107]:
def get_score(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters:
        model: object exposing ``fit(X, y)`` and ``score(X, y)``.
        x_train, y_train: data passed to ``model.fit``.
        x_test, y_test: data passed to ``model.score``.

    Returns:
        Whatever ``model.score(x_test, y_test)`` returns.
    """
    # Train first; the model is mutated in place by fit().
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [108]:
# Stratified 3-fold splitter.
# NOTE(review): `folds` is not used in any visible cell, and stratified
# splitting groups samples by class label, so it would not apply directly to
# the continuous median_house_value target — confirm whether this is needed.
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

In [109]:
# Predict house values for the held-out test features.
pred=lm.predict(x_test)
# Print the predictions and the true test targets for a quick visual comparison.
print("Predicted Results : ",pred)
print("actual price",y_test)

Predicted Results :  [237240.14129452 206572.41444171 223341.94821783 ... 229912.69602421
 174857.66131851 199761.99078821]
actual price 6906     211400.0
767      195200.0
10555    241800.0
17456    128300.0
20617     70800.0
           ...   
6023      93300.0
3651     168600.0
14846    153300.0
1364     225000.0
18990    250000.0
Name: median_house_value, Length: 6192, dtype: float64


Observation :
    We have predicted house values for the test set, which can now be compared against the actual values from the housing dataset.

In [113]:
# Mean absolute error between the true test values and the predictions,
# expressed in the same units as the target column.
print('error:')
test_mae = mean_absolute_error(y_test, pred)
print('Mean absolute error:', test_mae)

error:
Mean absolute error: 51705.99834768337


In [114]:
# Persist the fitted model to disk so it can be reloaded without retraining.
# `sklearn.externals.joblib` is gone from current scikit-learn releases, so
# prefer the standalone `joblib` package and fall back to the vendored copy
# only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Dump `lm`, the LinearRegression fitted earlier in this notebook; the name
# `t` is not defined by any cell, so it fails on a fresh kernel.
# NOTE(review): the file name says "rf" although the model is a linear
# regression — kept unchanged so existing loaders keep working; consider renaming.
joblib.dump(lm,'rf_regressor.pkl')

['rf_regressor.pkl']

# Conclusion:
    We have built a complete machine learning project that predicts house values with a linear regression model using scikit-learn.