Analysis Tasks to be performed:

1. Build a model of housing prices to predict median house values in California using the provided dataset.

2. Train the model to learn from the data to predict the median housing price in any district, given all the other metrics.

3. Predict housing prices based on median_income and plot the regression chart for it.



In [1]:
import pandas as pd


In [37]:
# Load the housing dataset from housing.csv (path is relative to the
# notebook's working directory) into a DataFrame.

raw_data = pd.read_csv('housing.csv')

In [38]:
# Preview the first rows to check the column names and value types.
raw_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


### Fill the missing values with the mean of the respective column.


In [39]:
# Count the missing (NaN) entries in each column.
raw_data.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

There are 207 missing values in the `total_bedrooms` column; every other column is complete.

In [40]:
# Impute the missing total_bedrooms values with the column mean.
# Series.mean() skips NaN by default, so the mean is computed from the
# observed values only.
total_bedrooms_mean = raw_data['total_bedrooms'].mean()
print(total_bedrooms_mean)
# Assign the filled column back to the frame instead of calling
# fillna(inplace=True) on the selected column: the in-place form acts on an
# intermediate object, emits a chained-assignment warning in recent pandas
# and leaves raw_data unchanged when copy-on-write is enabled.
raw_data['total_bedrooms'] = raw_data['total_bedrooms'].fillna(total_bedrooms_mean)

537.8705525375618


In [41]:
# Re-count missing values to confirm the imputation left no NaN behind.
raw_data.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

In [109]:
# Extract the input features (X): every column except the last one.
# The output (y) is extracted separately.
X_input = raw_data.iloc[:,:-1]
X_input

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,INLAND
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND


In [110]:
# Extract the output (y): the last column of the frame, selected by position.
y_output = raw_data.iloc[:,-1]
y_output

0        452600
1        358500
2        352100
3        341300
4        342200
          ...  
20635     78100
20636     77100
20637     92300
20638     84700
20639     89400
Name: median_house_value, Length: 20640, dtype: int64

### Convert categorical column in the dataset to numerical data.

In [106]:
#from sklearn.preprocessing import OneHotEncoder

#onehotencoder = OneHotEncoder(sparse=False)
#X_input = onehotencoder.fit(X_input)
#X_input = onehotencoder.transform(X_input)

In [115]:
#dummies = pd.get_dummies(X_input['ocean_proximity'])
#dummies

In [116]:
# One-hot encode the categorical data. Called without a `columns` argument,
# get_dummies converts the object/category columns into indicator columns and
# passes numeric columns through unchanged.
X_input = pd.get_dummies(X_input)

In [117]:
# Inspect the encoded feature matrix.
X_input.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,0,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,0,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,0,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,0,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,0,0,0,1,0


### Split the dataset

In [144]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_input,
    y_output,
    test_size=0.2,
    random_state=110,
)

In [166]:
# Report the dimensions of the feature matrices and target vectors produced
# by the split, one message per line.
print(
    f'Train features dataset shape is:{X_train.shape}',
    f'Test features dataset shape is :{X_test.shape}',
    f'Train target shape is :{y_train.shape}',
    f'Test target shape is :{y_test.shape}',
    sep='\n',
)

Train features dataset shape is:(16512, 13)
Test features dataset shape is :(4128, 13)
Train target shape is :(16512,)
Test target shape is :(4128,)


### Standardize training and test datasets.

In [146]:
from sklearn.preprocessing import StandardScaler

# Create the (not yet fitted) scaler used to standardize the feature columns.
scaler = StandardScaler()

In [148]:
# Learn the scaling statistics (per-column mean and standard deviation) from
# the training features only, and standardize the training set with them.
X_train_scaler = scaler.fit_transform(X_train)

# Apply the statistics learned from the training set to the test set.
# Calling fit_transform here would re-fit the scaler on the test data, so the
# two sets would be scaled with different parameters and information about the
# test set would leak into preprocessing.
X_test_scaler = scaler.transform(X_test)

# The target (y_train / y_test) is deliberately left unscaled so prediction
# errors stay in the original units of median_house_value.

### Perform Linear Regression 

In [149]:
# Create the linear regression estimator; it is fitted on the training data
# in the next cell.
from sklearn.linear_model import LinearRegression
lg=LinearRegression()

In [150]:
# Fit the model on the training split.
# NOTE(review): this uses the unscaled X_train; the standardized X_train_scaler
# computed in the standardization step is not used. Whichever representation
# is used for fitting must also be used when predicting.
lg.fit(X_train,y_train)

LinearRegression()

In [160]:
# Predict the output for the test dataset using the fitted model.
# X_test is passed unscaled, matching the unscaled X_train the model was
# fitted on.
y_predict = lg.predict(X_test)
y_predict
#y_predict.shape
#y_test.shape

array([127905.23765756, 140880.70343512, 276482.86830202, ...,
        88199.41586195,  57641.60273655, 120516.0596013 ])

In [164]:
# Print the root mean squared error (RMSE) of the Linear Regression model on
# the test set.
from sklearn.metrics import mean_squared_error

# Arguments follow the documented (y_true, y_pred) order.
# The square root is taken explicitly rather than via `squared=False`: that
# flag was deprecated and later removed from mean_squared_error, so relying on
# it fails on current scikit-learn, while this form works on every version.
rmse = mean_squared_error(y_test, y_predict) ** 0.5
print(f'The Root Mean Squared Error is: {rmse}')

The Root Mean Squared Error is: 66242.7379459723


### Perform Decision Tree Regression