Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
#Dataset from sklearn

housing = fetch_california_housing(as_frame=True)

In [3]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [4]:
df = housing.frame

In [5]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
type(df), df.shape

(pandas.core.frame.DataFrame, (20640, 9))

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [8]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [9]:
features = df.drop(columns = ['MedHouseVal'])
target = df.MedHouseVal #Using MedHouseVal column as the target (being the default target)

In [10]:
features.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [11]:
target

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedHouseVal, Length: 20640, dtype: float64

In [12]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=101)

In [13]:
print(len(x_train), len(x_test),len(y_train), len(y_test))

16512 4128 16512 4128


In [14]:
# LinearModel

linear_model = LinearRegression()
linear_model.fit(x_train,y_train) 

LinearRegression()

In [15]:
# Obtaining predictions

predicted_values = linear_model.predict(x_test)
predicted_values

array([3.14285047, 5.47805354, 1.68815171, ..., 2.7666517 , 2.77869323,
       3.75003416])

In [16]:
linear_model.score(x_test, y_test)

0.5970946680312599

In [17]:
# Feature with the highest coefficient is AveBedrms (6.04821496e-01 = 0.604821496)

linear_model.coef_

array([ 4.35942782e-01,  9.66161365e-03, -1.07190554e-01,  6.04821496e-01,
       -4.34745213e-06, -4.10947310e-03, -4.15463799e-01, -4.28700232e-01])

In [18]:
# Mean of the target column round to 2d.p is 2.07

Target_mean = df['MedHouseVal'].mean()
round(Target_mean, 2)

2.07

In [19]:
# Median of the target column round to 2d.p is 1.8

Target_median = df['MedHouseVal'].median()
round(Target_median, 2)

1.8

In [20]:
# MedInc feature is the highest correlation (0.688075) with the target variable

df.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
MedHouseVal,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [21]:
#Mean Absolute Error round to 2d.p = 0.54

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, predicted_values)
mae.round(2)

0.54

In [22]:
#R-squared round to 2d.p = 0.6

from sklearn.metrics import r2_score
r2 = r2_score(y_test, predicted_values)
r2.round(2)

0.6

In [23]:
#Root Mean Squared Error

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
rmse.round(2)

0.74

 RIDGE REGRESSION MODEL

In [24]:
#fit the model to the training dataset

from sklearn.linear_model import Ridge
ridge_reg = Ridge()
ridge_reg.fit(x_train, y_train)

Ridge()

In [25]:
# Obtaining Ridge predictions

predicted = ridge_reg.predict(x_test)
predicted

array([3.14288277, 5.47781265, 1.68834943, ..., 2.76656229, 2.77870638,
       3.75005132])

In [26]:
ridge_reg.score(x_test, y_test)

0.5970835420389657

In [27]:
# Using the Ridge regression Model: The Feature with the highest coefficient is AveBedrms (6.03656315e-01 = 0.603656315)


ridge_reg.coef_

array([ 4.35824754e-01,  9.66371155e-03, -1.06963743e-01,  6.03656315e-01,
       -4.33765902e-06, -4.10902458e-03, -4.15447390e-01, -4.28668008e-01])

In [28]:
# RidgeRegression: Mean Absolute Error round to 2d.p = 0.54

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, predicted)
mae.round(2)

0.54

In [29]:
# RidgeRegression: R-squared round to 2d.p = 0.6

from sklearn.metrics import r2_score
r2 = r2_score(y_test, predicted)
r2.round(2)

0.6

In [30]:
#Root Mean Squared Error

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
rmse.round(2)

0.74

In [31]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, predicted) # Computation of accuracy_score was not possible.

ValueError: continuous is not supported