In [1]:
# Loader for the California housing dataset that ships with scikit-learn
from sklearn.datasets import fetch_california_housing

# Fetch the dataset (on first use it is downloaded into the local scikit-learn data folder)
data = fetch_california_housing()

# Split the bundle into the feature matrix and the regression target
x, y = data.data, data.target

# Show the description text that comes with the dataset
print(data.DESCR)


Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to C:\Users\Administrator\scikit_learn_data


.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [2]:
# Dimensions of the feature matrix and of the target vector, one per line
print(x.shape, y.shape, sep='\n')

(20640, 8)
(20640,)


In [6]:
# Linear regression model
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it on the full dataset in one step
# (fit returns the fitted estimator itself)
lR = LinearRegression().fit(x, y)

# In-sample predictions: the model predicts on the same rows it was fitted to
y_pred = lR.predict(x)

# Show the predictions, a blank separator, then their shape
print(y_pred)
print('\n')
print(y_pred.shape)

[4.13164983 3.97660644 3.67657094 ... 0.17125141 0.31910524 0.51580363]


(20640,)


In [7]:
# Report the fitted parameters: the intercept first...
print(f'beta0 = {lR.intercept_}')

# ...then one coefficient per feature column, numbered from 1
for idx, slope in enumerate(lR.coef_, start=1):
    print(f'beta_{idx}: {slope}')

beta0 = -36.941920207184516
beta_1: 0.4366932931343246
beta_2: 0.009435778033238069
beta_3: -0.10732204139090418
beta_4: 0.6450656935198134
beta_5: -3.976389421207444e-06
beta_6: -0.00378654265497081
beta_7: -0.42131437752714374
beta_8: -0.43451375467477804


In [8]:
# Score of the plain linear model, evaluated on the same data it was fitted to
# (an in-sample figure, not an estimate of performance on unseen data)
r_squared = lR.score(x,y)
print(r_squared)

0.6062326851998051


In [10]:
# Data preprocessing with a transformer: standardise the features
from sklearn.preprocessing import StandardScaler

# Create the scaler
scaler = StandardScaler()

# Fit the scaler on x and apply the scaling to x in a single call
# (equivalent to fit followed by transform; this is a transformation, not a prediction)
Xt = scaler.fit_transform(x)

In [12]:
# Summarise the effect of scaling in a pandas table:
# per-feature mean and variance, before and after the transform
import pandas as pd
import numpy as np

# Row labels (one per feature) and column labels (one per statistic)
features = data['feature_names']
cols = ['unscaled mean', 'unscaled variance', 'scaled mean', 'scaled variance']

# Each per-feature statistic becomes one column of the table
vals = np.column_stack((
    x.mean(axis=0),
    x.var(axis=0),
    Xt.mean(axis=0),
    Xt.var(axis=0),
))

df = pd.DataFrame(vals, index=features, columns=cols)

In [13]:
# Display the summary table built in the previous cell
df

Unnamed: 0,unscaled mean,unscaled variance,scaled mean,scaled variance
MedInc,3.870671,3.609148,6.6097e-17,1.0
HouseAge,28.639486,158.3886,5.508083e-18,1.0
AveRooms,5.429,6.121236,6.6097e-17,1.0
AveBedrms,1.096675,0.2245806,-1.060306e-16,1.0
Population,1425.476744,1282408.0,-1.101617e-17,1.0
AveOccup,3.070655,107.8648,3.442552e-18,1.0
Latitude,35.631861,4.562072,-1.079584e-15,1.0
Longitude,-119.569704,4.013945,-8.526513e-15,1.0


In [14]:
# Chain preprocessing and regression into a single estimator with a Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# The three stages, in the order the data passes through them.
# Note: `scaler` is rebound here to a fresh StandardScaler instance.
scaler = StandardScaler()
polynomial_features = PolynomialFeatures(degree=2)
linReg = LinearRegression()

# Each step is a (name, estimator) pair; the name is how the step is looked up later
pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('polynomial', polynomial_features),
    ('regressor', linReg),
])


In [15]:
# The pipeline has been created.
# To view the steps it contains, keyed by the names given at construction:

pipe.named_steps

{'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'polynomial': PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                    order='C'),
 'regressor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)}

In [16]:
# Accessing a single estimator inside the pipeline by its step name
pipe.named_steps['polynomial']

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C')

In [17]:
# Fit every stage of the pipeline on the data, then predict on the same rows
# (fit returns the pipeline itself, so the two calls can be chained).
# This overwrites the earlier y_pred from the plain linear model.
y_pred = pipe.fit(x, y).predict(x)

In [18]:
# Printing results: the pipeline's predictions and its score on the data it was fitted to.
# NOTE(review): despite the "r_squared_error" label, score() here is presumably the R^2
# coefficient of determination (higher is better) rather than an error — confirm and
# consider relabelling when the notebook is next re-run.
print(y_pred)
print(f'r_squared_error:{pipe.score(x,y)} ')

[4.00298901 3.92349228 3.99012926 ... 0.83369975 0.88801566 0.97559649]
r_squared_error:0.6832976293317492 
