In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, roc_curve, mean_absolute_error
from sklearn.decomposition import PCA

In [4]:
X,y = fetch_california_housing(return_X_y=True)

In [5]:
X.shape

(20640, 8)

In [7]:
y.shape

(20640,)

In [8]:
X[0]

array([   8.3252    ,   41.        ,    6.98412698,    1.02380952,
        322.        ,    2.55555556,   37.88      , -122.23      ])

In [9]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [15]:
X_train.shape

(16512, 8)

In [16]:
y_train.shape

(16512,)

In [19]:
linear = LinearRegression()
linear.fit(X_train,y_train)
y_pred_linear = linear.predict(X_test)
r2_score(y_test,y_pred_linear)

0.5989628208542918

In [22]:
X_train.shape

(16512, 8)

In [23]:
y_train.shape

(16512,)

In [45]:
pca = PCA(n_components=2)
# X_train_trans = pca.fit(X_train)

X_trans = pca.fit_transform(X)

In [47]:
X_trans.shape

(20640, 2)

In [48]:
type(X_trans)

numpy.ndarray

In [49]:
df_X = pd.DataFrame(X_trans)
df_y = pd.DataFrame(y)

In [51]:
df_X.columns = [['a','b']]

In [54]:
df_X.shape

(20640, 2)

In [55]:
df_y.shape

(20640, 1)

In [62]:
df_y.iloc[:,0]

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: 0, Length: 20640, dtype: float64

In [64]:
# fig = plt.figure(figsize=(10,5))
# ax = plt.subplot(1,2,1)

import plotly.express as px
px.scatter_3d(x=df_X.iloc[:,0],y=df_X.iloc[:,1],z=df_y.iloc[:,0])

### Applying Lasso Regression for improvising the model performance

In [65]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [86]:
lreg = Lasso(alpha=0.025)
lreg.fit(X_train,y_train)

In [87]:
y_pred = lreg.predict(X_test)

In [80]:
from sklearn.metrics import r2_score

In [88]:
r2_score(y_test,y_pred)

0.5705008920719551

In [89]:
lreg.coef_

array([ 3.85440086e-01,  1.12792201e-02,  0.00000000e+00,  1.59212836e-03,
        3.13433042e-06, -4.80462790e-03, -3.51997939e-01, -3.53740269e-01])