In [1]:
import numpy as np
import os
os.getcwd()
import tarfile
import joblib 
from sklearn.datasets._base import _pkl_filepath, get_data_home

archive_path = "cal_housing.tgz" # change the path if it's not in the current directory
data_home = get_data_home(data_home=None) # change data_home if you are not using ~/scikit_learn_data
if not os.path.exists(data_home):
    os.makedirs(data_home)
filepath = _pkl_filepath(data_home, 'cal_housing.pkz')

with tarfile.open(mode="r:gz", name=archive_path) as f:
    cal_housing = np.loadtxt(
        f.extractfile('CaliforniaHousing/cal_housing.data'),
        delimiter=',')
    # Columns are not in the same order compared to the previous
    # URL resource on lib.stat.cmu.edu
    columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
    cal_housing = cal_housing[:, columns_index]

    joblib.dump(cal_housing, filepath, compress=6)

In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd


In [3]:
housing = fetch_california_housing(as_frame=True)
X = housing.data
y = housing.target

In [4]:
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [5]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedHouseVal, Length: 20640, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
model = LinearRegression()
model.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse:.2f}")


Mean Squared Error: 0.56


In [9]:
X_test.iloc[0]

MedInc           1.681200
HouseAge        25.000000
AveRooms         4.192201
AveBedrms        1.022284
Population    1392.000000
AveOccup         3.877437
Latitude        36.060000
Longitude     -119.010000
Name: 20046, dtype: float64

In [10]:
# Predict the price of a new house (example input)
sample = X_test.iloc[0]
predicted_price = model.predict([sample])
print(f"Predicted price (in $100,000s): {predicted_price[0]:.2f}")


Predicted price (in $100,000s): 0.72




In [11]:
housing.data.shape, housing.target.shape

((20640, 8), (20640,))

In [12]:
df = housing.frame
df


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [13]:
df.dtypes.index

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal'],
      dtype='object')

In [14]:
df.isnull().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [15]:
df.index

RangeIndex(start=0, stop=20640, step=1)

In [16]:
n = len(df)
n

20640

In [17]:
df.loc[df['MedInc'] == 1.681200]

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
20046,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01,0.477


In [29]:
np.random.seed(2)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [30]:
coba_y_train = df_train.pop('MedHouseVal')
y_val_orig = df_val.pop('MedHouseVal')
y_test_orig = df_test.pop('MedHouseVal')

In [31]:
y_train = np.log1p(coba_y_train)
y_val = np.log1p(y_val_orig)
y_test = np.log1p(y_test_orig)

In [32]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [33]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [34]:
df_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
10385,6.4114,15.0,7.527559,1.049869,2529.0,3.318898,33.60,-117.65
1943,4.1843,12.0,6.330084,1.041783,2033.0,2.831476,38.62,-120.91
7490,1.7411,35.0,5.369159,1.294393,909.0,4.247664,33.93,-118.23
16889,3.4412,39.0,4.173405,1.074573,2156.0,1.937107,37.59,-122.37
11416,7.8195,16.0,8.602349,1.058725,2142.0,3.593960,33.70,-117.98
...,...,...,...,...,...,...,...,...
10901,2.6771,23.0,3.748454,1.076289,958.0,1.975258,33.74,-117.83
14426,2.5568,32.0,3.731935,1.090909,731.0,1.703963,32.80,-117.25
16877,6.2210,52.0,6.571429,0.966667,530.0,2.523810,37.60,-122.40
15999,4.6071,52.0,6.030189,1.075472,689.0,2.600000,37.75,-122.47


In [35]:
X_train = df_train
w_0, w = train_linear_regression_reg(X_train, y_train, r=0)

y_pred = w_0 + X_train.dot(w)
print('train', rmse(y_train, y_pred))

X_val = df_val
y_pred = w_0 + X_val.dot(w)
print('val', rmse(y_val, y_pred))

train 0.21749466234925535
val 0.21347423438272703


In [36]:
X_train = df_train
w_0, w = train_linear_regression_reg(X_train, y_train, r=0.01)

X_val = df_val
y_pred = w_0 + X_val.dot(w)
print('validation:', rmse(y_val, y_pred))

X_test = df_test
y_pred = w_0 + X_test.dot(w)
print('test:', rmse(y_test, y_pred))

validation: 0.21346123852570006
test: 0.23115486764763288


In [37]:
X_test.iloc[0]

MedInc           1.907600
HouseAge        32.000000
AveRooms         3.789575
AveBedrms        0.980695
Population    2052.000000
AveOccup         3.961390
Latitude        36.710000
Longitude     -119.550000
Name: 2378, dtype: float64

In [38]:
sample = X_test.loc[df['MedInc'] == 1.907600]
y_pred = w_0 + sample.dot(w)
#y_pred = y_pred[0]
y_pred

2378    0.664113
dtype: float64

In [39]:
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)

In [40]:
df_full_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,6.4114,15.0,7.527559,1.049869,2529.0,3.318898,33.60,-117.65
1,4.1843,12.0,6.330084,1.041783,2033.0,2.831476,38.62,-120.91
2,1.7411,35.0,5.369159,1.294393,909.0,4.247664,33.93,-118.23
3,3.4412,39.0,4.173405,1.074573,2156.0,1.937107,37.59,-122.37
4,7.8195,16.0,8.602349,1.058725,2142.0,3.593960,33.70,-117.98
...,...,...,...,...,...,...,...,...
16507,3.0598,13.0,4.310056,1.069832,776.0,2.167598,34.04,-117.67
16508,6.8089,5.0,7.191667,0.975000,1109.0,3.080556,33.88,-117.44
16509,4.6417,17.0,6.829201,1.074380,1219.0,3.358127,34.69,-118.12
16510,1.5281,29.0,5.095890,1.095890,1137.0,3.115068,39.29,-121.68


In [41]:
X_full_train = df_full_train.drop('MedHouseVal', axis=1)

KeyError: "['MedHouseVal'] not found in axis"

In [42]:
X_full_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,6.4114,15.0,7.527559,1.049869,2529.0,3.318898,33.60,-117.65
1,4.1843,12.0,6.330084,1.041783,2033.0,2.831476,38.62,-120.91
2,1.7411,35.0,5.369159,1.294393,909.0,4.247664,33.93,-118.23
3,3.4412,39.0,4.173405,1.074573,2156.0,1.937107,37.59,-122.37
4,7.8195,16.0,8.602349,1.058725,2142.0,3.593960,33.70,-117.98
...,...,...,...,...,...,...,...,...
16507,3.0598,13.0,4.310056,1.069832,776.0,2.167598,34.04,-117.67
16508,6.8089,5.0,7.191667,0.975000,1109.0,3.080556,33.88,-117.44
16509,4.6417,17.0,6.829201,1.074380,1219.0,3.358127,34.69,-118.12
16510,1.5281,29.0,5.095890,1.095890,1137.0,3.115068,39.29,-121.68


In [43]:
y_full_train = np.concatenate([y_train, y_val])

In [44]:
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

In [45]:
w0

np.float64(-13.6192698126983)

In [46]:
X_test = df_test
y_pred = w0 + X_test.dot(w)

score = rmse(y_test, y_pred)
score

np.float64(0.23080686988290444)

In [None]:
sample = X_test.loc[df['MedInc'] == 1.907600]

In [49]:
x_try = df_test.loc[df['MedInc'] == 1.907600].to_dict()
x_try

{'MedInc': {2378: 1.9076},
 'HouseAge': {2378: 32.0},
 'AveRooms': {2378: 3.7895752895752897},
 'AveBedrms': {2378: 0.9806949806949807},
 'Population': {2378: 2052.0},
 'AveOccup': {2378: 3.9613899613899615},
 'Latitude': {2378: 36.71},
 'Longitude': {2378: -119.55}}

In [54]:
df_small = pd.DataFrame([x_try][0])
df_small

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
2378,1.9076,32.0,3.789575,0.980695,2052.0,3.96139,36.71,-119.55


In [55]:
X_small = df_small

In [57]:
y_pred = w0 + X_small.dot(w)

y_pred

2378    0.661163
dtype: float64