In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Load the housing CSV from the local `data` directory
filepath = os.path.join('data', 'housing.csv')
data = pd.read_csv(filepath, sep=',')

# Quick overview: dimensions, tally of column dtypes, and the first rows
for summary in (data.shape, data.dtypes.value_counts(), data.head()):
    print(summary)

(20640, 10)
float64    9
object     1
dtype: int64
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


# Data Cleaning

In [3]:
from sklearn.impute import SimpleImputer

# Set the string column aside so that only numeric columns reach the imputer
cat_col = 'ocean_proximity'
data_numerical = data.drop(columns=cat_col)

# Learn one median per remaining column.
# NOTE(review): `data_numerical` still includes the target column `median_house_value`,
# and the medians are learned from every row before any train/test split -- confirm
# this is intended.
imputer = SimpleImputer(strategy="median")
imputer.fit(data_numerical)

In [4]:
# Fill the missing values using the statistics learned by `imputer`
X = imputer.transform(data_numerical)

# Rebuild a labelled frame from the transformed values. Reusing the source index
# (instead of letting pandas create a fresh RangeIndex) keeps the axis=1 concat below
# row-aligned even if `data` does not carry a default 0..n-1 index.
data_simple_imputer_tr = pd.DataFrame(X,
                                      columns=data_numerical.columns,
                                      index=data_numerical.index)

# Re-attach the categorical column that was set aside before imputation
data = pd.concat([data_simple_imputer_tr, data[[cat_col]]], axis=1)


## Handling Categorical Data

In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Reference notebook: https://www.kaggle.com/code/davidhjek/california-dreaming-and-housing
# Open question taken from that reference: 1. Why Label Encoding instead of OneHotEncoder

# Work on a copy so the one-hot columns can be added without touching `data`
data_oneHotEncoded = data.copy()
# Encoder with default settings; it is fitted further down in the notebook
cat_encoder = OneHotEncoder()

In [6]:
cat_col = 'ocean_proximity'

# Fit the encoder on the single categorical column (double brackets keep it 2-D)
# and cast the resulting indicator matrix to integers
new_cat_data = (
    cat_encoder
    .fit_transform(data_oneHotEncoded[[cat_col]])
    .astype(int)
)

# The raw string column is no longer needed in the encoded frame
data_oneHotEncoded = data_oneHotEncoded.drop(columns=cat_col)
new_cat_data

<20640x5 sparse matrix of type '<class 'numpy.int32'>'
	with 20640 stored elements in Compressed Sparse Row format>

In [7]:
# Categories learned by the encoder; element 0 belongs to the one encoded column
cats = cat_encoder.categories_
# Name each indicator column "<source column>_<category>"
new_cols = [f'{cat_col}_{cat}' for cat in cats[0]]

# Densify the sparse indicators and label them, sharing the row index of the target frame
new_df = pd.DataFrame(
    new_cat_data.toarray(),
    index=data_oneHotEncoded.index,
    columns=new_cols,
)

# Attach the indicator columns to the right of the existing columns
data_oneHotEncoded = pd.concat([data_oneHotEncoded, new_df], axis=1)

In [8]:
# Display the combined frame: the numeric columns followed by the one-hot indicator columns
data_oneHotEncoded

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0


## Transformation Pipeline

In [5]:
# Disabled sketch: the commented-out lines below would chain median imputation and
# standard scaling for the numeric columns. Only the Pipeline import is live code.
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# num_pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy="median")),      # Imputing Missing values
#     ('std_scaler', StandardScaler()),                    # Feature Scaling with Standard Scaler
#     ])
# data_tr = num_pipeline.fit_transform(data_numerical)

In [7]:
# Handling categorical and numerical columns in one pipeline (disabled sketch)
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# num_attribs = list(data_numerical)             # columns with numerical attributes
# cat_attribs = ["ocean_proximity"]              # columns with categorical attributes
# full_pipeline = ColumnTransformer([
#       ("num", num_pipeline, num_attribs),        
#       ("cat", OneHotEncoder(), cat_attribs),
#     ])
# housing_prepared = full_pipeline.fit_transform(data)

In [9]:
# Select the object (string) columns
mask = data.dtypes == object
categorical_cols = data.columns[mask]

# Determine how many extra columns one-hot encoding would create: the number of
# distinct values per string column, largest first
num_ohc_cols = data[categorical_cols].nunique().sort_values(ascending=False)

# Remove the string columns from the dataframe.
# NOTE: this rebinds `data` without `ocean_proximity`, so earlier cells that read that
# column from `data` cannot be re-run after this one without reloading the CSV.
data = data.drop(columns=num_ohc_cols.index)


## Create Train and Test Set

In [10]:
from sklearn.model_selection import train_test_split

# Name of the regression target; every other column is used as a feature
y_col = 'median_house_value'

# --- Frame without the one-hot columns ---
feature_cols = [col for col in data.columns if col != y_col]
X_data = data[feature_cols]
y_data = data[y_col]

(X_train, X_test,
 y_train, y_test) = train_test_split(X_data, y_data,
                                     test_size=0.3, random_state=42)

# --- Frame with the one-hot columns ---
# Same test fraction and seed as above, so both splits are drawn the same way
feature_cols = [col for col in data_oneHotEncoded.columns if col != y_col]
X_data_oneHotEncoded = data_oneHotEncoded[feature_cols]
y_data_oneHotEncoded = data_oneHotEncoded[y_col]

(X_train_ohc, X_test_ohc,
 y_train_ohc, y_test_ohc) = train_test_split(X_data_oneHotEncoded, y_data_oneHotEncoded,
                                             test_size=0.3, random_state=42)

In [11]:
# (rows, feature columns) of the one-hot encoded training split
X_train_ohc.shape

(14448, 13)

In [12]:
# Compare the indices to ensure both training splits hold the same rows in the same order.
# Index.equals yields a single True/False and, unlike an element-wise `==`, does not
# raise when the two indexes differ in length.
X_train_ohc.index.equals(X_train.index)

True

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def _fit_and_report_mse(model, X_tr, y_tr, X_te, y_te, name):
    """Fit `model` on the training split and score it on both splits.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`; it is fitted in place.
    X_tr, y_tr : training features and target.
    X_te, y_te : test features and target.
    name : label given to the returned error Series.

    Returns
    -------
    tuple `(train_pred, test_pred, errors)` where `errors` is a `pd.Series`
    named `name` holding the mean squared error under the keys 'train' and 'test'.
    """
    model.fit(X_tr, y_tr)
    train_pred = model.predict(X_tr)
    test_pred = model.predict(X_te)
    errors = pd.Series({'train': mean_squared_error(y_tr, train_pred),
                        'test' : mean_squared_error(y_te, test_pred)},
                       name=name)
    return train_pred, test_pred, errors


LR = LinearRegression()

# Data that have not been one-hot encoded
y_train_pred, y_test_pred, errors_no_enc = _fit_and_report_mse(
    LR, X_train, y_train, X_test, y_test, name='no enc')

# Data that have been one-hot encoded. The same estimator object is refit here,
# so after this call `LR` holds the one-hot encoded fit.
y_train_ohc_pred, y_test_ohc_pred, errors_ohc = _fit_and_report_mse(
    LR, X_train_ohc, y_train_ohc, X_test_ohc, y_test_ohc, name='one-hot enc')

# Assemble the results: one column per variant, rows 'train' / 'test' (values are MSE)
error_df = pd.concat([errors_no_enc, errors_ohc], axis=1)
error_df
# The more parameters a model is able to use, the more likely it is to overfit the data.
# NOTE(review): in the recorded run, train and test MSE are nearly equal for both
# variants and the one-hot encoded model has the lower test error, so these numbers do
# not by themselves show that the model is too complex -- confirm the intended conclusion.

Unnamed: 0,no enc,one-hot enc
train,4865155000.0,4728483000.0
test,4856977000.0,4733529000.0


## Training and Evaluating on the Training Set