In [3]:
import pandas as pd
import numpy as np
import sklearn as skl

In [110]:
# 1
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
housing = fetch_california_housing(as_frame=True)
d = housing.frame
d.head()

#sumamry stats
d.isnull().sum() # no nulls great!
d.describe()
d.skew()

# log-transform right-skewed cols
skew_cols = ['AveRooms','AveBedrms','Population','AveOccup']
for col in skew_cols:
    d[f"Log_{col}"] = np.log(d[col])


# explore corr between target and feature vars
d.corr()['MedHouseVal']

# drop features with low correlation whose log transform has higher correlation
d = d.drop(['AveOccup','Population', 'AveBedrms','AveRooms'], axis=1)

# standardize with minmax scaler for ridge
scaler = MinMaxScaler()

d_s = d.copy()
d_s[d.columns.tolist()] = scaler.fit_transform(d_s[d.columns.tolist()])


In [99]:
# 2
X = d.drop(['MedHouseVal'], axis=1)
y = d['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

X_s = d_s.drop(['MedHouseVal'], axis=1)
y_s = d_s['MedHouseVal']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_s, y_s, test_size=0.2, random_state = 42)

In [100]:
# Model 1: Ridge
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
r = Ridge()
r.fit(X_train_s, y_train_s)
y_preds = r.predict(X_test_s)
mae = mean_absolute_error(y_test_s, y_preds)
print(f"Coefs for Ridge: {r.coef_}")
print(f"Intercept for Ridge: {r.intercept_}")
print(f"R-squared: {r2_score(y_test_s,y_preds)}")
print(f"MAE for Ridge: {mae}")

# model 2: random forest
rf = RandomForestRegressor(
    
)
rf.fit(X_train, y_train)
y_preds = rf.predict(X_test)
mae = mean_absolute_error(y_preds, y_test)
print(f"MAE for RF: {mae}")
print(f"R-squared for RF: {r2_score(y_test,y_preds)}")


Coefs for Ridge: [ 1.30252624  0.1181386  -0.79175358 -0.83846035 -0.56464552  0.93937062
  0.11187431 -1.42443504]
Intercept for Ridge: 0.8482102079434827
R-squared: 0.6497577529149967
MAE for Ridge: 0.1011965362368499
MAE for RF: 0.3285248848837211
R-squared for RF: 0.8057832339124946


In [113]:
# Best model: Random Forest
# Improvements 
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print(f"feature importances in RF baseline:\n {feature_importances}")
# MedInc is most important feature by about ~0.4, with coef of 0.52
# feature engineering:
d['MedInc_AveOccup_Interaction'] = d['MedInc']/d['Log_AveOccup']
d2 = d.copy()
X = d2.drop(['MedHouseVal'], axis=1)
y = d2['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
rf = RandomForestRegressor(
    
)
rf.fit(X_train, y_train)
y_preds = rf.predict(X_test)
mae = mean_absolute_error(y_preds, y_test)
print(f"MAE for RF: {mae}")
print(f"R-squared for RF: {r2_score(y_test,y_preds)}")


feature importances in RF baseline:
 MedInc                         0.048137
HouseAge                       0.042459
Latitude                       0.100234
Longitude                      0.107800
Log_AveRooms                   0.029139
Log_AveBedrms                  0.025176
Log_Population                 0.028499
Log_AveOccup                   0.029718
MedInc_AveOccup_Interaction    0.588838
dtype: float64
MAE for RF: 0.3081582569525195
R-squared for RF: 0.8206603732875758


In [121]:
# Final MAE:
final_mae = 0.3081582569525195
y_final = pd.DataFrame()
y_final['predictions'] = y_preds

y_final.to_csv('final_predictions.csv')