## Housing Prices in California (for sale)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import numpy as np

In [None]:
# Load the US housing-price listings from a CSV in the working directory
csv_path = 'housing_prices_us.csv'
data = pd.read_csv(csv_path)
# Show the first rows as a quick sanity check on the load
data.head()

In [None]:
# Restrict the listings to California homes that are currently for sale
in_california = data['state'] == 'California'
is_for_sale = data['status'] == 'for_sale'
data = data[in_california & is_for_sale]
# Number of listings remaining after the filter
len(data)

In [None]:
# Clean the data
# Keep only rows whose zip code is present and non-zero.
# .copy() makes the result an independent frame, so the column assignments
# below do not raise pandas' SettingWithCopyWarning on a filtered slice.
has_zip_code = data['zip_code'].notna() & (data['zip_code'] != 0.0)
data = data[has_zip_code].copy()
# Go through int before str so a zip stored as a float (e.g. 90210.0)
# is rendered without a trailing '.0'.
data['zip_code'] = data['zip_code'].astype(int).astype(str)

# Text columns as strings.
# NOTE: astype(str) turns a missing value into the literal string 'nan'.
data['status'] = data['status'].astype(str)
data['city'] = data['city'].astype(str)

# Numeric columns as integers. Missing values become 0 so the filter below
# removes them; astype(int) truncates any fractional part.
data['bed'] = data['bed'].fillna(0).astype(int)
data['bath'] = data['bath'].fillna(0).astype(int)
data['price'] = data['price'].fillna(0).astype(int)
data['house_size'] = data['house_size'].fillna(0).astype(int)

# Filter out rows where bed, bath, price or house size is zero
has_rooms = (data['bed'] != 0) & (data['bath'] != 0)
has_price_and_size = (data['price'] != 0) & (data['house_size'] != 0)
data = data[has_rooms & has_price_and_size]


In [None]:
# Ensure data types: display each column's dtype to confirm the casts
# applied during cleaning took effect
data.dtypes

In [None]:
# Explore the data: list the available columns to pick candidate features
data.columns
# Likely important features: house size, bed, bath, city, zip_code

In [None]:
# Preview the first rows of the filtered and cleaned data
data.head()

In [None]:
# Linear regression model- human chosen features
# Work only with the hand-picked columns and discard rows with any missing value
chosen_columns = ['price', 'bed', 'bath', 'acre_lot', 'city', 'house_size']
subset = data[chosen_columns].dropna()

# The target is the listing price
labels = subset['price']

# Every other column is a predictor; city is one-hot encoded into dummy columns
predictor_mask = subset.columns != 'price'
features = pd.get_dummies(subset.loc[:, predictor_mask], columns=['city'])

In [None]:
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and therefore the saved files) differs from run to run.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.2)

# Persist each split so the exact train/test partition can be reloaded later
features_train.to_csv('features_train.csv')
features_test.to_csv('features_test.csv')
labels_train.to_csv('labels_train.csv')
# Write the *test* labels here (previously the training labels were saved
# under this file name by mistake)
labels_test.to_csv('labels_test.csv')

In [None]:
# Linear regression estimator, constructed with its default settings
model = LinearRegression()

In [None]:
# Fit the regression on the training split only; the test split stays unseen
model.fit(features_train, labels_train)

In [None]:
# Show the fitted coefficients of the linear model
print(model.coef_)

In [None]:
# Predict prices for both splits so train and test error can be compared
train_predictions, test_predictions = (
    model.predict(split) for split in (features_train, features_test)
)

In [None]:
# Testing
# mean_squared_error returns the MSE, so train_acc / test_acc hold squared
# errors despite their names (names kept for any later cells that use them).
train_acc = mean_squared_error(labels_train, train_predictions)
test_acc = mean_squared_error(labels_test, test_predictions)
# The square root gives RMSE, which is in the same units as price.
# Report both splits: a test RMSE well above the train RMSE points to
# overfitting, which cannot be judged from the test value alone.
print('Train RMSE:', np.sqrt(train_acc))
print('Test RMSE:', np.sqrt(test_acc))

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the linear model on the held-out split
r2 = r2_score(y_true=labels_test, y_pred=test_predictions)
print(r2)

In [None]:
# Gradient boosting model- model chosen features
## Improving on the linear model by using GridSearchCV to find the best
## hyperparameters for a GradientBoostingRegressor
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Four candidate values for each of four hyperparameters: 4**4 = 256
# combinations, each evaluated with 2-fold cross-validation.
parameters = {
    'learning_rate': [0.01, 0.02, 0.03, 0.04],
    'subsample': [0.9, 0.5, 0.2, 0.1],
    'n_estimators': [100, 500, 1000, 1500],
    'max_depth': [4, 6, 8, 10],
}

GBR = GradientBoostingRegressor()
# n_jobs=-1 runs the candidate fits in parallel on all available cores
grid_GBR = GridSearchCV(estimator=GBR, param_grid=parameters, cv=2, n_jobs=-1)
grid_GBR.fit(features_train, labels_train)

# Summarise the winning configuration and its cross-validated score
print(" Results from Grid Search " )
report_lines = (
    ("\n The best estimator across ALL searched params:\n", grid_GBR.best_estimator_),
    ("\n The best score across ALL searched params:\n", grid_GBR.best_score_),
    ("\n The best parameters across ALL searched params:\n", grid_GBR.best_params_),
)
for heading, value in report_lines:
    print(heading, value)

In [None]:
# Testing

In [None]:
# LIME- further explain feature relevance

In [None]:
# Add visualizations