## Housing Prices in California (for sale)

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [15]:
# Load the US housing listings CSV into a DataFrame and preview the first rows
listings_csv = 'housing_prices_us.csv'
data = pd.read_csv(listings_csv)
data.head()

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,


In [16]:
# Restrict the listings to California properties that are currently for sale
is_california = data['state'] == 'California'
is_for_sale = data['status'] == 'for_sale'
data = data[is_california & is_for_sale]
# Number of listings that remain after filtering
len(data)

101034

In [17]:
# Clean the data
# Keep rows whose zip code is present and non-zero. The explicit .copy() makes
# `data` an independent frame, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning by writing into a filtered slice.
data = data[data['zip_code'].notna() & (data['zip_code'] != 0.0)].copy()
# Zip codes are identifiers, not quantities: drop the float ".0" and store as text
data['zip_code'] = data['zip_code'].astype(int).astype(str)

# Status and city as strings
# NOTE(review): astype(str) turns a missing city into the literal text 'nan',
# which a later dropna() will not remove - confirm whether city can be missing.
data['status'] = data['status'].astype(str)
data['city'] = data['city'].astype(str)

# Cast the count/size/price columns to integers; missing values become 0 so
# that the filter below discards them together with genuine zeros
numeric_columns = ['bed', 'bath', 'price', 'house_size']
for column in numeric_columns:
    data[column] = data[column].fillna(0).astype(int)

# Filter out rows where bed, bath, price or house_size is zero (or was missing)
data = data[(data[numeric_columns] != 0).all(axis=1)]


In [18]:
# Inspect the column dtypes to confirm the casts from the cleaning step took effect
data.dtypes

brokered_by       float64
status             object
price               int32
bed                 int32
bath                int32
acre_lot          float64
street            float64
city               object
state              object
zip_code           object
house_size          int32
prev_sold_date     object
dtype: object

In [19]:
# Explore the data - list the available columns to pick candidate features
data.columns
# Likely important features: house_size, bed, bath, city, zip_code

Index(['brokered_by', 'status', 'price', 'bed', 'bath', 'acre_lot', 'street',
       'city', 'state', 'zip_code', 'house_size', 'prev_sold_date'],
      dtype='object')

In [20]:
# Preview the first rows of the filtered and cleaned listings
data.head()

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
1208977,4311.0,for_sale,199900,3,1,0.18,1466188.0,Blythe,California,92225,1014,
1208998,4311.0,for_sale,172999,3,2,0.16,987585.0,Blythe,California,92225,1132,1984-06-29
1209109,64877.0,for_sale,79900,4,2,0.16,1533451.0,Blythe,California,92225,1272,
1209110,54422.0,for_sale,69000,3,1,0.91,1626662.0,Blythe,California,92225,1134,
1209111,109780.0,for_sale,75000,3,2,0.33,540514.0,Blythe,California,92225,1248,


In [21]:
# Linear regression model - human-chosen features
# Keep only the modelling columns and discard rows with any missing value
model_columns = ['price', 'bed', 'bath', 'acre_lot', 'city', 'zip_code', 'house_size']
subset = data[model_columns].dropna()
# The target is the listing price; every other column is a predictor
labels = subset['price']
# One-hot encode the categorical predictors (city and zip code)
features = pd.get_dummies(subset.drop(columns='price'), columns=['city', 'zip_code'])

In [22]:
# Hold out 20% of the rows for evaluation. The split is seeded so that
# Restart & Run All reproduces the same partition, and therefore the same
# fitted coefficients and error metrics, on every run.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Ordinary least squares regression with default settings
model = LinearRegression()

In [None]:
# Fit the regression on the training split only; the test split is kept for evaluation
model.fit(features_train, labels_train)

In [None]:
# Pair every fitted coefficient with its feature name; a bare array of
# one-hot coefficients cannot be interpreted because the names are lost.
coefficients = pd.Series(model.coef_, index=features_train.columns, name='coefficient')
# Show only the 20 largest coefficients by magnitude instead of dumping the whole vector
coefficients.loc[coefficients.abs().nlargest(20).index]

In [None]:
# Predict prices for both splits so training and test error can be compared
train_predictions, test_predictions = (
    model.predict(split_features)
    for split_features in (features_train, features_test)
)

In [None]:
# Testing
# NOTE: despite the `_acc` names these are mean squared errors (in squared
# price units), so lower is better. The names are kept so that any later cell
# referring to them keeps working.
train_acc = mean_squared_error(labels_train, train_predictions)
test_acc = mean_squared_error(labels_test, test_predictions)
# Report both splits: a test error far above the training error indicates overfitting
print(f'Train MSE: {train_acc:,.2f}')
print(f'Test MSE:  {test_acc:,.2f}')

In [None]:
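# Linear regression model - model-chosen features
## Improving our model by using GridSearchCV to find the best hyperparameters
# TODO: the draft below was written for a DecisionTreeClassifier and refers to
# `train_data` and `target`, which are not defined in this notebook; it must be
# adapted to a regressor and to features_train / labels_train before it can run.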
# from sklearn import svm, datasets
# from sklearn.model_selection import GridSearchCV
# hyperparameters = {'min_samples_leaf': [1, 10, 50, 100, 200, 300], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20]}
# clf = DecisionTreeClassifier()
# search = GridSearchCV(clf, hyperparameters, cv = 6, return_train_score = True)
# search.fit(train_data[features], train_data[target])
# params = search.best_params_
# best_depth = params['max_depth']
# print(best_depth)

In [None]:
# Testing

In [None]:
# LIME- further explain feature relevance

In [None]:
# Add visualizations