In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [2]:
df = pd.read_csv('training.csv')
df.head()

df = df.dropna(subset=['BEDS', 'BATHS', 'SQUARE_FEET', 'CITY', 'PRICE'])
df['CITY'] = df.apply(lambda row : row['CITY'].upper().strip(), axis=1)
df['HOA'] = df['HOA'].fillna(0)

ALL_CITIES = sorted(df['CITY'].unique())
ALL_PROPERTY_TYPES = sorted(df['PROPERTY_TYPE'].unique())

CLEANED_CITIES = ALL_CITIES.copy()
CLEANED_CITIES.remove("BELMONT")
CLEANED_P_TYPES = ALL_PROPERTY_TYPES.copy()
CLEANED_P_TYPES.remove("TOWNHOUSE")

In [3]:
city_encodings = pd.get_dummies(df['CITY'])
property_type_encodings = pd.get_dummies(df['PROPERTY_TYPE'])
filtered_df = df[['BEDS', 'BATHS', 'SQUARE_FEET', 'HOA', 'PRICE']]

merged = pd.concat([filtered_df, city_encodings, property_type_encodings], axis=1)
merged.head()

Unnamed: 0,BEDS,BATHS,SQUARE_FEET,HOA,PRICE,BELMONT,BURLINGAME,CAMPBELL,CUPERTINO,DALY CITY,...,STANFORD,SUNNYVALE,UNION CITY,WOODSIDE,CONDO,MOBILE_HOME,MULTI_FAMILY_2_4,MULTI_FAMILY_5,SINGLE_FAMILY_HOME,TOWNHOUSE
0,3.0,3.5,1925.0,250.0,1094000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,4.0,4.5,1917.0,121.0,1375000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2.0,1.5,1006.0,323.0,600000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3.0,2.0,1117.0,0.0,1375000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4.0,2.0,1324.0,0.0,1300000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
final = merged.drop(['BELMONT', 'TOWNHOUSE'], axis=1)

X = final.drop(['PRICE'], axis=1)
y = final['PRICE']

y.head()

0    1094000.0
1    1375000.0
2     600000.0
3    1375000.0
4    1300000.0
Name: PRICE, dtype: float64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LinearRegression().fit(X_train, y_train)

model.score(X_test, y_test)

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(model.coef_))], axis = 1)
coefficients.head(50)

import statsmodels.api as sm
est = sm.OLS(y_train, X_train)
print(est.fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                  PRICE   R-squared (uncentered):                   0.931
Model:                            OLS   Adj. R-squared (uncentered):              0.930
Method:                 Least Squares   F-statistic:                              2814.
Date:                Tue, 18 Jan 2022   Prob (F-statistic):                        0.00
Time:                        15:55:37   Log-Likelihood:                     -1.2069e+05
No. Observations:                8217   AIC:                                  2.414e+05
Df Residuals:                    8178   BIC:                                  2.417e+05
Df Model:                          39                                                  
Covariance Type:            nonrobust                                                  
                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------

In [10]:
N_BEDS = 2
N_BATHS = 1
SQ_FT = 968
HOA = 400
CITY = "palo alto".upper()
PROPERTY_TYPE = "townhouse".upper()
    
# create input for prediction
x_in = [N_BEDS, N_BATHS, SQ_FT, HOA]
cities = [0 for x in CLEANED_CITIES]
property_types = [0 for x in CLEANED_P_TYPES]

if PROPERTY_TYPE not in ALL_PROPERTY_TYPES:
    raise Exception("Property type {} not found in expected types: {}".format(PROPERTY_TYPE, str(PROPERTY_TYPES)))
else:
    try: 
        property_types[CLEANED_P_TYPES.index(PROPERTY_TYPE)] = 1
    except ValueError:
        pass
if CITY not in ALL_CITIES:
    raise Exception("City {} not found in expected cities: {}".format(CITY, str(CITIES)))
else:
    try: 
        cities[CLEANED_CITIES.index(CITY)] = 1
    except ValueError:
        pass  

x_in += cities
x_in += property_types

print(x_in)

model.predict([x_in])

[2, 1, 968, 400, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]




array([2314505.822732])