In [1]:
import pandas as pd
import numpy as np
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
%matplotlib inline

In [2]:
def ohe_the_df(dataframe):
    
    num_feats = ['beds', 'sq_footage', 'year_built',
                   'has_address_2', 'baths_incl_half', 'walk_score']
    cat_feats = ['city', 'parking_spots', 'clean_mls_types', 'laundry_type']

    num_feats_df = dataframe[num_feats]
    cat_feats_df = dataframe[cat_feats]
    
    ohe_X = ohe.transform(cat_feats_df)
    feature_labels = ohe.get_feature_names(cat_feats)
    cat_feats_ohe = pd.DataFrame(data = ohe_X, 
                                 columns=feature_labels,
                                index = num_feats_df.index)
    combined_dataframe = pd.concat([num_feats_df, cat_feats_ohe], axis = 1)
    return combined_dataframe

def get_cv_r2s(model, X, y):
    cv_r2s = cross_val_score(model, X, y, cv=5, scoring = 'r2')
    print('Scores: ', cv_r2s, '\n')
    print(f'Simple mean cv r^2: {np.mean(cv_r2s):.3f} +- {np.std(cv_r2s):.3f}')

In [29]:
with open('pickles/cleaned_210115_data_laundry_revised_add_ws.pickle', 'rb') as read_file:
    df = pickle.load(read_file)

**Model Constraint: units must be under 3100 sq. feet to be evaluated using this model**

In [30]:
dp_before = df.shape[0]
df = df[df.sq_footage <= 3100].reset_index()
dp_after = df.shape[0]
print(dp_before - dp_after, 'datapoints dropped')

40 datapoints dropped


In [31]:
X_all = df[['city',
       'beds', 'sq_footage', 'year_built', 'laundry_type',
       'parking_spots', 'has_address_2', 'baths_incl_half',
       'clean_mls_types', 'walk_score']]

y_all = df['price']

# train/test split
X, X_test, y, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 25)

In [32]:
# fit the ohe

num_feats = ['beds', 'sq_footage', 'year_built',
                   'has_address_2', 'baths_incl_half', 'walk_score']
cat_feats = ['city', 'parking_spots', 'clean_mls_types', 'laundry_type']

num_feats_df = X[num_feats]
cat_feats_df = X[cat_feats]

ohe = OneHotEncoder(sparse=False, handle_unknown = 'ignore')
ohe.fit(cat_feats_df)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=False)

---

**model 1: simple linear regression**
    
---

In [33]:
def m1_premod(X):  
    X_ohe = ohe_the_df(X)
    return X_ohe

lr_m1 = LinearRegression()

print(get_cv_r2s(lr_m1, m1_premod(X), y))

lr_m1.fit(m1_premod(X), y)

Scores:  [0.66712646 0.69126022 0.5841167  0.60933754 0.63753337] 

Simple mean cv r^2: 0.638 +- 0.038
None


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [34]:
m1_premod(X).columns

Index(['beds', 'sq_footage', 'year_built', 'has_address_2', 'baths_incl_half',
       'walk_score', 'city_Alameda', 'city_Albany', 'city_Atherton',
       'city_Belmont', 'city_Berkeley', 'city_Brisbane', 'city_Burlingame',
       'city_Campbell', 'city_Castro Valley', 'city_Cupertino',
       'city_Daly City', 'city_Dublin', 'city_East Palo Alto',
       'city_Emeryville', 'city_Foster City', 'city_Fremont', 'city_Gilroy',
       'city_Half Moon Bay', 'city_Hayward', 'city_Hillsborough',
       'city_La Honda', 'city_Livermore', 'city_Los Altos', 'city_Los Gatos',
       'city_Menlo Park', 'city_Millbrae', 'city_Milpitas', 'city_Morgan Hill',
       'city_Mountain View', 'city_Newark', 'city_Oakland', 'city_Pacifica',
       'city_Palo Alto', 'city_Piedmont', 'city_Pleasanton',
       'city_Portola Valley', 'city_Redwood City', 'city_San Bruno',
       'city_San Carlos', 'city_San Francisco', 'city_San Jose',
       'city_San Leandro', 'city_San Mateo', 'city_Santa Clara',
       'cit

In [35]:
coefs = zip(m1_premod(X).columns, lr_m1.coef_)
for i in coefs:
    print(i)

('beds', 46.29684686926407)
('sq_footage', 1.8438817907545078)
('year_built', -1.1448672853016575)
('has_address_2', 41.365209139859914)
('baths_incl_half', 631.6025294998827)
('walk_score', 4.292703835813256)
('city_Alameda', -1267.516705602226)
('city_Albany', -429.16055338696503)
('city_Atherton', 211.93386562512137)
('city_Belmont', 406.7310473542459)
('city_Berkeley', 255.13692191054386)
('city_Brisbane', -135.9692218847968)
('city_Burlingame', -210.25645907121728)
('city_Campbell', -227.46374652384668)
('city_Castro Valley', 688.4399608174303)
('city_Cupertino', -508.12258559602907)
('city_Daly City', -164.2892958828616)
('city_Dublin', -1329.1698677620054)
('city_East Palo Alto', -581.3744454573671)
('city_Emeryville', -117.88972802956351)
('city_Foster City', -187.9848036117106)
('city_Fremont', -489.0381855586327)
('city_Gilroy', -782.5132736413474)
('city_Half Moon Bay', 880.0799363518784)
('city_Hayward', -1164.9077882391161)
('city_Hillsborough', 1172.7918552696603)
('city_

---

**model 2: feature engineering v1**

---

In [36]:
def m2_premod(X):    
    X_ohe = ohe_the_df(X)
    X_eng = X_ohe.copy()
    
    X_eng['walk_score_sq_footage'] = X_ohe['walk_score']*X_ohe['sq_footage']
    X_eng['walk_score_year_built'] = X_ohe['walk_score']*X_ohe['year_built']
    X_eng['mls_type_year_built'] = X_ohe['clean_mls_types_house']*X_ohe['year_built']
    X_eng['beds_baths'] = X_ohe['beds']*X_ohe['baths_incl_half']
    
   
    
    return X_eng

lr_m2 = LinearRegression()

print(get_cv_r2s(lr_m2, m2_premod(X), y))

lr_m2.fit(m2_premod(X), y)

Scores:  [0.66286909 0.72460829 0.59961024 0.62995196 0.66705125] 

Simple mean cv r^2: 0.657 +- 0.042
None


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

---

**model 3: feature engineering v2**
    
---

In [37]:
def m3_premod(X):
    X_ohe = ohe_the_df(X)
    X_eng = X_ohe.copy()
    
    X_eng['walk_score_sq_footage'] = X_ohe['walk_score']*X_ohe['sq_footage']
    X_eng['walk_score_year_built'] = X_ohe['walk_score']*X_ohe['year_built']
    X_eng['mls_type_year_built'] = X_ohe['clean_mls_types_house']*X_ohe['year_built']
    X_eng['beds_baths'] = X_ohe['beds']*X_ohe['baths_incl_half']
    
    X_eng['beds_walk_score'] = X_ohe['walk_score']*X_ohe['beds']
    X_eng['address_2_walk_score'] = X_ohe['walk_score']*X_ohe['has_address_2']
    X_eng['year_built_address_2'] = X_ohe['year_built']*X_ohe['has_address_2']
    
    return X_eng

lr_m3 = LinearRegression()

print(get_cv_r2s(lr_m3, m3_premod(X), y))

lr_m3.fit(m3_premod(X), y)

Scores:  [0.65954506 0.72316851 0.60717657 0.65136845 0.67807882] 

Simple mean cv r^2: 0.664 +- 0.038
None


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

---

**model 4: feature engineering v3**

---

In [38]:
def m4_premod(X):
    X_ohe = ohe_the_df(X)
    X_eng = X_ohe.copy()
    
    X_eng['walk_score_sq_footage'] = X_ohe['walk_score']*X_ohe['sq_footage']
    X_eng['walk_score_year_built'] = X_ohe['walk_score']*X_ohe['year_built']
    X_eng['mls_type_year_built'] = X_ohe['clean_mls_types_house']*X_ohe['year_built']
    X_eng['beds_baths'] = X_ohe['beds']*X_ohe['baths_incl_half']
    
    X_eng['beds_walk_score'] = X_ohe['walk_score']*X_ohe['beds']
    X_eng['address_2_walk_score'] = X_ohe['walk_score']*X_ohe['has_address_2']
    X_eng['year_built_address_2'] = X_ohe['year_built']*X_ohe['has_address_2']
    
    X_eng['walk_score_walk_score'] = X_ohe['walk_score']**3
    
    '''
    the below features were tested, but reduced the R^2
    '''
    # X_eng['walk_score_walk_score'] = X_ohe['walk_score']**2
    # X_eng['beds_beds_beds'] = X_ohe['beds']**3
    # X_eng['beds_beds_beds'] = X_ohe['beds']**3
    
    
    
    return X_eng

lr_m4 = LinearRegression()
    
print(get_cv_r2s(lr_m4, m4_premod(X), y))

lr_m4.fit(m4_premod(X), y)

Scores:  [0.66133568 0.7263213  0.60932311 0.64955064 0.67502474] 

Simple mean cv r^2: 0.664 +- 0.038
None


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

---

**model 5: consolidate mls_types**

---

In [48]:
def m5_premod(X):
    '''
    consolidate some of the mls_type columns with the hope of making fewer,  
    but stronger mls_type features
    '''
    X_ohe = ohe_the_df(X)
    X_eng = X_ohe.copy()
    
    X_eng['walk_score_sq_footage'] = X_ohe['walk_score']*X_ohe['sq_footage']
    X_eng['walk_score_year_built'] = X_ohe['walk_score']*X_ohe['year_built']
    X_eng['mls_type_year_built'] = X_ohe['clean_mls_types_house']*X_ohe['year_built']
    X_eng['beds_baths'] = X_ohe['beds']*X_ohe['baths_incl_half']
    
    X_eng['beds_walk_score'] = X_ohe['walk_score']*X_ohe['beds']
    X_eng['address_2_walk_score'] = X_ohe['walk_score']*X_ohe['has_address_2']
    X_eng['year_built_address_2'] = X_ohe['year_built']*X_ohe['has_address_2']
    
    X_eng['walk_score_walk_score'] = X_ohe['walk_score']**3
    
    ''''''
    
    X_eng['clean_mls_types_house'] = X_ohe['clean_mls_types_house']
    + X_ohe['clean_mls_types_single-family']
    X_eng['clean_mls_types_condo'] = X_ohe['clean_mls_types_condo']
    + X_ohe['clean_mls_types_duplex'] + X_ohe['clean_mls_types_townhouse']
    X_eng['clean_mls_types_other'] = X_ohe['clean_mls_types_other'] 
    + X_ohe['clean_mls_types_other rental property']
    
    X_eng = X_eng.drop(['clean_mls_types_single-family',
                        'clean_mls_types_duplex',
                        'clean_mls_types_townhouse',
                        'clean_mls_types_other rental property'
                       ], axis = 1)
    
    return X_eng

lr_m5 = LinearRegression()

print(get_cv_r2s(lr_m5, m5_premod(X), y))

lr_m5.fit(m5_premod(X), y)

Scores:  [0.66633396 0.72537886 0.60855783 0.65733205 0.67192336] 

Simple mean cv r^2: 0.666 +- 0.037
None


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [49]:
len(m5_premod(X).columns)

75

---

**model 6: use model 5's preprocessing, but use lassoCV instead**

---

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score

In [51]:
m6_X_tr = m5_premod(X)

In [52]:
s_m6 = StandardScaler()
s_m6.fit(m6_X_tr.values)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [53]:
m6_X_tr = s_m6.transform(m6_X_tr.values)
m6_X_te = s_m6.transform(m5_premod(X_test).values)

In [54]:
alphavec = 10**np.linspace(-2,2,200)

ridge_m6 = RidgeCV(alphas = alphavec, cv=5)
ridge_m6.fit(m6_X_tr, y)

RidgeCV(alphas=array([1.00000000e-02, 1.04737090e-02, 1.09698580e-02, 1.14895100e-02,
       1.20337784e-02, 1.26038293e-02, 1.32008840e-02, 1.38262217e-02,
       1.44811823e-02, 1.51671689e-02, 1.58856513e-02, 1.66381689e-02,
       1.74263339e-02, 1.82518349e-02, 1.91164408e-02, 2.00220037e-02,
       2.09704640e-02, 2.19638537e-02, 2.30043012e-02, 2.40940356e-02,
       2.52353917e-02, 2.64308149e-0...
       4.15040476e+01, 4.34701316e+01, 4.55293507e+01, 4.76861170e+01,
       4.99450512e+01, 5.23109931e+01, 5.47890118e+01, 5.73844165e+01,
       6.01027678e+01, 6.29498899e+01, 6.59318827e+01, 6.90551352e+01,
       7.23263390e+01, 7.57525026e+01, 7.93409667e+01, 8.30994195e+01,
       8.70359136e+01, 9.11588830e+01, 9.54771611e+01, 1.00000000e+02]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [56]:
ridge_m6.alpha_

0.03827494478516311

In [57]:
predicted_y_values = ridge_m6.predict(m6_X_te)

print('rmse: ', np.sqrt(mean_squared_error(y_test, predicted_y_values)))
print('r2: ', r2_score(y_test, predicted_y_values))

rmse:  1005.1507743686847
r2:  0.6729348058370268
