In [1]:
#Author: Christine Nguyen

import pandas as pd
# Disable truncation when DataFrames are displayed: a limit of None means
# every row and every column is shown.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
# Load the pickled sales DataFrame from the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
sales = pd.read_pickle('sales_cleaned_data.pkl')

In [5]:
# Preview the first rows to sanity-check the loaded columns and values.
sales.head()

Unnamed: 0,CoolingYN,FireplaceYN,HighSchoolDistrict,LaundryYN,MLSAreaMajor,PoolPrivateYN,SignOnPropertyYN,ViewYN,quantile_closePrice,BathroomsTotalInteger,ClosePrice,ParkingTotal,PhotosCount,LivingArea,YearBuilt,ParcelNumber
0,1,1,Unknown,1,699 - Not Defined,0,3,1,"(1175000.0, 1275000.0]",2,1225000.0,2.394564,2,1615.0,1955.0,7572011000.0
1,0,1,Unknown,3,699 - Not Defined,0,0,1,"(1275000.0, 1400000.0]",2,1285000.0,2.394564,20,1800.0,1949.0,7572008000.0
4,0,1,Unknown,3,699 - Not Defined,0,1,1,"(1400000.0, 1580040.0]",3,1455000.0,2.0,32,1921.0,1960.0,7564011000.0
5,1,1,Unknown,1,699 - Not Defined,3,1,1,"(1580040.0, 1929200.0]",3,1900000.0,2.0,31,3580.0,1962.0,7558006000.0
6,0,1,Palos Verdes Peninsula Unified,1,162 - Monte Malaga,0,1,1,"(1400000.0, 1580040.0]",3,1500000.0,2.0,27,2942.0,1967.0,7578021000.0


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [35]:
# OneHotEncoder expects each column to hold one consistent kind of value
# (all strings or all numbers), so cast every column to an explicit dtype.
# YearBuilt is cast to an integer here as well, although it is later used as
# a numerical feature rather than a categorical one.
column_dtypes = {
    'HighSchoolDistrict': 'str',
    'MLSAreaMajor': 'str',
    'LaundryYN': 'int64',
    'PoolPrivateYN': 'int64',
    'SignOnPropertyYN': 'int64',
    'ViewYN': 'int64',
    'FireplaceYN': 'int64',
    'CoolingYN': 'int64',
    'YearBuilt': 'int64',
}
for column_name, target_dtype in column_dtypes.items():
    sales[column_name] = sales[column_name].astype(target_dtype)

In [13]:
# Numeric features; these are the columns passed to the scaler.
# 'ClosePrice' is deliberately excluded because it is the prediction target.
numerical = [
    'BathroomsTotalInteger',
    'PhotosCount',
    'LivingArea',
    'YearBuilt',
    'ParkingTotal',
]

# Categorical features; these are the columns passed to the one-hot encoder.
# 'quantile_closePrice' is deliberately excluded (it appears to be a binned
# version of the close price, which would leak the target -- TODO confirm).
categorical = [
    'MLSAreaMajor',
    'ViewYN',
    'FireplaceYN',
    'LaundryYN',
    'CoolingYN',
    'HighSchoolDistrict',
    'PoolPrivateYN',
    'SignOnPropertyYN',
]

# Evaluate the pipeline

In [68]:
from sklearn.pipeline import make_pipeline
# from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

In [112]:
# LinearRegression is instantiated below, but its import in the earlier import
# cell is commented out; import it here so this cell runs on a fresh kernel.
from sklearn.linear_model import LinearRegression

# One-hot encode the categorical variables into a dense array. Categories that
# were not seen during fit are encoded as all zeros instead of raising.
# Newer scikit-learn releases renamed the `sparse` keyword to `sparse_output`
# and later removed the old name, so try the new keyword first and fall back
# to the old one on releases that do not know it.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

scaler = preprocessing.StandardScaler()  # scales the continuous variables
linreg = LinearRegression()  # the linear regression model

# Seed the forest so cross-validation scores and predictions are reproducible;
# the seed matches the one used for the train/test split.
rf = RandomForestRegressor(random_state=42)

In [107]:
from sklearn.model_selection import cross_val_score

In [23]:
# Names of the independent variables: numerical columns first, then categorical.
X = [*numerical, *categorical]
# Target variable column.
y = sales['ClosePrice']

In [108]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    sales[X],
    y,
    test_size=0.2,
    random_state=42,
)

In [109]:
# Preprocessing step: one-hot encode the categorical columns and standardize
# the numerical ones. Any column not named in either list is passed through
# unchanged.
ct = make_column_transformer(
    (ohe, categorical),
    (scaler, numerical),
    remainder='passthrough',
)

# The transformer is intentionally not fitted here. It is fitted as part of
# the pipeline on the training data only; calling fit_transform on the test
# set would re-learn the categories and scaling from held-out data.

In [None]:
# Chain the column transformer and the random forest so that preprocessing is
# fitted together with the model on whatever data the pipeline is given.
pipe_rf = make_pipeline(ct, rf)

In [111]:
# Mean score across 5 cross-validation folds of the training split. No
# `scoring` argument is given, so the estimator's default scorer is used
# (R^2 for a regressor). This is a validation estimate, not a training score.
cross_val_score(pipe_rf, X_train, y_train, cv=5).mean() #mean cross-validated score for random forest regressor

0.6600885599298284

In [115]:
# Fit the whole pipeline (preprocessing + random forest) on the training split.
pipe_rf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='ignore',
                                                                sparse=False),
                                                  ['MLSAreaMajor', 'ViewYN',
                                                   'FireplaceYN', 'LaundryYN',
                                                   'CoolingYN...
                 RandomForestRegressor(b

In [116]:
# Predict the target for the held-out test rows.
# NOTE(review): this displays the entire prediction array and the result is
# not stored; no test-set metric is computed in the cells visible here --
# consider assigning the predictions and scoring them against y_test.
pipe_rf.predict(X_test)

array([1209095.        ,  695257.49      ,  716875.        ,
        894947.6       , 1744333.99      ,  728270.        ,
       1686909.828     ,  810325.        , 1196093.8       ,
       1680031.316     , 1011044.66666667, 1293978.638     ,
       1181534.98      , 1499079.03      ,  873988.98      ,
       1548244.242     ,  959145.49      , 1372759.        ,
       1113100.18333333,  963009.25      , 1051517.59      ,
       1002235.        , 1242373.75      , 1371892.298     ,
       1149360.76      , 1248601.138     , 1117541.2       ,
        631413.        , 1489669.398     , 1040525.07      ,
       1607517.344     , 1514729.104     , 1550415.414     ,
       1386532.05      , 1574952.226     , 1449431.964     ,
       1329749.328     , 1119435.414     , 1140198.        ,
        933211.45      , 1354493.86      , 1287339.6       ,
       1202518.078     , 1408365.29      , 1299275.904     ,
       1450820.388     , 1794906.472     ,  942314.        ,
       1211112.        ,