In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression, f_classif

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from pathlib import Path
path = Path('')
import warnings
warnings.filterwarnings("ignore")

**Find the original data here:**\
https://opendata-downloads.s3.amazonaws.com/opa_properties_public.csv \
**Find the data codebook here**:\
https://metadata.phila.gov/#home/datasetdetails/5543865f20583086178c4ee5/representationdetails/55d624fdad35c7e854cb21a4/?view_287_page=1

In [2]:
# read data
data = pd.read_csv('opa_properties_public.csv', header=0)
print(data.shape)
print(data.columns)
data.head()

(581191, 78)
Index(['objectid', 'assessment_date', 'basements', 'beginning_point',
       'book_and_page', 'building_code', 'building_code_description',
       'category_code', 'category_code_description', 'census_tract',
       'central_air', 'cross_reference', 'date_exterior_condition', 'depth',
       'exempt_building', 'exempt_land', 'exterior_condition', 'fireplaces',
       'frontage', 'fuel', 'garage_spaces', 'garage_type',
       'general_construction', 'geographic_ward', 'homestead_exemption',
       'house_extension', 'house_number', 'interior_condition', 'location',
       'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
       'mailing_city_state', 'mailing_street', 'mailing_zip', 'market_value',
       'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms',
       'number_of_rooms', 'number_stories', 'off_street_open',
       'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape',
       'quality_grade', 'recording_date', 'registry_nu

Unnamed: 0,objectid,assessment_date,basements,beginning_point,book_and_page,building_code,building_code_description,category_code,category_code_description,census_tract,...,unit,utility,view_type,year_built,year_built_estimate,zip_code,zoning,pin,lat,lng
0,96022750,2022-03-08 14:22:35,,SWC REED ST,53764990.0,SR,VACANT LAND RESIDE < ACRE,14,,32.0,...,,,I,,,19146.0,RSA5,1001681390,-75.185501,39.934553
1,96022751,2022-03-08 14:16:10,,SWC REED ST,53764990.0,SR,VACANT LAND RESIDE < ACRE,2,Multi Family,32.0,...,,,I,,,19146.0,RSA5,1001681388,-75.185177,39.934722
2,96022752,2022-03-08 14:14:15,,SWC REED ST,53764990.0,SR,VACANT LAND RESIDE < ACRE,14,,32.0,...,,,I,,,19146.0,RSA5,1001681384,-75.185423,39.93491
3,96022753,2022-03-08 14:08:24,,SWC REED ST,53764990.0,SR,VACANT LAND RESIDE < ACRE,6,Vacant Land,32.0,...,,,I,,,19146.0,RSA5,1001681383,-75.185423,39.93491
4,96022754,2022-03-08 14:14:36,,SWC REED ST,53764990.0,SR,VACANT LAND RESIDE < ACRE,6,Vacant Land,32.0,...,,,I,,,19146.0,RSA5,1001681385,-75.185151,39.934839


### Step 1: select related columns 

In [3]:
# place columns of interest into a list
cols_interest = ['assessment_date', 'basements', 'building_code_description', 'category_code_description','census_tract',
                 'central_air', 'depth', 'exempt_building', 'exempt_land', 'exterior_condition', 
                 'fireplaces', 'frontage', 'fuel', 'garage_spaces', 'garage_type', 
                 'geographic_ward', 'house_extension', 'house_number', 'interior_condition', 
                 'market_value', 'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms',
                 'number_stories', 'other_building', 'parcel_number', 'parcel_shape','quality_grade',
                 'recording_date', 'sale_date', 'sale_price', 'separate_utilities', 'sewer', 
                 'site_type', 'street_designation', 'street_direction', 'taxable_building', 'taxable_land', 
                 'topography', 'total_area', 'total_livable_area', 'type_heater', 'unfinished', 
                 'unit', 'utility', 'view_type', 'year_built', 'year_built_estimate', 
                 'zip_code']
len(cols_interest)

50

In [4]:
# read in data using columns of interest 
data_step1 = data[cols_interest]
# compute percentage of missing values for each column
data_step1.isnull().sum()/len(data_step1)

assessment_date              0.940806
basements                    0.437612
building_code_description    0.000022
category_code_description    0.000241
census_tract                 0.000071
central_air                  0.505662
depth                        0.001173
exempt_building              0.000372
exempt_land                  0.000372
exterior_condition           0.047583
fireplaces                   0.007855
frontage                     0.001182
fuel                         0.973475
garage_spaces                0.008280
garage_type                  0.137328
geographic_ward              0.000071
house_extension              0.954120
house_number                 0.000000
interior_condition           0.048943
market_value                 0.000372
market_value_date            1.000000
number_of_bathrooms          0.007688
number_of_bedrooms           0.007042
number_of_rooms              0.057768
number_stories               0.007030
other_building               0.997751
parcel_numbe

### Step 2: drop columns with more than 90% missing values 

In [5]:
# drop columns with over 90% missing values
data_step2 = data_step1.drop(['assessment_date', 'fuel', 'house_extension', 'market_value_date', 
                              'other_building', 'quality_grade', 'separate_utilities', 'sewer', 'unfinished', 
                              'unit', 'utility'], axis = 1)
print(len(data_step2.columns))
print(data_step2.columns)
print(data_step2.info())

39
Index(['basements', 'building_code_description', 'category_code_description',
       'census_tract', 'central_air', 'depth', 'exempt_building',
       'exempt_land', 'exterior_condition', 'fireplaces', 'frontage',
       'garage_spaces', 'garage_type', 'geographic_ward', 'house_number',
       'interior_condition', 'market_value', 'number_of_bathrooms',
       'number_of_bedrooms', 'number_of_rooms', 'number_stories',
       'parcel_number', 'parcel_shape', 'recording_date', 'sale_date',
       'sale_price', 'site_type', 'street_designation', 'street_direction',
       'taxable_building', 'taxable_land', 'topography', 'total_area',
       'total_livable_area', 'type_heater', 'view_type', 'year_built',
       'year_built_estimate', 'zip_code'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581191 entries, 0 to 581190
Data columns (total 39 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   ----

### Step 3: transform  categorical data

In [6]:
# transform categorical data 
data_tf = data_step2.copy()

# feature 0: basements
data_tf['basements'] = data_step2['basements'].replace({'0':'None','1':'full','2':'full','3':'full','4':'full',
                                                        'A':'full', 'B': 'full','C': 'full','D': 'full','E':'partial',
                                                        'F':'partial','G': 'partial','H': 'partial','I':'full',
                                                        'J':'partial'})
print(data_tf.basements.value_counts(dropna=False))

NaN        254336
full       169102
partial    146574
None        11179
Name: basements, dtype: int64


In [7]:
# feature 1: building_code_description
m = ['MA','ST','ME','VA']
n = ['MASON','STONE','METAL','VACANT'] 
for i in range(len(m)):
    data_tf.loc[data_tf['building_code_description'].str.contains(m[i],na=False),'building_code_description'] = n[i]
data_tf.loc[(~data_tf.building_code_description.isin(n))&(data_tf.building_code_description.notnull()),
            'building_code_description'] = 'MASON'
print(data_tf.building_code_description.value_counts(dropna=False))

MASON     512256
VACANT     37054
STONE      26141
METAL       5727
NaN           13
Name: building_code_description, dtype: int64


In [8]:
# feature 2: category_code_description
print(data_tf.category_code_description.value_counts(dropna=False))

Single Family    462056
Vacant Land       43845
Multi Family      42484
Mixed Use         14347
Commercial        14004
Industrial         4315
NaN                 140
Name: category_code_description, dtype: int64


In [9]:
# feature 3: census_tract, can only run once
data_tf['census_tract'].fillna(0, inplace=True)
data_tf['census_tract'] = data_tf['census_tract'].astype('int')
data_tf.loc[(data_tf['census_tract']>0)&(data_tf['census_tract']<=300), 'census_tract'] = 1
data_tf.loc[(data_tf['census_tract']>=301)&(data_tf['census_tract']<=600),'census_tract'] = 2
data_tf.loc[data_tf['census_tract']>=601, 'census_tract'] = 3
data_tf['census_tract'] = data_tf['census_tract'].replace({0:np.NaN,1:'A',2:'B',3:'C'})

In [10]:
print(data_tf.census_tract.value_counts(dropna=False))

A      433021
B      147212
C         917
NaN        41
Name: census_tract, dtype: int64


In [11]:
# feature 4: central_air
data_tf['central_air'] = data_step2['central_air'].replace({'0': 'N', '1': 'Y','Y': 'Y', 'N': 'N'})
print(data_tf.central_air.value_counts(dropna=False))

NaN    293886
N      174067
Y      113238
Name: central_air, dtype: int64


In [12]:
# feature 8: exterior_condition
data_tf['exterior_condition'] = data_step2['exterior_condition'].replace({0:'none',1:'new',2:'rehabilitated',
                                                                          3:'above average',4:'rehabilitated',5:'average', 
                                                                          6: 'below average',7:'vacant',8:'sealed',
                                                                          9:'compromised'})
print(data_tf.exterior_condition.value_counts(dropna=False))

rehabilitated    450108
above average     43662
NaN               27655
none              19411
average           17679
new               13770
vacant             5702
below average      3204
Name: exterior_condition, dtype: int64


In [13]:
# feature 12: garage_type
data_tf['garage_type'] = data_step2['garage_type'].replace({'A':'attached', 'B':'attached','C':'detached',
                                                            'F':'attached', 'S':'attached', 'T':'detached',
                                                            '0':'none', '0.0':'none', 0.0:'none',
                                                            '1':'attached', '1.0':'attached', 1.0:'attached',
                                                            '2':'attached', '2.0':'attached', 2.0:'attached',
                                                            '3':'attached', '3.0':'attached', 3.0:'attached'})
print(data_tf.garage_type.value_counts(dropna=False))

none        305799
attached    176718
NaN          79814
detached     18860
Name: garage_type, dtype: int64


In [14]:
# feature 13: geographic_ward, can only run once
data_tf['geographic_ward'].fillna(0, inplace=True)
data_tf['geographic_ward'] = data_tf['geographic_ward'].astype('int')
data_tf.loc[(data_tf['geographic_ward']>0)&(data_tf['geographic_ward']<=30), 'geographic_ward'] = 1
data_tf.loc[(data_tf['geographic_ward']>=31)&(data_tf['geographic_ward']<=50),'geographic_ward'] = 2
data_tf.loc[data_tf['geographic_ward']>=51, 'geographic_ward'] = 3
data_tf['geographic_ward'] = data_tf['geographic_ward'].replace({0:np.NaN,1:'A',2:'B',3:'C'})

In [15]:
data_tf.geographic_ward.value_counts(dropna=False)

A      241479
B      193817
C      145854
NaN        41
Name: geographic_ward, dtype: int64

In [16]:
# feature 14: house_number
data_tf['house_number'] = np.where(data_step2['house_number']%2==0, 'South or West', 'North or East')

In [17]:
data_tf.house_number.value_counts(dropna=False)

South or West    293063
North or East    288128
Name: house_number, dtype: int64

In [18]:
# feature 15: interior_condition
data_tf['interior_condition'] = data_step2['interior_condition'].replace({0:'none',1:'new',2:'new',
                                                                          3:'above average',4:'average', 
                                                                          5: 'below average',6:'vacant',7:'sealed'})
print(data_tf.exterior_condition.value_counts(dropna=False))

rehabilitated    450108
above average     43662
NaN               27655
none              19411
average           17679
new               13770
vacant             5702
below average      3204
Name: exterior_condition, dtype: int64


In [19]:
# feature 21: parcel_number, can run only once
data_tf['parcel_number'].fillna(0, inplace=True)
data_tf['parcel_number'] = data_tf['parcel_number'].astype('int')
data_tf.loc[(data_tf['parcel_number']>0)&(data_tf['parcel_number']<=4e8),'parcel_number'] = 1
data_tf.loc[(data_tf['parcel_number']>=4e8+1)&(data_tf['parcel_number']<=7e8),'parcel_number'] = 2
data_tf.loc[data_tf['parcel_number']>=7e8+1,'parcel_number'] = 3
data_tf['parcel_number'] = data_tf['parcel_number'].replace({0:np.NaN,1:'A',2:'B',3:'C'})

In [20]:
print(data_tf.parcel_number.value_counts(dropna=False))

A    279344
B    220468
C     81379
Name: parcel_number, dtype: int64


In [21]:
# feature 22: parcel_shape
data_tf['parcel_shape'] = data_step2['parcel_shape'].replace({' ':np.NaN})
print(data_tf.parcel_shape.value_counts(dropna=False))

E      528235
A       38513
NaN      6590
B        6309
C        1478
D          66
Name: parcel_shape, dtype: int64


In [22]:
# feature 23: recording_date, can only run once
data_tf['recording_date'] = pd.to_datetime(data_tf['recording_date'], errors='coerce')
data_tf['recording_date'] = data_tf['recording_date'] - data_tf['recording_date'].min()
data_tf['recording_date'].fillna(pd.Timedelta(seconds=0), inplace=True)
data_tf['recording_date'] = data_tf['recording_date'].astype('str').apply(lambda x:x[:-5]).astype('int64')
data_tf.loc[data_tf['recording_date']==0,'recording_date'] = np.NaN

In [24]:
# feature 24: sale_date, can only run once
data_tf['sale_date'] 

0        2020-12-10
1        2020-12-10
2        2020-12-10
3        2020-12-10
4        2020-12-10
            ...    
581186   1943-01-01
581187   1943-01-01
581188   1943-01-01
581189   1943-01-01
581190   1943-01-01
Name: sale_date, Length: 581191, dtype: datetime64[ns]

In [25]:
# feature 26: site_type
data_tf['site_type'].value_counts(dropna=False)

NaN    285033
A      257318
B       37545
D         638
C         390
E         204
G          33
F          30
Name: site_type, dtype: int64

In [26]:
# feaure 27: street_designation
data_tf['street_designation'].value_counts(dropna=False)

ST      426727
AVE      91130
RD       31076
LA        8201
DR        5920
PL        5719
BLV       2765
SQ        2417
TER       1614
WAY       1353
CT        1204
CIR        805
LN         609
PK         583
PKY        377
PLZ        168
BLVD       145
MEW         79
ALY         74
WLK         67
ML          47
PIKE        31
HTS         24
PTH         12
PKWY        10
ROW          9
MEWS         7
WALK         6
NaN          5
PATH         3
MALL         3
EXP          1
Name: street_designation, dtype: int64

In [27]:
# feature 28: street_direction   
data_tf['street_direction'].value_counts(dropna=False)

NaN    354729
N       96431
S       58525
W       37234
E       34272
Name: street_direction, dtype: int64

In [28]:
# feature 31: topography      
data_tf['topography'] = data_step2['topography'].replace({'0':'None'})
data_tf['topography'].value_counts(dropna=False)

F       506514
NaN      38882
A        30412
E         4816
B          256
C          174
D          114
None        23
Name: topography, dtype: int64

In [29]:
# feature 34: type_heater  
data_tf['type_heater'] = data_step2['type_heater'].replace({'0':'None'})
data_tf['type_heater'].value_counts(dropna=False)

NaN     285933
H       125267
A        97967
B        60880
G         4916
C         2979
None      1907
E          755
D          587
Name: type_heater, dtype: int64

In [30]:
# feature 35: view_type 
data_tf['view_type'] = data_step2['view_type'].replace({'0':'None'})
data_tf['view_type'].value_counts(dropna=False)

I       521733
NaN      20996
A        15277
C         7613
None      4965
D         4018
H         2738
E         2081
B         1770
Name: view_type, dtype: int64

In [31]:
# feature 36: year_built  
data_tf['year_built'] = data_tf['year_built'].replace({'196Y':'1965',0:np.NaN})
data_tf['year_built'] = 2022 - data_tf['year_built'].astype(float)

In [32]:
# feature 37: year_built_estimate
data_tf['year_built_estimate'] = data_step2['year_built_estimate'].replace({'0':'N'})
data_tf['year_built_estimate'].value_counts(dropna=False)

Y      438150
NaN    142747
N         294
Name: year_built_estimate, dtype: int64

In [33]:
# feature 38: zip_code, can only run once
data_tf['zip_code'].fillna(0, inplace=True)
data_tf['zip_code'] = data_tf['zip_code'].astype('int')
data_tf.loc[(data_tf['zip_code']>0)&(data_tf['zip_code']<=19120),'zip_code'] = 1
data_tf.loc[(data_tf['zip_code']>=19121)&(data_tf['zip_code']<=19140),'zip_code'] = 2
data_tf.loc[data_tf['zip_code']>=19141,'zip_code'] = 3
data_tf['zip_code'] = data_tf['zip_code'].replace({0:np.NaN,1:'A',2:'B',3:'C'})

In [34]:
data_tf['zip_code'].value_counts(dropna=False)

B      259902
C      202936
A      118314
NaN        39
Name: zip_code, dtype: int64

In [38]:
data_tf

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,taxable_building,taxable_land,topography,total_area,total_livable_area,type_heater,view_type,year_built,year_built_estimate,zip_code
0,,VACANT,,A,,,100.0,0.0,,,...,0.0,0.0,F,,,,I,,,C
1,,VACANT,Multi Family,A,,,29400.0,0.0,,,...,0.0,0.0,F,,,,I,,,C
2,,VACANT,,A,,,14200.0,0.0,,,...,0.0,0.0,F,,,,I,,,C
3,,VACANT,Vacant Land,A,,,0.0,5200.0,,,...,0.0,0.0,F,681.00,,,I,,,C
4,,VACANT,Vacant Land,A,,,0.0,21000.0,,,...,0.0,0.0,F,712.00,,,I,,,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581186,,MASON,Commercial,A,,325.0,5186505.0,774995.0,rehabilitated,0.0,...,0.0,0.0,F,55250.00,52020.0,,I,85.0,,B
581187,,MASON,Commercial,A,,112.5,579681.0,86619.0,rehabilitated,0.0,...,0.0,0.0,F,15750.00,5757.0,,I,61.0,,B
581188,,VACANT,Vacant Land,A,,115.0,0.0,1397000.0,,0.0,...,0.0,0.0,F,52420.45,0.0,,I,,,A
581189,,MASON,Commercial,A,,250.9,8093262.0,1209338.0,rehabilitated,0.0,...,0.0,0.0,F,115138.01,78053.0,,I,174.0,,C


In [23]:

# transform record date/ sale date to year/month 
# calculate difference between 'recording_date' and 'sale_date', add columns names "Sale_Record_dayDiff"
# data_transform['year'] = data_transform.DatetimeIndex(data_transform['ArrivalDate']).year/month
StartDate_Sale = data_transform['sale_date']
EndDate_Record = data_transform['recording_date']
dateDiff = pd.to_datetime(EndDate_Record, errors='coerce') - pd.to_datetime(StartDate_Sale, errors='coerce')
data_transform['Sale_Record_dayDiff'] = dateDiff.dt.days     # transform to float64
# drop two date column 
data_transform = data_transform.drop(['sale_date','recording_date'],axis=1)

print(len(data_transform.columns))
data_transform

41


Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,total_area,total_livable_area,type_heater,view_type,year_built_estimate,zip_code,lat,lng,Sale_Record_dayDiff,Built_years
0,,VACANT LAND RESIDE < ACRE,,32.0,,,100.0,0.0,,,...,,,,I,,19146.0,-75.185501,39.934553,371.0,
1,,VACANT LAND RESIDE < ACRE,Multi Family,32.0,,,29400.0,0.0,,,...,,,,I,,19146.0,-75.185177,39.934722,371.0,
2,,VACANT LAND RESIDE < ACRE,,32.0,,,14200.0,0.0,,,...,,,,I,,19146.0,-75.185423,39.934910,371.0,
3,,VACANT LAND RESIDE < ACRE,Vacant Land,32.0,,,0.0,5200.0,,,...,681.00,,,I,,19146.0,-75.185423,39.934910,371.0,
4,,VACANT LAND RESIDE < ACRE,Vacant Land,32.0,,,0.0,21000.0,,,...,712.00,,,I,,19146.0,-75.185151,39.934839,371.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581186,,SCHOOL 2 STY MASONRY,Commercial,266.0,,325.0,5186505.0,774995.0,4.0,0.0,...,55250.00,52020.0,,I,,19126.0,-75.148028,40.058936,0.0,85.0
581187,,MISC FIRE/POLICE MASONRY,Commercial,169.0,,112.5,579681.0,86619.0,4.0,0.0,...,15750.00,5757.0,,I,,19132.0,-75.174560,39.992045,0.0,61.0
581188,,VACANT LAND RESIDE ACRE+,Vacant Land,257.0,,115.0,0.0,1397000.0,,0.0,...,52420.45,0.0,,I,,19118.0,-75.200310,40.074676,0.0,2022.0
581189,,SCHOOL 2 STY MASONRY,Commercial,260.0,,250.9,8093262.0,1209338.0,4.0,0.0,...,115138.01,78053.0,,I,,19150.0,-75.170305,40.073286,0.0,174.0


In [25]:
# transform infinite data columns 
data_transform =data_transform[~data_transform.isin([np.nan, np.inf, -np.inf]).any(1)]

### Split dataset

In [41]:
Data_all = data_transform.copy()

X_data = Data_all.drop(['market_value'], axis=1)
y_data = Data_all.market_value

# divide data into training, validation and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=123)
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)
Data_all.shape

(47498, 41)

### Preprocess using pipeline 

In [28]:
# Create transformer for numeric features
# Fill missing values with median and then scale
numeric_features = ['depth', 'exempt_building', 'exempt_land','taxable_building', 'taxable_land',
                    'fireplaces', 'frontage', 'garage_spaces','geographic_ward','sale_price', 
                    'number_of_bathrooms','number_of_bedrooms', 'number_of_rooms', 'number_stories',
                    'total_area', 'total_livable_area','Sale_Record_dayDiff','Built_years','lat','lng']
numeric_transformer = Pipeline(steps=
                               [("fill_missing", SimpleImputer(strategy="median")), 
                                ("scaler", StandardScaler())
                               ])

# Create transformer for categorical features
# Fill missing values with mode and then one-hot encode
categorical_features = ['basements', 'category_code_description', 'central_air','exterior_condition',
                   'garage_type', 'general_construction', 'house_number','interior_condition',
                   'parcel_number', 'parcel_shape','site_type','view_type',
                   'street_designation','street_direction','topography','type_heater','zip_code','year_built_estimate']
# Convert all to strings
for feat in categorical_features:
    X_train_sub[feat]=X_train_sub[feat].astype(str)
# Create transformer pipeline for categorical data
categorical_transformer = Pipeline(steps=
                                   [("fill_missing",SimpleImputer(strategy='constant', fill_value='missing')),
                                    ("encoder",OneHotEncoder(handle_unknown="ignore"))
                                   ])

# Combine the two transformers into single ColumnTransformer preprocessor
preprocessor = ColumnTransformer(transformers=
                                 [("num", numeric_transformer, numeric_features),
                                  ("cat", categorical_transformer, categorical_features)])


In [29]:
# Create pipeline with preprocessor and model
model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", LinearRegression())])

# Fit the pipeline on the training data
model_pipeline.fit(X_train_sub, y_train_sub)

# Use the pipeline to get predictions on test set and evaluate
val_preds = model_pipeline.predict(X_val)
r2 = r2_score(y_val,val_preds)
print("R-squared on test set: {:.3f}".format(r2))

accuracy = model_pipeline.score(X_val,y_val)
print("Accuracy on validation set: {:.3f}".format(accuracy))

MSE = mean_squared_error(y_val,val_preds)
print("MSE on validation set: {:.3f}".format(accuracy))

R-squared on test set: 1.000
Accuracy on validation set: 1.000
MSE on validation set: 1.000


### Model selection 

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor


clfs = []
clfs.append(LinearRegression())
clfs.append(Lasso())
clfs.append(Ridge())
clfs.append(ElasticNet())
clfs.append(DecisionTreeRegressor())
clfs.append(RandomForestRegressor())
clfs.append(KNeighborsRegressor())
clfs.append(GradientBoostingRegressor())


for classifier in clfs:
    model_pipeline.set_params(model = classifier)
    # cross-val_score on X_train, y_train
    model_pipeline.fit(X_train_sub, y_train_sub)
    accuracy = model_pipeline.score(X_val, y_val)

    print('---------------------------------')
    print(str(classifier))
    print('Accuracy for this classifier is', accuracy)

---------------------------------
LinearRegression()
Accuracy for this classifier is 0.9999999678646587
---------------------------------
Lasso()
Accuracy for this classifier is 0.9999999998655403
---------------------------------
Ridge()
Accuracy for this classifier is 0.9999981106891036
---------------------------------
ElasticNet()
Accuracy for this classifier is 0.9466641205467486
---------------------------------
DecisionTreeRegressor()
Accuracy for this classifier is 0.9920539679782204
---------------------------------
RandomForestRegressor()
Accuracy for this classifier is 0.9951284748709962
---------------------------------
KNeighborsRegressor()
Accuracy for this classifier is 0.9442350520157171
---------------------------------
GradientBoostingRegressor()
Accuracy for this classifier is 0.9955149799221633
