In [1]:
# import packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import pickle
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

**Find the original data here:**
https://opendata-downloads.s3.amazonaws.com/opa_properties_public.csv

**Find the data codebook here**:
https://metadata.phila.gov/#home/datasetdetails/5543865f20583086178c4ee5/representationdetails/55d624fdad35c7e854cb21a4/?view_287_page=1

In [2]:
# read original data
df = pd.read_csv('opa_properties_public.csv', header=0)
print(df.shape)
print(df.columns)
df.head()

(581414, 78)
Index(['objectid', 'assessment_date', 'basements', 'beginning_point',
       'book_and_page', 'building_code', 'building_code_description',
       'category_code', 'category_code_description', 'census_tract',
       'central_air', 'cross_reference', 'date_exterior_condition', 'depth',
       'exempt_building', 'exempt_land', 'exterior_condition', 'fireplaces',
       'frontage', 'fuel', 'garage_spaces', 'garage_type',
       'general_construction', 'geographic_ward', 'homestead_exemption',
       'house_extension', 'house_number', 'interior_condition', 'location',
       'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
       'mailing_city_state', 'mailing_street', 'mailing_zip', 'market_value',
       'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms',
       'number_of_rooms', 'number_stories', 'off_street_open',
       'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape',
       'quality_grade', 'recording_date', 'registry_nu

Unnamed: 0,objectid,assessment_date,basements,beginning_point,book_and_page,building_code,building_code_description,category_code,category_code_description,census_tract,central_air,cross_reference,date_exterior_condition,depth,exempt_building,exempt_land,exterior_condition,fireplaces,frontage,fuel,garage_spaces,garage_type,general_construction,geographic_ward,homestead_exemption,house_extension,house_number,interior_condition,location,mailing_address_1,mailing_address_2,mailing_care_of,mailing_city_state,mailing_street,mailing_zip,market_value,market_value_date,number_of_bathrooms,number_of_bedrooms,number_of_rooms,number_stories,off_street_open,other_building,owner_1,owner_2,parcel_number,parcel_shape,quality_grade,recording_date,registry_number,sale_date,sale_price,separate_utilities,sewer,site_type,state_code,street_code,street_designation,street_direction,street_name,suffix,taxable_building,taxable_land,topography,total_area,total_livable_area,type_heater,unfinished,unit,utility,view_type,year_built,year_built_estimate,zip_code,zoning,pin,lat,lng
0,78579872,2021-07-05 09:29:19,A,NWC OF 24TH & OXFORD ST,53912171.0,SR,VACANT LAND RESIDE < ACRE,1,Single Family,139.0,Y,,,94.0,188045.0,0.0,1.0,,16.0,A,,,B,29.0,0.0,,2334,1.0,2334 W OXFORD ST,,,,PHILADELPHIA PA,,19121.0,263000.0,,,3.0,10.0,1.0,,,BONDS ISHAKEA,,291113044,E,,2021-12-08 00:00:00,012N110470,2021-08-06 00:00:00,263000.0,,,,PA,62120,ST,W,OXFORD,,0.0,74955.0,F,1542.0,1242.0,A,,,,,2019.0,,19121.0,RSA5,1001680150,-75.17421,39.979126
1,78579873,2021-05-25 16:02:19,0,NEC LOCUST,53912526.0,590,RES CONDO 5+ STY MASONRY,1,Single Family,8.0,Y,,,0.0,0.0,0.0,3.0,0.0,0.0,,0.0,,A,8.0,0.0,29.0,219,3.0,219-29 S 18TH ST,,,,PHILADELPHIA PA,812 W SEDGWICK ST,19119.0,309800.0,,0.0,0.0,,1.0,636.0,,CEISLER LAWRENCE A,HARTOCOLLIS LINA,888092908,E,B,2021-12-08 00:00:00,002S200670,2021-07-16 00:00:00,272500.0,,,,PA,88140,ST,S,18TH,,278820.0,30980.0,F,0.0,524.0,,,1417.0,,C,1900.0,Y,19103.0,RMX3,1001629392,-75.170423,39.949254
2,78579874,2022-02-17 13:44:33,,"88'2 7/8"" E SEPVIVA",53911650.0,WA0,HSE WORSHIP ALL 1 STY MAS,1,Single Family,160.0,Y,,,50.0,488400.0,0.0,1.0,,,A,,,B,31.0,0.0,,2216,1.0,2216 E SUSQUEHANNA AVE,,,,,,,660000.0,,2.0,3.0,,3.0,,,GRANATO JOHN J III,GRANATO NATHALIE DUMONT,313034105,E,,2021-12-07 00:00:00,018N160291,2021-07-06 00:00:00,655000.0,,,,,75660,AVE,E,SUSQUEHANNA,,0.0,171600.0,F,800.0,2499.0,A,,,,I,2020.0,,19125.0,RSA5,1001678063,-75.128893,39.978672
3,78579875,2021-10-12 15:40:41,A,"476' 1 1/4"" S WIGARD",53911139.0,JJ1,AMUSE TV/RADIO STA MAS+O,1,Single Family,216.0,Y,,,109.0,489118.0,81282.0,1.0,0.0,80.0,A,,,A,21.0,,,7354,1.0,7354 RIDGE AVE,,,,,,,570400.0,,2.0,3.0,6.0,2.0,,,ZHENG YINDA,WANG ANGI,212514436,A,,2021-12-06 00:00:00,130N210107,2021-08-10 00:00:00,605950.0,,,,,68280,AVE,,RIDGE,,0.0,0.0,F,8219.0,2716.0,A,,15.0,,I,2021.0,,19128.0,RSD3,1001677510,-75.235043,40.045622
4,78579876,,,SWC REED ST,53764990.0,SR,VACANT LAND RESIDE < ACRE,14,,32.0,,,,,,,,,,,,,,36.0,,,1413,,1413L S TAYLOR ST,100 W OXFORD ST E-2300,,,PHILADELPHIA PA,,19122.0,,,,,,,2824.0,,MAMIE NICHOLS LIMITED PARTERSHIP,,886000014,E,,2021-12-02 00:00:00,10S23 37,2020-12-10 00:00:00,1.0,,,,PA,76780,ST,S,TAYLOR,L,,,F,,,,,,,I,,,19146.0,RSA5,1001681386,-75.185151,39.934839


### Step 1: prescreen related columns and rows 

**1.1 remove clearly unrelated columns**

**Columns kept after step 1.1**: 'assessment_date', 'basements', 'category_code_description','census_tract', 'central_air',
                 'depth', 'exterior_condition', 'fireplaces', 'frontage', 'fuel', 
                 'garage_spaces', 'garage_type', 'geographic_ward', 'house_number', 'interior_condition', 
                 'market_value', 'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms',
                 'number_stories', 'quality_grade', 'sale_date', 'sale_price', 'separate_utilities',
                 'sewer', 'site_type', 'street_designation','street_direction', 'topography', 
                 'total_area','total_livable_area', 'type_heater', 'unfinished', 'unit', 
                 'utility', 'view_type', 'year_built', 'zip_code', 'lat', 
                 'lng'

In [3]:
# place columns of interest into a list
cols_step11 = ['assessment_date', 'basements', 'category_code_description','census_tract', 'central_air',
              'depth', 'exterior_condition', 'fireplaces', 'frontage', 'fuel', 
              'garage_spaces', 'garage_type', 'geographic_ward', 'house_number', 'interior_condition', 
              'market_value', 'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms',
              'number_stories', 'quality_grade', 'sale_date', 'sale_price', 'separate_utilities',
              'sewer', 'site_type', 'street_designation','street_direction', 'topography', 
              'total_area','total_livable_area', 'type_heater', 'unfinished', 'unit', 
              'utility', 'view_type', 'year_built', 'zip_code', 'lat', 
              'lng']
len(cols_step11)
data_step11 = df[cols_step11]

**1.2 drop columns with more than 50% missing values**

In [4]:
# compute percentage of missing values for each column
data_step11.isnull().sum()/len(data_step11)

assessment_date              0.941171
basements                    0.438185
category_code_description    0.000193
census_tract                 0.000189
central_air                  0.506192
depth                        0.001089
exterior_condition           0.047782
fireplaces                   0.007786
frontage                     0.001097
fuel                         0.974443
garage_spaces                0.008084
garage_type                  0.137986
geographic_ward              0.000189
house_number                 0.000000
interior_condition           0.049151
market_value                 0.000423
market_value_date            1.000000
number_of_bathrooms          0.007585
number_of_bedrooms           0.007059
number_of_rooms              0.057656
number_stories               0.007053
quality_grade                0.903150
sale_date                    0.000014
sale_price                   0.000026
separate_utilities           0.955835
sewer                        0.984617
site_type   

In [5]:
# drop columns with over 50% missing values
data_step12 = data_step11.drop(['assessment_date', 'fuel', 'market_value_date', 'quality_grade', 'separate_utilities', 
                              'sewer', 'street_direction', 'unfinished', 'unit', 'utility'], axis = 1)
print(len(data_step12.columns))
print(data_step12.columns)
print(data_step12.info())

31
Index(['basements', 'category_code_description', 'census_tract', 'central_air',
       'depth', 'exterior_condition', 'fireplaces', 'frontage',
       'garage_spaces', 'garage_type', 'geographic_ward', 'house_number',
       'interior_condition', 'market_value', 'number_of_bathrooms',
       'number_of_bedrooms', 'number_of_rooms', 'number_stories', 'sale_date',
       'sale_price', 'site_type', 'street_designation', 'topography',
       'total_area', 'total_livable_area', 'type_heater', 'view_type',
       'year_built', 'zip_code', 'lat', 'lng'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581414 entries, 0 to 581413
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   basements                  326647 non-null  object 
 1   category_code_description  581302 non-null  object 
 2   census_tract               581304 non-null  float64
 3   central_air         

**1.3 drop rows where sale_date or sale_price is missing**

In [6]:
data_step13 = data_step12[data_step12['sale_date'].notna()]
data_step13 = data_step13[data_step13['sale_price'].notna()]

**1.4 read in data whose 'category_code_description' is 'Multi Family' or 'Single Family'**

In [7]:
data_step14 = data_step13.query('category_code_description == "Multi Family"|category_code_description == "Single Family"')
print(data_step14.shape)

(504489, 31)


In [8]:
data_step1_final = data_step14
print(data_step1_final.shape)

(504489, 31)


### Step 2: recode categorical data

**2 replace and combine category**

In [9]:
data_step2 = data_step1_final

In [10]:
# feature 0: basements
data_step2['basements'] = data_step1_final['basements'].replace({'0':'None','1':'full','2':'full','3':'full','4':'full',
                                                        'A':'full', 'B': 'full','C': 'full','D': 'full','E':'partial',
                                                        'F':'partial','G':'partial','H':'partial','I':'full',
                                                        'J':'partial'})
print(data_step2['basements'].value_counts(dropna=False))

NaN        185650
full       162826
partial    145547
None        10466
Name: basements, dtype: int64


In [11]:
# feature 1: category_code_description
print(data_step2.category_code_description.value_counts(dropna=False))

Single Family    461994
Multi Family      42495
Name: category_code_description, dtype: int64


In [12]:
# feature 3: central_air
data_step2['central_air'] = data_step1_final['central_air'].replace({'0': 'N', '1': 'Y','Y': 'Y', 'N': 'N'})
print(data_step2['central_air'].value_counts(dropna=False))

NaN    224789
N      168264
Y      111436
Name: central_air, dtype: int64


In [13]:
# feature 5: exterior_condition
data_step2['exterior_condition'] = data_step1_final['exterior_condition'].replace({0:'none',1:'new',2:'rehabbed',
                                                                          3:'above average',4:'rehabbed',5:'average', 
                                                                          6: 'below average',7:'vacant',8:'sealed',
                                                                          9:'compromised'})
print(data_step2.exterior_condition.value_counts(dropna=False))

rehabbed         426451
above average     41574
average           14782
new               13224
vacant             5064
below average      2587
NaN                 436
none                371
Name: exterior_condition, dtype: int64


In [14]:
# feature 9: garage_type
data_step2['garage_type'] = data_step1_final['garage_type'].replace({'A':'builtin', 'B':'attached','C':'detached',
                                                            'F':'converted', 'S':'selfpark', 'T':'attendant',
                                                            '0':'none', '0.0':'none', 0.0:'none',
                                                            '1':'builtin', '1.0':'builtin', 1.0:'builtin',
                                                            '2':'attached', '2.0':'attached', 2.0:'attached',
                                                            '3':'detached', '3.0':'detached', 3.0:'detached'})
print(data_step2.garage_type.value_counts(dropna=False))

none         276861
builtin      140019
NaN           34752
converted     22774
detached      18369
attached       8434
selfpark       2575
attendant       705
Name: garage_type, dtype: int64


In [15]:
# feature 11: house_number
data_step2['house_number'] = np.where(data_step2['house_number']%2==0, 'South or West', 'North or East')

In [16]:
data_step2.house_number.value_counts(dropna=False)

South or West    254510
North or East    249979
Name: house_number, dtype: int64

In [17]:
# feature 12: interior_condition
data_step2['interior_condition'] = data_step1_final['interior_condition'].replace({0:'none',1:'new',2:'new',
                                                                          3:'above average',4:'average', 
                                                                          5:'below average',6:'vacant',7:'sealed'})
print(data_step2.exterior_condition.value_counts(dropna=False))

rehabbed         426451
above average     41574
average           14782
new               13224
vacant             5064
below average      2587
NaN                 436
none                371
Name: exterior_condition, dtype: int64


In [18]:
# feature 20: site_type
data_step2['site_type'].value_counts(dropna=False)

NaN    243195
A      235162
B       25477
D         380
C         246
E          37
G          33
F           4
Name: site_type, dtype: int64

In [20]:
# feaure 21: street_designation

data_step2['street_designation'] = data_step1_final['street_designation'].astype("string")
data_step2['street_designation'] = np.where(data_step1_final['street_designation'].str.match("ST"), "ST",
                                   np.where(data_step1_final['street_designation'].str.match("AVE"), "AVE",
                                   np.where(data_step1_final['street_designation'].str.match("RD"), "RD",
                                   np.where(data_step1_final['street_designation'].str.match("LA"), "LA",
                                   np.where(data_step1_final['street_designation'].str.match("DR"), "DR",
                                   np.where(data_step1_final['street_designation'].str.match("PL"), "PL",
                                   np.where(data_step1_final['street_designation'].str.match("SQ"), "SQ","Other")))))))


data_step2['street_designation'].value_counts(dropna=False)

ST       371808
AVE       73012
RD        29498
Other      8606
LA         7600
PL         5819
DR         5777
SQ         2369
Name: street_designation, dtype: int64

In [21]:
# feature 22: topography      
data_step2['topography'] = data_step1_final['topography'].replace({'0':'None'})
data_step2['topography'].value_counts(dropna=False)

F       437959
NaN      33401
A        28190
E         4442
B          235
C          158
D          103
None         1
Name: topography, dtype: int64

In [22]:
# feature 25: type_heater  
data_step2['type_heater'] = data_step1_final['type_heater'].replace({'0':'None'})
data_step2['type_heater'].value_counts(dropna=False)

NaN     218424
H       119560
A        96591
B        59212
G         4733
C         2913
None      1764
E          727
D          565
Name: type_heater, dtype: int64

In [23]:
# feature 26: view_type 
data_step2['view_type'] = data_step1_final['view_type'].replace({'0':'None'})
data_step2['view_type'].value_counts(dropna=False)

I       467576
A        14667
C         7514
None      3634
H         2658
D         2530
NaN       2255
E         1911
B         1744
Name: view_type, dtype: int64

In [24]:
data_step2_final = data_step2
print(data_step2_final.shape)

(504489, 31)


### Step 3: recode numerical data

**3 transform numerical data**

In [25]:
data_step3 = data_step2_final

In [26]:
# sale_date 
data_step3['sale_date'] = pd.to_datetime(data_step2_final['sale_date'])
data_step3['sale_year'] = data_step2_final['sale_date'].dt.year
data_step3['sale_month'] = data_step2_final['sale_date'].dt.month
data_step3['sale_week'] = data_step2_final['sale_date'].dt.isocalendar().week
data_step3['sale_week'] = data_step2_final['sale_week'].astype(int)
data_step3['sale_day'] = data_step2_final['sale_date'].dt.day
data_step3['sale_dow'] = data_step2_final['sale_date'].dt.dayofweek
data_step3 = data_step3.drop(['sale_date'], axis=1)

In [27]:
# year_built
data_step3['year_built'] = data_step2_final['year_built'].replace('196Y', np.NaN)
data_step3["year_built"] = pd.to_numeric(data_step3["year_built"])

In [28]:
data_step3_final = data_step3
print(data_step3_final.shape)

(504489, 35)


### Step 4:  drop extreme high sale price

**4 drop rows with sales price of greater than $5 million**

In [29]:
data_step4 = data_step3_final[(data_step3_final['sale_price']>1000)&(data_step3_final['sale_price']<5000000)]

In [30]:
data_step4_final = data_step4
print(data_step4_final.shape)

(358226, 35)


### Step 5: split into training, validation and test set

**5 split the data into training and test sets:   
The training and validation dataset contains all rows with house sold during the years of 2010 - 2019.    
The test dataset contains all rows with house sold during 2020 and 2021.**

In [31]:
# first subset the data only to 2010 - 2021
data_step5 = data_step4_final[data_step4_final['sale_year'] > 2009]
data_step5.shape

(180208, 35)

In [32]:
# get training plus validation set 
X_train = data_step5[data_step5['sale_year'] < 2020].drop(['sale_price'],axis=1)
y_train = data_step5.loc[X_train.index, 'sale_price']
print(X_train.shape)
print(y_train.shape)

(150594, 34)
(150594,)


In [33]:
# get test set 
X_test = data_step5[data_step5['sale_year'] > 2019].drop(['sale_price'],axis=1)
y_test = data_step5.loc[X_test.index, 'sale_price']
print(X_test.shape)
print(y_test.shape)

(29614, 34)
(29614,)


In [34]:
# get training and validation set
X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)
print(X_train_train.shape)
print(X_train_val.shape)
print(y_train_train.shape)
print(y_train_val.shape)

(120475, 34)
(30119, 34)
(120475,)
(30119,)


In [35]:
data_step5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180208 entries, 0 to 580978
Data columns (total 35 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   basements                  121072 non-null  object 
 1   category_code_description  180208 non-null  object 
 2   census_tract               180199 non-null  float64
 3   central_air                115196 non-null  object 
 4   depth                      179997 non-null  float64
 5   exterior_condition         180026 non-null  object 
 6   fireplaces                 179710 non-null  float64
 7   frontage                   179991 non-null  float64
 8   garage_spaces              179517 non-null  float64
 9   garage_type                156099 non-null  object 
 10  geographic_ward            180199 non-null  float64
 11  house_number               180208 non-null  object 
 12  interior_condition         179972 non-null  object 
 13  market_value               18

### Step 6: process training data 

**for training data:\
categorical features: fill in mode then one hot encode\
numercial features: fill in median then standardize**

In [36]:
# for categorical data, fill missing values with mode and then one-hot encode
cat_features = ['basements', 'category_code_description', 'central_air', 'exterior_condition', 'garage_type', 
                'house_number','interior_condition', 'site_type','street_designation', 'topography', 
                'type_heater', 'view_type','street_designation']
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore',sparse=False))
])

# for numerical data, fill missing values with median and then scale
num_features = ['census_tract', 'depth', 'fireplaces', 'frontage', 'garage_spaces',
                'geographic_ward', 'market_value', 'number_of_bathrooms','number_of_bedrooms', 'number_of_rooms',
                'number_stories', 'total_area', 'total_livable_area','year_built', 'zip_code',
                'lat', 'lng','sale_year','sale_month','sale_week',
                'sale_day','sale_dow']
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')), 
    ('scaler', StandardScaler())
])

# Combine two transformers into single ColumnTransformer preprocessor
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, cat_features),
    ('num', num_pipe, num_features)
])

In [37]:
# use pipeline to fit training data
preprocessor_train = preprocessor.fit(X_train_train)

# prepare column names
cat_columns = preprocessor.named_transformers_['cat']['encoder'].get_feature_names(cat_features)
columns = np.append(cat_columns, num_features)

# preprocess training data
X_train_train_final = pd.DataFrame(preprocessor_train.transform(X_train_train), index=X_train_train.index, columns=columns)
y_train_train_final = y_train_train.fillna(y_train_train.median())

### Step 7: process validation and test data 

**use statistical traits in training data to process validation and test data**

In [38]:
# preprocess validation data
X_train_val_final = pd.DataFrame(preprocessor_train.transform(X_train_val), index=X_train_val.index, columns=columns)
y_train_val_final = y_train_val.fillna(y_train_train.median())

# preprocess test data
X_test_final = pd.DataFrame(preprocessor_train.transform(X_test), index=X_test.index, columns=columns)
y_test_final = y_test.fillna(y_train_train.median())

In [39]:
print(X_train_train_final.shape)
print(X_train_val_final.shape)
print(X_test_final.shape)
print(y_train_train_final.shape)
print(y_train_val_final.shape)
print(y_test_final.shape)

(120475, 97)
(30119, 97)
(29614, 97)
(120475,)
(30119,)
(29614,)


In [41]:
pickle_out = open("housedat_Xtrain_sub.pickle","wb")
pickle.dump(X_train_train_final, pickle_out)
pickle_out.close()

pickle_out = open("housedat_Xval.pickle","wb")
pickle.dump(X_train_val_final, pickle_out)
pickle_out.close()

pickle_out = open("housedat_Xtest.pickle","wb")
pickle.dump(X_test_final, pickle_out)
pickle_out.close()

pickle_out = open("housedat_ytrain_sub.pickle","wb")
pickle.dump(y_train_train_final, pickle_out)
pickle_out.close()

pickle_out = open("housedat_yval.pickle","wb")
pickle.dump(y_train_val_final, pickle_out)
pickle_out.close()

pickle_out = open("housedat_ytest.pickle","wb")
pickle.dump(y_test_final, pickle_out)
pickle_out.close()

possible predictions datasets:   
https://www.kaggle.com/datasets/harlfoxem/housesalesprediction     
https://github.com/michellesklee/predicting_home_values    