In [1]:
# Import libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Import data
df = pd.read_csv('/Users/ellyatwood/Documents/GEO490/geospatial-data-science/data/lab5/seattle_house_prices.csv')


In [9]:
# Convert DataFrame to GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['long'], df['lat']))
gdf = gdf.set_crs(4326, allow_override=True)

# Reproject everything to UTM 10N (EPSG:32610)
gdf_utm = gdf.to_crs('EPSG:32610')

In [12]:
print (df.count)

<bound method DataFrame.count of          price  bedrooms  bathrooms  sqft_living  sqft_lot  yr_built      lat  \
0       538000         3       2.25         2570      7242      1951  47.7210   
1       180000         2       1.00          770     10000      1933  47.7379   
2       604000         4       3.00         1960      5000      1965  47.5208   
3       510000         3       2.00         1680      8080      1987  47.6168   
4      1230000         4       4.50         5420    101930      2001  47.6561   
...        ...       ...        ...          ...       ...       ...      ...   
19446   475000         3       2.50         1310      1294      2008  47.5773   
19447   360000         3       2.50         1530      1131      2009  47.6993   
19448   400000         4       2.50         2310      5813      2014  47.5107   
19449   400000         3       2.50         1600      2388      2004  47.5345   
19450   325000         2       0.75         1020      1076      2008  47.594

There are 19451 total houses in the dataset, and 9 features for determining house price

In [14]:
print (df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19451 entries, 0 to 19450
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   price        19451 non-null  int64   
 1   bedrooms     19451 non-null  int64   
 2   bathrooms    19451 non-null  float64 
 3   sqft_living  19451 non-null  int64   
 4   sqft_lot     19451 non-null  int64   
 5   yr_built     19451 non-null  int64   
 6   lat          19451 non-null  float64 
 7   long         19451 non-null  float64 
 8   geometry     19451 non-null  geometry
dtypes: float64(3), geometry(1), int64(5)
memory usage: 1.3 MB
None


In [34]:
# Compute correlation matrix
corr_matrix = gdf_utm.corr()

# Display just house value correlations
corr_matrix["price"].sort_values(ascending= False)

price                   1.000000
sqft_living             0.702296
bathrooms               0.524395
bedrooms                0.315804
lat                     0.308082
living_per_lot          0.126975
sqft_lot                0.090125
yr_built                0.052453
long                    0.020092
bedrooms_per_lot       -0.039601
age                    -0.052453
bathrooms_per_living   -0.267342
bedrooms_per_living    -0.479228
Name: price, dtype: float64

The variables that are best

In [28]:
# Bedrooms per lot
gdf_utm['bedrooms_per_lot'] = gdf_utm['bedrooms'] / gdf_utm['sqft_lot']

# Bedrooms per living
gdf_utm['bedrooms_per_living'] = gdf_utm['bedrooms'] / gdf_utm['sqft_living']

# Bathrooms per living
gdf_utm['bathrooms_per_living'] = gdf_utm['bathrooms'] / gdf_utm['sqft_living']

# Sqftliving per lot
gdf_utm['living_per_lot'] = gdf_utm['sqft_living'] / gdf_utm['sqft_lot']

# Age
gdf_utm['age'] = 2022 - gdf_utm['yr_built']

In [29]:
# Compute correlation matrix
corr_matrix = gdf_utm.corr()

# Display just house value correlations
corr_matrix["price"].sort_values(ascending= False)

price                   1.000000
sqft_living             0.702296
bathrooms               0.524395
bedrooms                0.315804
lat                     0.308082
living_per_lot          0.126975
sqft_lot                0.090125
yr_built                0.052453
long                    0.020092
bedrooms_per_lot       -0.039601
age                    -0.052453
bathrooms_per_living   -0.267342
bedrooms_per_living    -0.479228
Name: price, dtype: float64

In [30]:
# Define feature list
feature_list =  ['bedrooms', 'bathrooms', 'sqft_living', 
                 'sqft_lot', 'yr_built', 'lat', 'long', 'bedrooms_per_lot', 
                 'bedrooms_per_living', 'bathrooms_per_living', 'living_per_lot',
                'age']

# Define features and labels 
X = gdf_utm[feature_list]
y = gdf_utm['price']

# Standarize data
scaler = StandardScaler()  
X_scaled = scaler.fit_transform(X)



In [31]:
# Split data 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [32]:
# Define model
forest_reg = RandomForestRegressor(n_estimators = 30)

# Fit model
forest_reg.fit(X_train, y_train)

RandomForestRegressor(n_estimators=30)

In [33]:
# Predict test labels predictions
predictions = forest_reg.predict(X_test)

# Compute mean-squared-error
final_mse = mean_squared_error(y_test , predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

160774.30471727348

## Approach 2

In [36]:
# Split data by prices
gdfsorted = gdf_utm.sort_values(by='price', ascending=False)

In [37]:
expensivehouse = gdfsorted.iloc[0:5000]

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,lat,long,geometry,bedrooms_per_lot,bedrooms_per_living,bathrooms_per_living,living_per_lot,age
6526,7700000,6,8.00,12050,27600,1910,47.6298,-122.323,POINT (550861.285 5275377.370),0.000217,0.000498,0.000664,0.436594,112
3522,7060000,5,4.50,10040,37325,1940,47.6500,-122.214,POINT (559027.375 5277699.601),0.000134,0.000498,0.000448,0.268989,82
8328,6890000,6,7.75,9890,31374,2001,47.6305,-122.240,POINT (557096.070 5275512.941),0.000191,0.000607,0.000784,0.315229,21
3969,5570000,5,5.75,9200,35069,2001,47.6289,-122.233,POINT (557623.710 5275340.298),0.000143,0.000543,0.000625,0.262340,21
1303,5350000,5,5.00,8000,23985,2009,47.6232,-122.220,POINT (558606.750 5274716.555),0.000208,0.000625,0.000625,0.333542,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15030,637000,5,3.00,2460,7240,1991,47.5486,-122.189,POINT (561022.625 5266449.641),0.000691,0.002033,0.001220,0.339779,31
3420,637000,4,3.50,3080,118918,2008,47.7721,-121.924,POINT (580617.178 5291531.153),0.000034,0.001299,0.001136,0.025900,14
4494,637000,4,2.75,2900,5803,2007,47.7368,-122.232,POINT (557579.975 5287332.913),0.000689,0.001379,0.000948,0.499742,15
2699,637000,3,2.00,1980,6000,1958,47.6921,-122.283,POINT (553802.319 5282328.306),0.000500,0.001515,0.001010,0.330000,64
