In [40]:
# Machine learning model to predict median house price based on location (latitude/longitude)

# Load data set
import os

import pandas as pd

# Location of the housing CSV. Set the HOUSING_CSV environment variable to point at a
# different copy of the file; without it the original local path is used unchanged.
HOUSING_CSV = os.environ.get("HOUSING_CSV", "/Users/chrislu/Downloads/housing.csv")
df = pd.read_csv(HOUSING_CSV)


In [41]:
# Set variables: the two coordinate columns are the features,
# the median house value column is the prediction target.
x = df.loc[:, ['latitude', 'longitude']]
y = df.loc[:, 'median_house_value']


In [42]:
from sklearn.model_selection import train_test_split


In [43]:
# Split data set: a test_size of 0.2 holds out a fifth of the rows for testing,
# and the fixed random_state makes the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

In [44]:
# Training model: fit a linear regression on the training split
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=x_train, y=y_train)

# Predictions for both splits, so train and test error can be compared
y_model_train_predict, y_model_test_predict = (
    model.predict(features) for features in (x_train, x_test)
)

In [45]:
# Evaluate accuracy of the model: mean squared error and R2 on each split
from sklearn.metrics import mean_squared_error, r2_score

# Training split
model_train_r2 = r2_score(y_train, y_model_train_predict)
model_train_mse = mean_squared_error(y_train, y_model_train_predict)

# Test split
model_test_r2 = r2_score(y_test, y_model_test_predict)
model_test_mse = mean_squared_error(y_test, y_model_test_predict)

# One line per metric, train first and then test
print(
    f'Model Train MSE {model_train_mse}',
    f'Model Train R2 {model_train_r2}',
    f'Model Test MSE {model_test_mse}',
    f'Model Test R2 {model_test_r2}',
    sep='\n',
)

Model Train MSE 10038564995.605913
Model Train R2 0.24382407387161242
Model Test MSE 10285451646.96813
Model Test R2 0.2366429722282778


In [46]:
# Load the linear-regression metrics into a dataframe: the values start out as a
# single column and are flipped into a single row.
results = pd.DataFrame(
    ['Linear Regression', model_train_mse, model_train_r2, model_test_mse, model_test_r2]
).T
# Name the columns
results.columns = ['Method', 'Train MSE', 'Train R2', 'Test MSE', 'Test R2']
# Display the finished dataframe
results
# Based on these results the model does not do great in predicting median house value

Unnamed: 0,Method,Train MSE,Train R2,Test MSE,Test R2
0,Linear Regression,10038564995.605911,0.243824,10285451646.96813,0.236643


In [47]:
# Train another model: a random forest regressor on the same training split
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): max_depth=2 limits every tree to at most four leaves, which is a very
# coarse partition of a latitude/longitude plane -- consider tuning the depth before
# drawing conclusions about random forests from this run.
rfr = RandomForestRegressor(random_state=100, max_depth=2)
rfr.fit(X=x_train, y=y_train)

# Predictions for both splits
y_rfr_train_predict, y_rfr_test_predict = (
    rfr.predict(features) for features in (x_train, x_test)
)


In [48]:
# Evaluate the random forest with the same metrics used for the linear model

# Training split
rfr_train_r2 = r2_score(y_train, y_rfr_train_predict)
rfr_train_mse = mean_squared_error(y_train, y_rfr_train_predict)

# Test split
rfr_test_r2 = r2_score(y_test, y_rfr_test_predict)
rfr_test_mse = mean_squared_error(y_test, y_rfr_test_predict)

# One line per metric, train first and then test
print(
    f'RFR Train MSE {rfr_train_mse}',
    f'RFR Train R2 {rfr_train_r2}',
    f'RFR Test MSE {rfr_test_mse}',
    f'RFR Test R2 {rfr_test_r2}',
    sep='\n',
)

RFR Train MSE 11180109003.060965
RFR Train R2 0.15783488144904223
RFR Test MSE 11607730655.482422
RFR Test R2 0.13850717727541884


In [49]:

# Load the random-forest metrics into a one-row dataframe that uses the same column
# names as `results`, so the two models can be compared side by side.
# (The method label previously carried a stray leading "L".)
results2 = pd.DataFrame(['Random Forest Regression', rfr_train_mse, rfr_train_r2, rfr_test_mse, rfr_test_r2]).transpose()
results2.columns = ['Method', 'Train MSE', 'Train R2', 'Test MSE', 'Test R2']
results2

Unnamed: 0,Method,Train MSE,Train R2,Test MSE,Test R2
0,LRandom Forest Regression,11180109003.060965,0.157835,11607730655.482422,0.138507


In [50]:
# Neither of these models seemed to work that well, so I may need to look into finding
# a model that better accommodates multiple x variables.
# Another thing I think may be the problem is that latitude and longitude have awkward
# raw values, so I may convert them to a better numerical scale such as a z-score.
# NOTE(review): the scaler below is fit on every row of df, including rows that a later
# train/test split will hold out; fit on the training rows only when the scaled values
# feed a model that is scored on held-out data.

from sklearn.preprocessing import StandardScaler

coordinates = df[['latitude', 'longitude']]

# Initialize a StandardScaler
scaler = StandardScaler()

# Fit the scaler on the coordinates and transform them in one step
scaled_coordinates = scaler.fit_transform(coordinates)

# Create a DataFrame with the scaled data. Reusing the index of `coordinates` keeps the
# scaled rows aligned with df even when df does not have the default 0..n-1 index.
scaled_df = pd.DataFrame(
    scaled_coordinates,
    columns=['scaled_latitude', 'scaled_longitude'],
    index=coordinates.index,
)



In [51]:
# Linear Regression Pt 2 with scaled coordinates

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Set variables: start from the raw coordinates so that scaling happens after the split.
# (x is therefore the unscaled coordinate frame; the scaled features live in
# x_train / x_test below.)
x = df[['latitude', 'longitude']]
y = df['median_house_value']

# Split data set -- same test_size and random_state as the unscaled run, so the same
# rows end up in each split and the two runs are directly comparable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=100)

# Scale features: learn mean/std from the training rows only, then apply that same
# transform to both splits. Fitting the scaler on all rows before splitting would let
# test-set statistics leak into the features the model is trained on.
train_scaler = StandardScaler().fit(x_train)
scaled_columns = ['scaled_latitude', 'scaled_longitude']
x_train = pd.DataFrame(train_scaler.transform(x_train), columns=scaled_columns, index=x_train.index)
x_test = pd.DataFrame(train_scaler.transform(x_test), columns=scaled_columns, index=x_test.index)

# Training model
model = LinearRegression()
model.fit(x_train, y_train)

# Predictions
y_model_train_predict = model.predict(x_train)
y_model_test_predict = model.predict(x_test)

# Evaluate accuracy of model
model_train_mse = mean_squared_error(y_train, y_model_train_predict)
model_train_r2 = r2_score(y_train, y_model_train_predict)

model_test_mse = mean_squared_error(y_test, y_model_test_predict)
model_test_r2 = r2_score(y_test, y_model_test_predict)

print('Model Train MSE', model_train_mse)
print('Model Train R2', model_train_r2)
print('Model Test MSE', model_test_mse)
print('Model Test R2', model_test_r2)

# No change. This is expected: a linear regression with an intercept makes the same
# predictions after its features are standardized (the coefficients simply rescale),
# so z-scoring the coordinates cannot improve MSE or R2 for this model.
# May need to delve deeper into the data.

Model Train MSE 10038564995.605913
Model Train R2 0.24382407387161242
Model Test MSE 10285451646.968134
Model Test R2 0.23664297222827746
