In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree

import pickle

### Prepare Data

In [4]:
# Read the home-value dataset; the relative path is resolved against the
# notebook's working directory.
csv_path = '../cl_Resources/home_value_calc.csv'
df = pd.read_csv(csv_path)

In [5]:
df.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,median_home_value,Per Capita Income,Masters holders,Bachelor holders,Median gross rent,Poverty Count,...,pop_biz,commute time car,house_age,Land-Sq-Mi,zip_code,latitude,longitude,city,state,county
0,1001,17423.0,45.0,56714.0,202800.0,30430.0,1353.0,2016.0,975.0,1462.0,...,461,,49,11.442,1001,42.140549,-72.788661,Agawam,MA,Hampden
1,1002,29970.0,23.2,48923.0,344000.0,26072.0,2956.0,3161.0,1206.0,8351.0,...,480,188755.0,47,55.043,1002,42.367092,-72.464571,Amherst,MA,Hampshire
2,1003,11296.0,19.9,2499.0,-666666666.0,3829.0,10.0,3.0,1150.0,54.0,...,7,,47,0.711,1003,42.369562,-72.63599,Amherst,MA,Hampshire
3,1005,5228.0,44.1,70568.0,213700.0,32169.0,327.0,505.0,926.0,230.0,...,89,,47,44.242,1005,42.32916,-72.139465,Barre,MA,Worcester
4,1007,14888.0,42.5,80502.0,258000.0,36359.0,1455.0,2185.0,921.0,1410.0,...,441,,36,52.643,1007,42.280267,-72.402056,Belchertown,MA,Hampshire


In [6]:
df.columns

Index(['Zipcode', 'Population', 'Median Age', 'Household Income',
       'median_home_value', 'Per Capita Income', 'Masters holders',
       'Bachelor holders', 'Median gross rent', 'Poverty Count',
       'Poverty Rate', 'Unemployment', 'Unemployment rate', 'pop_arc/eng',
       'pop_stem', 'pop_tech', 'pop_biz', 'commute time car', 'house_age',
       'Land-Sq-Mi', 'zip_code', 'latitude', 'longitude', 'city', 'state',
       'county'],
      dtype='object')

In [7]:
# Discard the commute-time column together with the zip-code / location
# identifier columns; none of them are used further down.
unused_columns = [
    'commute time car',
    'Zipcode', 'zip_code', 'latitude', 'longitude', 'city', 'state', 'county',
]
df = df.drop(columns=unused_columns)

In [8]:
df.count()

Population           32734
Median Age           32734
Household Income     32734
median_home_value    32734
Per Capita Income    32452
Masters holders      32734
Bachelor holders     32734
Median gross rent    32734
Poverty Count        32734
Poverty Rate         32416
Unemployment         32734
Unemployment rate    32416
pop_arc/eng          32734
pop_stem             32734
pop_tech             32734
pop_biz              32734
house_age            32734
Land-Sq-Mi           32734
dtype: int64

In [9]:
# Remove every row that contains one of the placeholder values
# (e.g. -666666666.0 shows up as a median_home_value in the preview above).
for placeholder in (-666666666.0, 666668684):
    contains_placeholder = df.eq(placeholder).any(axis=1)
    df = df[~contains_placeholder]

In [10]:
df.count()

Population           26531
Median Age           26531
Household Income     26531
median_home_value    26531
Per Capita Income    26531
Masters holders      26531
Bachelor holders     26531
Median gross rent    26531
Poverty Count        26531
Poverty Rate         26531
Unemployment         26531
Unemployment rate    26531
pop_arc/eng          26531
pop_stem             26531
pop_tech             26531
pop_biz              26531
house_age            26531
Land-Sq-Mi           26531
dtype: int64

In [None]:
# Add a "Population Density" feature (Population divided by Land-Sq-Mi) to df.
# `assign` builds a new frame instead of writing a column into the
# row-filtered frame left by the bad-data removal, which avoids pandas'
# SettingWithCopyWarning.
df = df.assign(**{"Population Density": df["Population"] / df["Land-Sq-Mi"]})

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# Correlation matrix plot for every remaining column (features and target).
correlations = df.corr()

# Take the tick positions and labels from the matrix itself so they can never
# drift out of sync with the columns actually present in the frame.
names = list(correlations.columns)
ticks = np.arange(len(names))

fig, ax = plt.subplots(figsize=(8, 8))
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Rotate the x labels so the long column names do not overlap.
ax.set_xticklabels(names, rotation=90)
ax.set_yticklabels(names)
plt.show()

In [None]:
# Remove the feature columns that were judged redundant from the correlation
# matrix, then show the per-column non-null counts of what is left.
correlated_columns = [
    'Unemployment',
    "Poverty Count",
    'pop_tech',
    'Bachelor holders',
    'pop_biz',
    'pop_stem',
    'Land-Sq-Mi',
]
df_test_col_removed = df.drop(columns=correlated_columns)
df_test_col_removed.count()

In [None]:
df_test_col_removed.count()

In [None]:
# # Save the dataset to csv
# df_test_col_removed.to_csv("clean_house_value_data.csv", index=False)

In [None]:
# Correlation matrix plot for the reduced column set.
correlations = df_test_col_removed.corr()

# Take the tick positions and labels from the matrix itself so they can never
# drift out of sync with the columns actually present in the frame.
names = list(correlations.columns)
ticks = np.arange(len(names))

fig, ax = plt.subplots(figsize=(8, 8))
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Rotate the x labels so the long column names do not overlap.
ax.set_xticklabels(names, rotation=90)
ax.set_yticklabels(names)
plt.show()

In [None]:
# Separate the target column from the features; y is reshaped into a single
# column (n_samples, 1) so it can be passed to a scaler later on.
target_column = "median_home_value"
X = df_test_col_removed.drop(columns=[target_column])
y = df_test_col_removed[target_column].values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
X

In [None]:
# Scatter of Population against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["Population"], y)
ax.set_xlabel("Population")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of Median Age against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["Median Age"], y)
ax.set_xlabel("Median Age")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of Household Income against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["Household Income"], y)
ax.set_xlabel("Household Income")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of Per Capita Income against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["Per Capita Income"], y)
ax.set_xlabel("Per Capita Income")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of Masters holders against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X['Masters holders'], y)
ax.set_xlabel('Masters holders')
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Trend for Median gross rent.
# This cell used to repeat the 'Masters holders' plot verbatim, while
# 'Median gross rent' was the only feature column left without a trend plot.
plt.scatter(X['Median gross rent'], y)
plt.xlabel('Median gross rent')
plt.ylabel("House Value")
plt.show()

In [None]:
# Scatter of Poverty Rate against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["Poverty Rate"], y)
ax.set_xlabel("Poverty Rate")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of Unemployment rate against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["Unemployment rate"], y)
ax.set_xlabel("Unemployment rate")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of pop_arc/eng against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["pop_arc/eng"], y)
ax.set_xlabel("pop_arc/eng")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of house_age against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["house_age"], y)
ax.set_xlabel("house_age")
ax.set_ylabel("House Value")
plt.show()

In [None]:
# Scatter of Population Density against the target to eyeball the trend
fig, ax = plt.subplots()
ax.scatter(X["Population Density"], y)
ax.set_xlabel("Population Density")
ax.set_ylabel("House Value")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set using sklearn's default split proportions; the fixed
# random_state keeps the split identical from run to run.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit both scalers on the training split only, then apply each fitted scaler
# to its training and test partitions.
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))
y_train_scaled, y_test_scaled = (y_scaler.transform(part) for part in (y_train, y_test))

In [None]:
X_test_scaled[0]

### Linear Regression Model

In [None]:
# Write the unscaled test-split features to features.csv, omitting the row index.
X_test.to_csv("features.csv", index=False)

In [None]:
## Create the model
# Ordinary least-squares linear regression with sklearn's default settings.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Fit the model on the scaled training features and the scaled training target.
model.fit(X_train_scaled, y_train_scaled)

In [None]:
# Predict on the scaled test features; the results are in scaled target units
# because the model was fitted on y_train_scaled.
predictions = model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the linear model on the test split (all values in scaled units):
#   - R2 from the estimator's own `score` method
#   - mean squared error between the scaled targets and the predictions
r2 = model.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

# Display the R2 score once more as the cell output
model.score(X_test_scaled, y_test_scaled)

In [None]:
# Residual plot: (prediction - actual) against the prediction, in scaled units.
# Each split is predicted once and the result reused for both axes.
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)

plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(test_predictions, test_predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()

# The x axis shows predictions, so the zero reference line has to span the
# predicted range, not the range of the true test targets.
zero_line_min = min(train_predictions.min(), test_predictions.min())
zero_line_max = max(train_predictions.max(), test_predictions.max())
plt.hlines(y=0, xmin=zero_line_min, xmax=zero_line_max)

plt.xlabel("Predicted value (scaled)")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot")
plt.show()

In [None]:
# Predict the home value for one hand-written sample that is already in scaled
# feature space: 11 values, one per feature column of X.
X_scaled_new = [[
     0.03112578,
     0.78546681,
    -0.58000409,
    -0.18338835,
    -0.22006741,
    -0.56337918,
    -0.35730793,
    -0.16093156,
     0.13908083,
    -0.30457034,
    -0.25241833,
]]

# The model outputs scaled target values; map them back to original units.
prediction_scaled = model.predict(X_scaled_new)
prediction = y_scaler.inverse_transform(prediction_scaled)
prediction

In [None]:
from sklearn.linear_model import Lasso

# LASSO regression with alpha=0.01, fitted on the scaled training split and
# evaluated on the scaled test split.
lasso = Lasso(alpha=.01)
lasso.fit(X_train_scaled, y_train_scaled)

predictions = lasso.predict(X_test_scaled)
r2 = lasso.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with alpha=0.01, fitted on the scaled training split and
# evaluated on the scaled test split.
ridge = Ridge(alpha=.01)
ridge.fit(X_train_scaled, y_train_scaled)

predictions = ridge.predict(X_test_scaled)
r2 = ridge.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
from sklearn.linear_model import ElasticNet

# ElasticNet regression with alpha=0.01, fitted on the scaled training split
# and evaluated on the scaled test split.
elasticnet = ElasticNet(alpha=.01)
elasticnet.fit(X_train_scaled, y_train_scaled)

predictions = elasticnet.predict(X_test_scaled)
r2 = elasticnet.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Random Forest Regression Model

In [None]:
# Train a random forest regression model on the scaled training split.
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the forest (and every score below) reproducible.
rf = RandomForestRegressor(n_estimators=300, random_state=42)
# The target is stored as a single column (n_samples, 1); flatten it to 1-D,
# which is the shape the forest expects for one output (a column vector
# triggers sklearn's DataConversionWarning).
rf = rf.fit(X_train_scaled, y_train_scaled.ravel())

# Save the fitted model; the context manager closes the file even on error.
filename = 'cl_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

# R2 for training data
rf.score(X_train_scaled, y_train_scaled)

In [None]:
# Pair each feature importance of the forest with its column name and order
# the pairs from most to least important.
importances = rf.feature_importances_
importance_pairs = zip(importances, X.keys())
importances_list = sorted(importance_pairs, reverse=True)

In [None]:
importances_list

In [None]:
# R2 for testing data
rf.score(X_test_scaled, y_test_scaled)

In [None]:
# Reload the pickled model from disk and score it on the scaled test split.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test_scaled, y_test_scaled)

In [None]:
result

In [None]:
# Predict the home value for one sample that is already in scaled feature
# space: 11 values, one per feature column of X.
X_scaled_new = [[ 0.03112578,  0.78546681, -0.58000409, -0.18338835, -0.22006741,
       -0.56337918, -0.35730793, -0.16093156,  0.13908083, -0.30457034,
       -0.25241833]]
prediction_scaled = rf.predict(X_scaled_new)
# The forest returns a 1-D array, but y_scaler was fitted on a single column,
# so reshape to (n_samples, 1) before mapping back to original units.
prediction = y_scaler.inverse_transform(np.asarray(prediction_scaled).reshape(-1, 1))
prediction

In [None]:
# Repeat the prediction with the model reloaded from disk, using the scaled
# sample X_scaled_new defined in an earlier prediction cell.
prediction_scaled = loaded_model.predict(X_scaled_new)
# The forest returns a 1-D array, but y_scaler was fitted on a single column,
# so reshape to (n_samples, 1) before mapping back to original units.
prediction = y_scaler.inverse_transform(np.asarray(prediction_scaled).reshape(-1, 1))
prediction

In [None]:
loaded_model.feature_importances_

In [None]:
# Pair each feature importance of the reloaded model with its column name and
# order the pairs from most to least important.
importances = loaded_model.feature_importances_
importance_pairs = zip(importances, X.keys())
importances_list = sorted(importance_pairs, reverse=True)

In [None]:
importances_list

In [5]:
# Raw (unscaled) feature values for a single sample. The leading values and
# house_age match row 0 of the df.head() preview, and the last value equals
# Population / Land-Sq-Mi for that row, so the order appears to follow X's columns.
# NOTE(review): the models above were trained on scaled inputs, so this would
# presumably need X_scaler.transform before being passed to them — confirm intended use.
X_new = [[17423.0,45.0,56714.0,30430.0,1353.0,975.0,8.391207,2.749240,149,49,1522.723300]]