In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree

import pickle

### Prepare Data

In [None]:
# Load the modelling table from CSV.
# The three commented-out reads are earlier inputs kept for reference
# (presumably the raw sources that home_value_calc.csv was built from - TODO confirm).
# df = pd.read_csv('cl_Resources/census_data.csv')
# df_land = pd.read_csv('cl_Resources/Zipcode-Population-Density-2010.csv')
# df_unemployment = pd.read_csv('cl_Resources/Unemployment.csv')
df = pd.read_csv('cl_Resources/home_value_calc.csv')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.count()

In [None]:
# Discard every row in which any column holds the -666666666.0 placeholder value.
has_placeholder = df.eq(-666666666.0).any(axis=1)
df = df.loc[~has_placeholder]

In [None]:
df.head()

In [None]:
df.count()

In [None]:
# Add a "Population Density" column: Population divided by Land-Sq-Mi.
# assign() returns a new frame instead of writing into `df` in place; `df` was
# produced by a boolean-mask filter earlier, and item-assignment on such a
# frame can trigger pandas' SettingWithCopyWarning.
df = df.assign(**{"Population Density": df["Population"] / df["Land-Sq-Mi"]})

In [None]:
df.head()

In [None]:
# Columns removed before modelling (identifiers, location fields and other
# columns that are not used as inputs).
columns_to_remove = [
    "Poverty Count",
    "commute time car",
    "Zipcode",
    "zip_code",
    "latitude",
    "longitude",
    "city",
    "state",
    "county",
    "Bachelor holders",
    "pop_biz",
    "pop_stem",
]
df = df.drop(columns=columns_to_remove)

In [None]:
df.head()

In [None]:
# Feature frame: everything except the prediction target.
df_test = df.drop(columns=["median_home_value"])

In [None]:
df_test.columns

In [None]:
# Correlation matrix plot for the feature columns.
# Tick positions and labels are derived from the correlation frame itself, so
# they always match the columns actually present (they used to be a hard-coded
# count of 14 plus a hand-typed label list, with the x-axis labels disabled).
# `plt` and `np` come from the import cell at the top of the notebook.
correlations = df_test.corr()
names = list(correlations.columns)
ticks = np.arange(len(names))

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Rotate the x labels so the long column names do not overlap.
ax.set_xticklabels(names, rotation=90)
ax.set_yticklabels(names)
plt.show()

In [None]:
# Split the data into the feature matrix X and the target y.
# y is kept as a single-column 2-D array because the scalers used later
# work on two-dimensional input.
X = df_test
y = df[["median_home_value"]].values
print(X.shape, y.shape)

In [None]:
# House value against Population, to eyeball whether a linear trend exists.
# (The same plot is repeated for the other features below.)
fig, ax = plt.subplots()
ax.scatter(X["Population"], y)
ax.set_xlabel("Population")
ax.set_ylabel("House Value")

In [None]:
# House value against Median Age, to eyeball whether a linear trend exists.
fig, ax = plt.subplots()
ax.scatter(X["Median Age"], y)
ax.set_xlabel("Median Age")
ax.set_ylabel("House Value")

In [None]:
# House value against Household Income, to eyeball whether a linear trend exists.
fig, ax = plt.subplots()
ax.scatter(X["Household Income"], y)
ax.set_xlabel("Household Income")
ax.set_ylabel("House Value")

In [None]:
# House value against Per Capita Income, to eyeball whether a linear trend exists.
fig, ax = plt.subplots()
ax.scatter(X["Per Capita Income"], y)
ax.set_xlabel("Per Capita Income")
ax.set_ylabel("House Value")

In [None]:
# House value against Poverty Rate, to eyeball whether a linear trend exists.
fig, ax = plt.subplots()
ax.scatter(X["Poverty Rate"], y)
ax.set_xlabel("Poverty Rate")
ax.set_ylabel("House Value")

In [None]:
# Plot the data to see if a linear trend exists for Land-Sq-Mi
# (the same kind of plot is drawn for each feature)
plt.scatter(X["Land-Sq-Mi"], y)
plt.xlabel("Land-Sq-Mi")
plt.ylabel("House Value")

In [None]:
# House value against Unemployment rate, to eyeball whether a linear trend exists.
fig, ax = plt.subplots()
ax.scatter(X["Unemployment rate"], y)
ax.set_xlabel("Unemployment rate")
ax.set_ylabel("House Value")

In [None]:
# House value against Population Density, to eyeball whether a linear trend exists.
fig, ax = plt.subplots()
ax.scatter(X["Population Density"], y)
ax.set_xlabel("Population Density")
ax.set_ylabel("House Value")

In [None]:
# Hold out part of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
from sklearn.model_selection import train_test_split

splits = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits

In [None]:
# Standardize features and target with scikit-learn's StandardScaler.
# Each scaler is fitted on the training split only and then applied
# unchanged to the test split.
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
X_test_scaled[0]

In [None]:
y_test[0]

### Linear Regression Model

In [None]:
## Create the model
# Unfitted scikit-learn LinearRegression; it is fitted on the scaled training data in the next cell.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Fit the model on the standardized training features and the standardized training target
model.fit(X_train_scaled, y_train_scaled)

In [None]:
# Predict the target (still in scaled units) for the held-out test features
predictions = model.predict(X_test_scaled)

In [None]:
# Evaluate the model on the test data:
# 1. mean squared error (MSE)
# 2. R-squared (R2)
from sklearn.metrics import mean_squared_error, r2_score

MSE = mean_squared_error(y_test_scaled, predictions)
# Computed from the predictions already made. r2_score was imported but never
# used before, so both ways of getting R2 in this cell were the same
# `score` call.
r2 = r2_score(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

# Or by calling the `score` method on the model to show the r2 score
model.score(X_test_scaled, y_test_scaled)

In [None]:
# Residual plot: (predicted - actual) against the predicted value, in scaled units.
# Predict once per split and flatten, so the subtraction is element-wise
# whatever array shape the estimator returns.
train_pred = model.predict(X_train_scaled).ravel()
test_pred = model.predict(X_test_scaled).ravel()

plt.scatter(train_pred, train_pred - y_train_scaled.ravel(), c="blue", label="Training Data")
plt.scatter(test_pred, test_pred - y_test_scaled.ravel(), c="orange", label="Testing Data")
plt.legend()
# The zero line must span the plotted x-range, i.e. the predictions of both
# splits. It used to span the range of the actual test targets.
x_low = min(train_pred.min(), test_pred.min())
x_high = max(train_pred.max(), test_pred.max())
plt.hlines(y=0, xmin=x_low, xmax=x_high)
plt.xlabel("Predicted value (scaled)")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot")
plt.show()

In [None]:
X_test_scaled[0]

In [None]:
# Predict the home value for one already-scaled feature row (14 values) and
# convert the result back to the original units with the target scaler.
# NOTE(review): the values are presumably in the column order of X - confirm.
X_scaled_new = [[
    -0.51078969, 0.11738345, 0.22866739, 0.34154584, -0.27276221,
    0.58311992, -0.45964101, -0.22551918, 0.32921035, -0.37015243,
    -0.42977507, -0.01418429, -0.01011539, -0.25559417,
]]
prediction_scaled = model.predict(X_scaled_new)
prediction = y_scaler.inverse_transform(prediction_scaled)
prediction

In [None]:
## LASSO model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import Lasso

# Build the estimator, then fit it on the scaled training split
lasso = Lasso(alpha=.01)
lasso.fit(X_train_scaled, y_train_scaled)

# Evaluate on the held-out split
predictions = lasso.predict(X_test_scaled)
r2 = lasso.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
## Ridge model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import Ridge

# Build the estimator, then fit it on the scaled training split
ridge = Ridge(alpha=.01)
ridge.fit(X_train_scaled, y_train_scaled)

# Evaluate on the held-out split
predictions = ridge.predict(X_test_scaled)
r2 = ridge.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
## ElasticNet model
# Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import ElasticNet

# Build the estimator, then fit it on the scaled training split
elasticnet = ElasticNet(alpha=.01)
elasticnet.fit(X_train_scaled, y_train_scaled)

# Evaluate on the held-out split
predictions = elasticnet.predict(X_test_scaled)
r2 = elasticnet.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Random Forest Regression Model

In [None]:
# Train a random forest regression model
from sklearn.ensemble import RandomForestRegressor

# random_state makes the forest reproducible from run to run (same seed as the
# train/test split).
# The target is flattened with ravel(): y_train_scaled is an (n, 1) column and
# the forest expects a 1-D target, otherwise it emits a conversion warning.
rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf = rf.fit(X_train_scaled, y_train_scaled.ravel())

# save the model; the `with` block closes the file handle deterministically
filename = 'cl_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

# R2 for training data
rf.score(X_train_scaled, y_train_scaled)

In [None]:
# Pair each importance with its feature name and rank them, most important first.
importances = rf.feature_importances_
importances_list = sorted(zip(importances, X.columns), reverse=True)

In [None]:
importances_list

In [None]:
sorted(zip(rf.feature_importances_, X.keys()), reverse=True)

In [None]:
# R2 for testing data
rf.score(X_test_scaled, y_test_scaled)

In [None]:
# load the model
# NOTE: pickle.load can execute arbitrary code - only load files this notebook
# wrote itself, never a pickle from an untrusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# R2 of the reloaded model on the test split; shown as the cell output so it
# can be compared with the in-memory model's score.
result = loaded_model.score(X_test_scaled, y_test_scaled)
result

In [None]:
# accuracy = accuracy_score(y_test, predicted)

In [None]:
# Make a prediction for one already-scaled feature row (14 values).
# NOTE(review): the values are presumably in the column order of X - confirm.
X_scaled_new = [[-0.51078969,  0.11738345,  0.22866739,  0.34154584, -0.27276221,
        0.58311992, -0.45964101, -0.22551918,  0.32921035, -0.37015243,
       -0.42977507, -0.01418429, -0.01011539, -0.25559417]]

prediction_scaled = rf.predict(X_scaled_new)
# The forest returns a 1-D array, while the scaler was fitted on a single
# column and works on (n_samples, 1) input. Reshape explicitly before
# inverting; 1-D input may be rejected by scikit-learn.
prediction = y_scaler.inverse_transform(prediction_scaled.reshape(-1, 1))
prediction

In [None]:
# Make the same prediction with the model reloaded from disk
# (reuses X_scaled_new from the previous cell).
prediction_scaled = loaded_model.predict(X_scaled_new)
# The scaler was fitted on a single column and works on (n_samples, 1) input,
# so reshape the 1-D prediction explicitly before inverting the scaling.
prediction = y_scaler.inverse_transform(prediction_scaled.reshape(-1, 1))
prediction

In [None]:
loaded_model.feature_importances_

In [None]:
# Feature importances of the reloaded model, ranked most important first.
importances_new = loaded_model.feature_importances_
importances_list_new = sorted(zip(importances_new, X.columns), reverse=True)

In [None]:
importances_list_new

In [None]:
# Re-rank the in-memory forest's importances (most important first).
importances_list = sorted(zip(rf.feature_importances_, X.columns), reverse=True)

### Decision Tree Regression model

In [None]:
# Train a Decision Tree Regression model
from sklearn.tree import DecisionTreeRegressor

# Use the class imported on the line above (the cell used to import it and
# then go through `tree.` instead). random_state keeps the fitted tree, and
# therefore the score, reproducible from run to run.
clf = DecisionTreeRegressor(random_state=42)
clf = clf.fit(X_train_scaled, y_train_scaled)

# R2 on the held-out test split
clf.score(X_test_scaled, y_test_scaled)

In [None]:
# Predict for the same scaled feature row with the decision tree.
prediction_scaled = clf.predict(X_scaled_new)
# The tree returns a 1-D array, while the scaler was fitted on a single column
# and works on (n_samples, 1) input. Reshape explicitly before inverting.
prediction = y_scaler.inverse_transform(prediction_scaled.reshape(-1, 1))

In [None]:
prediction

### GridSearch Model

### Deep Learning

In [None]:
# Start an empty Keras Sequential network.
# NOTE: this rebinds `model`, which up to this point referred to the fitted LinearRegression.
from keras.models import Sequential
model = Sequential()

In [None]:
from keras.layers import Dense

# The input width must equal the number of feature columns fed to the network.
# It was hard-coded to 8, but the scaled training matrix has one column per
# feature in X (14 in this notebook), so derive it from the data instead.
number_inputs = X_train_scaled.shape[1]
number_hidden_nodes = 20
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [None]:
# Single output unit for the regression target.
# Use a linear (identity) activation: softmax over one unit always yields 1.0,
# so the network could never fit a continuous value.
number_classes = 1
model.add(Dense(units=number_classes, activation='linear'))

In [None]:
model.summary()

In [None]:
# Configure training: RMSprop optimizer minimizing mean squared error.
model.compile(loss='mse', optimizer='rmsprop')

In [None]:
# model.fit(
#     X_train_scaled,
#     y_train_scaled,
#     epochs=1000,
#     shuffle=True,
#     verbose=1
# )

In [1]:
import cl_model_new

In [2]:
# One unscaled feature row (14 values) that is passed to cl_model_new.make_prediction below.
# presumably ordered like the training feature columns - TODO confirm against cl_model_new
X_new = [[17423.0, 45.0, 56714.0, 30430.0, 1353.0, 975.0, 8.391207, 479.0, 2.749240, 149, 240, 49, 11.442, 1522.723300]]

In [3]:
# X_new = [[ 3.48460584, -0.82941627,  0.80809856,  0.34549878, -0.01890773, -0.33548015,  0.2246137 ,  1.35009033]]

In [4]:
# File names handed to cl_model_new.build_model and cl_model_new.make_prediction in the cells below
model_filename = 'cl_model.sav'
model_r2_filename = 'cl_model_r2.csv'

In [5]:
# Call cl_model_new.build_model with the R2 CSV name and the model file name.
# Per the log printed below, it loads/cleans the data, scales it and builds the model.
a = cl_model_new.build_model(model_r2_filename, model_filename)

Start loading, cleaning data ...
Finish loading data

Start scaling data ...
Finish scaling data

Start building model, this may take a little while ...


  rf = rf.fit(X_train_scaled, y_train_scaled)


Finish building model



In [6]:
# Call cl_model_new.load_data(); the printed log shows it loads, cleans and scales the data.
# The return value is indexable (d[1] is inspected in the next cell).
d = cl_model_new.load_data()

Start loading, cleaning data ...
Finish loading data

Start scaling data ...
Finish scaling data



In [None]:
list(d[1].values)

In [7]:
# Ask cl_model_new for a prediction for X_new; the result displayed below is a
# dict with 'Prediction', 'R2' and 'importance' entries.
p = cl_model_new.make_prediction(X_new, model_r2_filename, model_filename)

Start loading, cleaning data ...
Finish loading data

Start scaling data ...
Finish scaling data

Start making prediction ...
Finisn making prediction



In [8]:
p

{'Prediction': 192306.925,
 'R2': 0.8097843479963904,
 'importance': [(0.5121116135584018, 'Median gross rent'),
  (0.24614308016962258, 'Per Capita Income'),
  (0.0333490471799134, 'Population Density'),
  (0.031542625414429114, 'house_age'),
  (0.02873632942994995, 'Median Age'),
  (0.024756567302404605, 'Poverty Rate'),
  (0.023726557065823595, 'Household Income'),
  (0.019555108025219006, 'Unemployment rate'),
  (0.017432776219766393, 'Land-Sq-Mi'),
  (0.016596754717658865, 'Masters holders'),
  (0.014338498843446153, 'pop_arc/eng'),
  (0.011278174427091876, 'pop_tech'),
  (0.010722136736886347, 'Population'),
  (0.0097107309093862, 'Unemployment')]}