In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree

import pickle

### Prepare Data

In [None]:
# Load data from csv files
# All three tables are read from the cl_Resources folder. Later cells join them
# on their 'Zipcode' column, so each file is expected to provide one.
df = pd.read_csv('cl_Resources/census_data.csv')
df_land = pd.read_csv('cl_Resources/Zipcode-Population-Density-2010.csv')
df_unemployment = pd.read_csv('cl_Resources/Unemployment.csv')

In [None]:
# Preview the first rows of the census table
df.head()

In [None]:
# Preview the first rows of the land-area table
df_land.head()

In [None]:
# Preview the first rows of the unemployment table
df_unemployment.head()

In [None]:
# Non-null count per column of the census table, before any cleaning
df.count()

In [None]:
# drop rows which have nan 
df.dropna(inplace=True)

In [None]:
df.count()

In [None]:
# Delete rows which have negative data
df_new = df[~(df < 0).any(axis=1)]

In [None]:
df_new.count()

In [None]:
df_unemployment.count()

In [None]:
# Delete rows which have 
df_unemployment.dropna(inplace=True)

In [None]:
df_unemployment.count()

In [None]:
# Join df_new with df_land to add Land-Sq-Mi column to df_new
df_new = df_new.join(df_land.set_index('Zipcode'), on='Zipcode')

In [None]:
df_new.count()

In [None]:
# Delete the nan data added by the join
df_new.dropna(inplace=True)

In [None]:
df_new.count()

In [None]:
# Join df_new with df_unemployment to add Unemp Rate column to df_new
df_new = df_new.join(df_unemployment.set_index('Zipcode'), on='Zipcode')

In [None]:
df_new.count()

In [None]:
# Delete the nan data added by the join
df_new.dropna(inplace=True)

In [None]:
df_new.count()

In [None]:
# df_new.sort_values(by='median_home_value', ascending=False)

In [None]:
# Add a new column to df_new
df_new["Population Density"] = df_new["Population"]/df_new["Land-Sq-Mi"]

In [None]:
# Drop Zipcode column
df_new.drop(columns=['Zipcode'], inplace=True)

In [None]:

# df_new.sort_values(by='median_home_value', ascending=False)

In [None]:
# Non-null count per column of the fully prepared frame
df_new.count()

In [None]:
# Preview the prepared frame (Zipcode removed, Population Density added)
df_new.head()

In [None]:
# Correlation Matrix Plot
import matplotlib.pyplot as plt
import pandas
import numpy

# Correlate the candidate features only, so the target column is left out
df_test = df_new.drop("median_home_value", axis=1)
correlations = df_test.corr()
# plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# Take the tick positions and labels from the matrix itself, so they always
# match the number and order of the columns that were actually correlated
# (a hard-coded list silently mislabels rows if the joined tables change).
names = list(correlations.columns)
ticks = numpy.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Only the y axis carries the names; the x axis keeps the column positions
ax.set_yticklabels(names)
plt.show()

In [None]:
# Based on the correlation analysis, drop the "Poverty Count" column
# (done on the feature-only copy df_test; df_new is updated in a later cell)
df_test_new = df_test.drop("Poverty Count", axis=1)

In [None]:
# Correlation matrix of the features that remain after dropping "Poverty Count"
correlations_new = df_test_new.corr()
# plot correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations_new, vmin=-1, vmax=1)
fig.colorbar(cax)
# Take the tick positions and labels from the matrix itself, so they always
# match the number and order of the columns that were actually correlated.
names = list(correlations_new.columns)
ticks = np.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Only the y axis carries the names; the x axis keeps the column positions
ax.set_yticklabels(names)
plt.show()

In [None]:
# Based on the correlation analysis, drop the "Poverty Count" column from the modelling frame
df_new = df_new.drop("Poverty Count", axis=1)
df_new.head()

In [None]:
# Split the prepared frame into the feature matrix X and the target y.
# Sklearn works with two-dimensional inputs, so the target is taken as a
# one-column selection, which yields an (n, 1) array.

X = df_new.loc[:, ["Population", "Median Age", "Household Income", "Per Capita Income", "Poverty Rate", "Land-Sq-Mi", "Unemp Rate", "Population Density"]]
# X = df_new[["Household Income", "Population Density", "Poverty Rate", "Per Capita Income", "Median Age", "Unemp Rate"]]
# X = df_new[["Household Income", "Population Density", "Poverty Rate", "Median Age", "Unemp Rate"]]
# X = df_new[["Household Income", "Per Capita Income", "Poverty Rate", "Median Age", "Unemp Rate"]]
y = df_new[["median_home_value"]].values
print(X.shape, y.shape)

In [None]:
# Plot every feature against the target to see if a linear trend exists.
# One figure per feature, in the column order of X. This replaces eight
# copy-pasted cells, so a feature added to X is plotted automatically.
# The x-axis label is the column name, except where a fuller label is given here.
axis_labels = {"Unemp Rate": "Unemployment Rate"}
for feature in X.columns:
    fig, ax = plt.subplots()
    ax.scatter(X[feature], y)
    ax.set_xlabel(axis_labels.get(feature, feature))
    ax.set_ylabel("House Value")
    plt.show()

In [None]:
# Use sklearn's `train_test_split` to split the data into training and testing
# random_state is fixed so the same rows land in each split on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Scale data with SKLearn StandardScaler
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
X_test_scaled[0]

In [None]:
y_test[0]

### Linear Regression Model

In [None]:
## Create the model
# Plain linear regression with the estimator's default settings
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Fitting our model with training data
# Both the features and the target are the standardized versions.
model.fit(X_train_scaled, y_train_scaled)

In [None]:
# Make prediction with test data
# The model was fitted on the scaled target, so these predictions are in scaled units too.
predictions = model.predict(X_test_scaled)

In [None]:
# Score the linear model on the test data:
#   MSE - mean squared error between the scaled targets and the predictions
#   r2  - r-squared value for the same pair
from sklearn.metrics import mean_squared_error, r2_score
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = r2_score(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

# The model's own `score` method reports the r2 score as well
model.score(X_test_scaled, y_test_scaled)

In [None]:
# Residual plot: predicted value on the x-axis, prediction error
# (predicted - actual) on the y-axis, for both splits. All values are scaled.
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)
plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(test_predictions, test_predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
# The zero line must span what the x-axis shows, i.e. the predictions,
# not the range of the true test targets.
x_min = min(train_predictions.min(), test_predictions.min())
x_max = max(train_predictions.max(), test_predictions.max())
plt.hlines(y=0, xmin=x_min, xmax=x_max)
plt.xlabel("Predicted value (scaled)")
plt.ylabel("Residual: predicted - actual (scaled)")
plt.title("Residual Plot")
plt.show()

In [None]:
# Make a prediction
# X_scaled_new = [[ 0.80809856,  1.35009033, -0.01890773,  0.34549878, -0.82941627, 0.2246137 ]]
#X_scaled_new = [[ 3.48460584, -0.82941627,  0.80809856, 0.2345, 0.5678, -0.33548015, 0.2246137 ,  1.35009033]]
# One sample of 8 values fed straight to the model, so it must already be
# standardized; presumably in the column order of X — confirm.
X_scaled_new = [[ 3.48460584, -0.82941627,  0.80809856,  0.34549878, -0.01890773, -0.33548015,  0.2246137 ,  1.35009033]]
prediction_scaled = model.predict(X_scaled_new)
# Undo the target scaling to express the prediction in the units of median_home_value
prediction = y_scaler.inverse_transform(prediction_scaled)
prediction

In [None]:
## LASSO model
# Regularization strength: alpha = .01
from sklearn.linear_model import Lasso

# Build the estimator first, then fit it on the scaled training data
lasso = Lasso(alpha=.01)
lasso.fit(X_train_scaled, y_train_scaled)
predictions = lasso.predict(X_test_scaled)

# Test-set error and r-squared, both on the scaled target
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = r2_score(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
## Ridge model
# Regularization strength: alpha = .01
from sklearn.linear_model import Ridge

# Build the estimator first, then fit it on the scaled training data
ridge = Ridge(alpha=.01)
ridge.fit(X_train_scaled, y_train_scaled)

predictions = ridge.predict(X_test_scaled)

# Test-set error and r-squared, both on the scaled target
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = r2_score(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
## ElasticNet model
# Regularization strength: alpha = .01
from sklearn.linear_model import ElasticNet

# Build the estimator first, then fit it on the scaled training data
elasticnet = ElasticNet(alpha=.01)
elasticnet.fit(X_train_scaled, y_train_scaled)

predictions = elasticnet.predict(X_test_scaled)

# Test-set error and r-squared, both on the scaled target
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = r2_score(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

### Random Forest Regression Model

In [None]:
# Train a random forest regression model
from sklearn.ensemble import RandomForestRegressor
# random_state pins the forest's random sampling so the scores and the saved
# model are reproducible; 42 matches the seed used for the train/test split.
rf = RandomForestRegressor(n_estimators=300, random_state=42)
# The scaled target is an (n, 1) column; the forest expects a 1-D target,
# so flatten it (passing the column form triggers a DataConversionWarning).
rf = rf.fit(X_train_scaled, y_train_scaled.ravel())

# save the model
filename = 'cl_model.sav'
# The with-block closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

# R2 for training data
rf.score(X_train_scaled, y_train_scaled)

In [None]:
importances = rf.feature_importances_
importances_list = sorted(zip(rf.feature_importances_, X.keys()), reverse=True)

In [None]:
importances_list

In [None]:
sorted(zip(rf.feature_importances_, X.keys()), reverse=True)

In [None]:
# R2 for testing data
rf.score(X_test_scaled, y_test_scaled)

In [None]:
# load the model
# NOTE: unpickling runs code stored in the file, so only load model files
# this notebook (or another trusted source) wrote.
# The with-block closes the file once the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# R2 of the reloaded model on the test data
result = loaded_model.score(X_test_scaled, y_test_scaled)

In [None]:
# accuracy = accuracy_score(y_test, predicted)

In [None]:
# Make a prediction
# X_scaled_new = ([[ 0.90809856,  1.35009033, -0.01890773,  0.94549878, -0.82941627, 0 ]])
#X_scaled_new = [[ 0.80809856,  1.35009033,  3.48460584, 0.3456,  0.34549878, -0.82941627,  0.2246137 ]]
# X_scaled_new = [[ 3.48460584, -0.82941627,  0.80809856, 0.2345, 0.5678, -0.33548015, 0.2246137 ,  1.35009033]]
X_scaled_new = [[ 3.48460584, -0.82941627,  0.80809856,  0.34549878, -0.01890773, -0.33548015,  0.2246137 ,  1.35009033]]
prediction_scaled = rf.predict(X_scaled_new)
# The forest returns a 1-D array, but y_scaler was fitted on an (n, 1) column
# and current scikit-learn rejects 1-D input to inverse_transform.
# Reshape to a single column before undoing the scaling.
prediction = y_scaler.inverse_transform(prediction_scaled.reshape(-1, 1))
prediction

In [None]:
# Make a prediction with the model reloaded from disk
# X_scaled_new = ([[ 0.90809856,  1.35009033, -0.01890773,  0.94549878, -0.82941627, 0 ]])
# X_scaled_new = [[ 0.80809856,  1.35009033,  3.48460584, 0.3456,  0.34549878, -0.82941627,  0.2246137 ]]
prediction_scaled = loaded_model.predict(X_scaled_new)
# The forest returns a 1-D array, but y_scaler was fitted on an (n, 1) column
# and current scikit-learn rejects 1-D input to inverse_transform.
# Reshape to a single column before undoing the scaling.
prediction = y_scaler.inverse_transform(prediction_scaled.reshape(-1, 1))
prediction

In [None]:
# Raw feature importances of the reloaded model, in the column order of X
loaded_model.feature_importances_

In [None]:
# Pair each importance with its column name and sort from most to least important
importances_new = loaded_model.feature_importances_
importances_list_new = sorted(zip(loaded_model.feature_importances_, X.keys()), reverse=True)

In [None]:
importances_list_new

In [None]:
# Same ranking built from the in-memory forest; repeats the assignment made in an earlier cell
importances_list = sorted(zip(rf.feature_importances_, X.keys()), reverse=True)

### Decision Tree Regression model

In [None]:
# Fit a Decision Tree Regression model on the scaled training data,
# then report its R2 on the scaled test data
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor().fit(X_train_scaled, y_train_scaled)
clf.score(X_test_scaled, y_test_scaled)

In [None]:
prediction_scaled = clf.predict(X_scaled_new)
prediction = y_scaler.inverse_transform(prediction_scaled)

In [None]:
prediction

### GridSearch Model

### Deep Learning

In [None]:
from keras.models import Sequential
# NOTE: this rebinds `model`, which referred to the LinearRegression estimator in earlier cells.
model = Sequential()

In [None]:
from keras.layers import Dense
# 8 inputs, matching the 8 feature columns selected for X
number_inputs = 8
number_hidden_nodes = 20
# Single hidden layer with ReLU activation
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

In [None]:
# Output layer: one unit with a linear activation, because the network
# regresses a single continuous value and is compiled with an MSE loss.
# Softmax over a single unit always outputs 1.0, so it could never fit the target.
number_classes = 1
model.add(Dense(units=number_classes, activation='linear'))

In [None]:
# Print the layer-by-layer summary of the network
model.summary()

In [None]:
# model.compile(optimizer='adam',
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])

# Mean squared error loss is used instead of the classification setup commented out above
model.compile(optimizer='rmsprop',
              loss='mse')

In [None]:
# model.fit(
#     X_train_scaled,
#     y_train_scaled,
#     epochs=1000,
#     shuffle=True,
#     verbose=1
# )

In [1]:
import cl_model

In [2]:
# One sample of 8 values; identical to the X_scaled_new vector used earlier in this notebook
X_new = [[ 3.48460584, -0.82941627,  0.80809856,  0.34549878, -0.01890773, -0.33548015,  0.2246137 ,  1.35009033]]

In [3]:
# File names handed to the cl_model helpers below
model_filename = 'cl_model.sav'
model_r2_filename = 'cl_model_r2.csv'
X_filename = 'X.csv'

In [None]:
# NOTE(review): this cell has no execution count, so it was apparently not run
# in the saved session; make_prediction below presumably relies on files from an earlier build — confirm.
cl_model.build_model(model_r2_filename, model_filename)

In [4]:
# NOTE(review): model_filename is not passed here although build_model receives it;
# confirm the argument order against cl_model.make_prediction.
p = cl_model.make_prediction(X_new, model_r2_filename, X_filename)

Start making prediction ...
Finisn making prediction



In [5]:
p

{'Prediction': 512179.0,
 'R2': 0.7261756163456871,
 'importance': [(0.5938583867574011, 'Per Capita Income'),
  (0.10082554899101619, 'Population Density'),
  (0.09054515607819855, 'Household Income'),
  (0.06026247167123873, 'Population'),
  (0.04994681270724938, 'Median Age'),
  (0.048105924664446145, 'Poverty Rate'),
  (0.03429546296619177, 'Land-Sq-Mi'),
  (0.022160236164258343, 'Unemp Rate')]}