# Testing Regression Models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

Reading city data

In [2]:
# Load the city indicators from nyc.csv (relative path, resolved against the
# kernel's working directory).
city_df = pd.read_csv("nyc.csv")
# Disabled: would remove the last row of the frame in place.
# city_df.drop(city_df.tail(1).index,inplace=True)
# Bare last expression so the notebook renders the frame.
city_df

Unnamed: 0,year,population,migration,education,health,safety,unemployment,housing,income,gdp
0,2010,8190209,123294,33.4,85.2,581.68,8.9,400006,29326,71022
1,2011,8272948,128350,34.1,85.4,623.6,8.6,386463,30200,70586
2,2012,8346693,127917,34.7,86.2,639.29,8.7,373526,30730,73181
3,2013,8396091,121249,35.7,86.6,623.91,7.6,382238,32540,73335
4,2014,8433806,124068,35.9,88.6,596.7,6.5,400567,32910,74159
5,2015,8463049,118088,36.8,90.7,585.77,5.3,411400,34396,75541
6,2016,8463153,114709,37.0,92.2,573.42,4.8,423722,35508,76961
7,2017,8437478,125478,37.3,92.8,538.9,4.5,443741,37447,78245
8,2018,8390081,110624,39.0,92.8,541.03,4.0,467823,39589,80712
9,2019,8336817,121898,39.2,93.1,570.72,3.7,478920,43046,81903


In [3]:
# Regression target.
# NOTE(review): this cell was originally commented as using "migration" as the
# target, but the column actually read is 'population', and the prediction
# recorded at the end of the notebook (~8.3 million) is on the population
# scale. Behaviour is kept unchanged -- confirm which column is intended.
migration = city_df['population'].tolist()  # name kept for compatibility; holds 'population' values
y = migration

# Features: every column except year, migration and population.
# drop() returns a new frame rather than mutating city_df in place, so this
# cell can be re-run without a KeyError (the columns are still present the
# next time) and the frame displayed by the loading cell stays accurate.
features_df = city_df.drop(['migration', 'year', 'population'], axis=1)
X = features_df.to_numpy()
X.shape

(10, 7)

In [4]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split -- and every score computed from
# it -- is identical after Restart Kernel -> Run All. Without it the metrics
# change on every run, which is very noticeable with this few rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Create model.
# random_state is fixed so the fitted ensemble, its predictions and the
# scores derived from them are repeatable across kernel restarts.
params = {'n_estimators': 100, 'max_depth': 3, 'random_state': 42}
# The variable keeps the name `linreg` because later cells refer to it, even
# though the active estimator is a gradient-boosted tree ensemble.
linreg = GradientBoostingRegressor(**params)

# Other estimators that can be swapped in (left disabled):
# linreg = RandomForestRegressor(**params)
# linreg = LinearRegression()
# linreg = LogisticRegression()  # NOTE: a classifier, not a regressor -- unsuitable for a continuous y
linreg.fit(X_train, y_train)

# Calculate our y hat (how our model performs against the test data held off)
y_hat_test = linreg.predict(X_test)

In [6]:
# Error metrics on the held-out rows: Mean Squared Error, its square root
# (Root Mean Squared Error) and Mean Absolute Error.
test_mse = mean_squared_error(y_test, y_hat_test)
test_mae = mean_absolute_error(y_test, y_hat_test)
test_rmse = np.sqrt(test_mse)

# Report RMSE first, then MAE.
print("the test RMSE is: ", test_rmse)
print("the test MAE is: ", test_mae)

the test RMSE is:  68913.52349060177
the test MAE is:  54510.56174091296


In [7]:
# 3-fold cross-validation of the same estimator over all rows of X / y.
# The scorer is the negated mean absolute error, so values closer to zero
# indicate a better fit.
cv_3_results = cross_val_score(
    linreg,
    X,
    y,
    cv=3,
    scoring="neg_mean_absolute_error",
)

In [8]:
# Show the per-fold scores returned by cross_val_score (one entry per fold).
cv_3_results

array([-133215.11651366,  -61239.20330087,  -75024.17839813])

In [22]:
# What-if inputs for linreg.predict(). Each scenario is a single-row 2-D list
# whose values follow the column order of X:
# education, health, safety, unemployment, housing, income, gdp.
#
# Disabled ATL scenarios, kept for reference:
# ATL_X_2019 = [[56.5, 90.4, 770.30, 3.2, 235574, 54414, 61761]]
# ATL_X_pandemic = [[56.5, 70, 770.30, 3.2, 235574, 54414, 61761]]
# ATL_X_economic = [[56.5, 90.4, 770.30, 6.2, 235574, 44414, 51761]]
# ATL_X_safety = [[56.5, 90.4, 500, 3.2, 235574, 54414, 61761]]
# ATL_X_education = [[80.5, 90.4, 770.30, 3.2, 235574, 54414, 61761]]

_FEATURE_ORDER = ('education', 'health', 'safety', 'unemployment', 'housing', 'income', 'gdp')

# Baseline row: the 2019 values from the data table shown earlier.
_NYC_BASELINE_2019 = (39.2, 93.1, 570.72, 3.7, 478920, 43046, 81903)


def _scenario(baseline, **overrides):
    """Return ``[row]`` where ``row`` is a fresh copy of ``baseline`` with the named features replaced."""
    row = list(baseline)
    for feature, value in overrides.items():
        row[_FEATURE_ORDER.index(feature)] = value
    return [row]


NYC_X_2019 = _scenario(_NYC_BASELINE_2019)
NYC_X_pandemic = _scenario(_NYC_BASELINE_2019, health=70)
NYC_X_economic = _scenario(_NYC_BASELINE_2019, unemployment=6.7, income=33046, gdp=71903)
NYC_X_safety = _scenario(_NYC_BASELINE_2019, safety=370.72)
NYC_X_education = _scenario(_NYC_BASELINE_2019, education=59.2)

In [25]:
# Predict y for the NYC_X_safety scenario: the 2019 row with only the safety
# value changed (570.72 -> 370.72).
linreg.predict(NYC_X_safety)

array([8337071.51139307])