# Testing Regression Models

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

Reading city data

In [49]:
# Load the city indicators from a CSV path resolved against the working directory.
# (A previously commented-out step that dropped the last row is not applied;
# every row of the file is kept.)
DATA_FILE = "atlanta.csv"
city_df = pd.read_csv(DATA_FILE)
city_df

Unnamed: 0,year,population,migration,education,health,safety,unemployment,housing,income,gdp
0,2010,429410,16547,44.8,80.4,1071.63,10.3,152504,30688,52315
1,2011,437812,16850,47.2,78.8,1432.79,9.9,134362,33117,52808
2,2012,449016,20024,47.4,83.0,1379.05,8.8,131764,35829,52783
3,2013,453990,16294,48.4,82.8,1223.23,7.8,147272,36257,53217
4,2014,461154,19140,48.9,86.1,1227.43,6.7,165460,36936,54852
5,2015,468303,15962,48.3,88.9,1119.62,5.7,175916,39660,56590
6,2016,479174,23192,50.5,89.2,1083.63,5.1,187967,40882,58195
7,2017,491670,20051,49.2,89.5,935.72,4.5,202235,44690,59847
8,2018,498183,22481,53.4,89.8,768.79,3.8,221222,48869,61608
9,2019,506811,19052,56.5,90.4,770.3,3.2,235574,54414,61761


In [50]:
# Regression target (y): the values of the `population` column.
# NOTE(review): the variable is named `migration` and the original comment
# called the target "migration", but the data is read from `population`.
# The name and behaviour are kept unchanged here -- confirm which column was
# actually intended as the target.
migration = city_df['population'].tolist()
y = migration

# Features (X): every column except year, migration and population.
# drop() returns a new frame instead of mutating city_df in place, so this cell
# can be re-run without a KeyError on columns that were already removed.
features_df = city_df.drop(columns=['migration', 'year', 'population'])
X = features_df.to_numpy()
X.shape

(10, 7)

In [51]:
# Hold out 20% of the rows as a test set. random_state is fixed so the same
# rows land in train/test on every run; without it the split -- and every
# metric computed from it -- changes each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [52]:
# Create model.
# NOTE: despite its name, `linreg` holds a GradientBoostingRegressor; the name
# is kept because later cells refer to the fitted model through it.
# random_state pins the estimator's internal randomness so that refitting on
# the same data gives the same model.
params = {'n_estimators': 100, 'max_depth': 3, 'random_state': 42}
linreg = GradientBoostingRegressor(**params)
# Alternatives that were left commented out in this cell:
# RandomForestRegressor(**params), LinearRegression(), LogisticRegression().
linreg.fit(X_train, y_train)

# Calculate our y hat (how our model performs against the test data held off)
y_hat_test = linreg.predict(X_test)

In [54]:
# Score the held-out predictions:
#   - Mean Squared Error, and its square root (Root Mean Squared Error)
#   - Mean Absolute Error
test_mse = mean_squared_error(y_test, y_hat_test)
test_mae = mean_absolute_error(y_test, y_hat_test)
test_rmse = np.sqrt(test_mse)

# Report RMSE first, then MAE.
print("the test RMSE is: ", test_rmse)
print("the test MAE is: ", test_mae)

the test RMSE is:  3311.396411392649
the test MAE is:  3300.710239736305


In [55]:
# 3-fold cross-validation of the model over the full X / y, scored with the
# "neg_mean_absolute_error" scorer (MAE with its sign flipped, so values
# closer to 0 are better).
cv_3_results = cross_val_score(
    estimator=linreg,
    X=X,
    y=y,
    cv=3,
    scoring="neg_mean_absolute_error",
)

In [56]:
# Display the per-fold cross-validation scores (one negated-MAE value per fold).
cv_3_results

array([-19765.80023071,  -2132.55038223, -19839.59635257])

In [84]:
# What-if inputs for the model. Every scenario is a single-row 2-D list that
# starts from the 2019 values and overrides one or more features.
# Feature names follow the CSV columns left after dropping year, population
# and migration, in that order.
_FEATURE_ORDER = ('education', 'health', 'safety', 'unemployment', 'housing', 'income', 'gdp')
_BASELINE_2019 = (56.5, 90.4, 770.30, 3.2, 235574, 54414, 61761)


def _scenario(**overrides):
    """Return the 2019 baseline as ``[[...]]`` with the named features replaced."""
    row = list(_BASELINE_2019)
    for feature, value in overrides.items():
        # .index() raises ValueError for a name that is not a known feature.
        row[_FEATURE_ORDER.index(feature)] = value
    return [row]


X_2019 = _scenario()
X_pandemic = _scenario(health=70)
X_economic = _scenario(unemployment=6.2, income=44414, gdp=51761)
X_safety = _scenario(safety=500)
X_education = _scenario(education=80.5)

In [85]:
# Predict the target for the X_education scenario (the 2019 values with the
# first feature, education, raised to 80.5).
linreg.predict(X_education)

array([506810.01628531])