In [22]:
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [4]:
# Base table: one row per city, joining the population table with the area table.
df_pop = pd.read_csv('../data/usa/us_2010_2022_population.csv')
df_tot_area = pd.read_csv('../data/usa/us_area.csv')

# Inner join on 'City' keeps only cities present in both tables; any column name
# shared by the two tables is disambiguated with the '_pop' / '_area' suffix.
df_base = pd.merge(
    df_pop,
    df_tot_area,
    on='City',
    how='inner',
    suffixes=('_pop', '_area'),
)

In [7]:
# Observable tables, one DataFrame per quantity. The targets on the left and the
# paths on the right are paired by position, so keep both lists in the same order.
# NOTE(review): file names suggest 2010-2022 coverage, except the high-school
# table which appears to start in 2015 — confirm against the data.
(
    df_mean_income,
    df_median_income,
    df_poverty,
    df_travel_time,
    df_educ_highschool,
) = map(
    pd.read_csv,
    (
        '../data/usa/us_2010_2022_mean_inc_tot.csv',
        '../data/usa/us_2010_2022_med_inc_tot.csv',
        '../data/usa/us_2010_2022_pov_tot.csv',
        '../data/usa/us_2010_2022_trav_time.csv',
        '../data/usa/us_2015_2022_hs_tot.csv',
    ),
)

In [19]:
# Assemble the log-transformed regression table for one (year, observable) pair.
# NOTE(review): `year` and `observable` are not assigned in this cell, so they must
# already exist in the kernel namespace. Define them in a configuration cell so the
# notebook survives Restart Kernel -> Run All.
year = str(year)

# Columns of the base table whose name contains the year, plus the join key.
df_new_base = df_base.filter(like=year).copy()
df_new_base['City'] = df_base['City']

# The observable's table is looked up by name: the global `df_<observable>`.
df_observable = globals()[f'df_{observable}'].filter(like=year).copy()
df_observable['City'] = globals()[f'df_{observable}']['City']

df = df_new_base.merge(df_observable, how='inner', on='City')
df = df[['City', f'{year}_pop', f'{year}_area', f'{year}']].rename(
    columns={f'{year}_pop': 'Population', f'{year}_area': 'Area', f'{year}': 'Observable'}
)
df = df.dropna()

# Natural log of every numeric column (everything except 'City').
# - Rows with a non-positive value are dropped first: their log would be -inf/NaN,
#   which appears only after dropna() has already run and would break the fits.
# - The columns are replaced rather than written through .iloc, so integer input
#   columns become float without an in-place incompatible-dtype assignment.
log_cols = ['Population', 'Area', 'Observable']
raw_values = df[log_cols].astype(float)
is_positive = (raw_values > 0).all(axis=1)
df = df.loc[is_positive].copy()
df[log_cols] = np.log(raw_values.loc[is_positive])

In [21]:
# Fit a family of OLS models on the regression table `df` and estimate their
# out-of-sample error with repeated k-fold cross-validation.
# `df` must provide 'Population', 'Area' and 'Observable' columns (built by an
# earlier cell); this cell adds the second-order terms to it.
df['Interaction'] = df['Population'] * df['Area']
df['Population_sq'] = df['Population'] * df['Population']
df['Area_sq'] = df['Area'] * df['Area']

# --- Design matrices; sm.add_constant prepends an explicit intercept column ---
X_col = sm.add_constant(df['Population'])  # regressor for the Area ~ Population fit
X = sm.add_constant(df[['Population', 'Area']])
X_p = sm.add_constant(df['Population'])    # same columns as X_col, kept as a separate name
X_a = sm.add_constant(df['Area'])
X_i = sm.add_constant(df[['Population', 'Area', 'Interaction']])
X_f = sm.add_constant(df[['Population', 'Area', 'Interaction', 'Population_sq', 'Area_sq']])
y = df['Observable']

# --- statsmodels OLS: model objects ---
# ols_col regresses Area on Population (relation between the two regressors);
# every other model has the observable as its response.
ols_col = sm.OLS(df['Area'], X_col)
ols = sm.OLS(y.values, X)
ols_p = sm.OLS(y.values, X_p)
ols_a = sm.OLS(y.values, X_a)
ols_i = sm.OLS(y.values, X_i)
ols_f = sm.OLS(y.values, X_f)

# --- Fitted results ---
ols_result_col = ols_col.fit()
ols_result = ols.fit()
ols_result_p = ols_p.fit()
ols_result_a = ols_a.fit()
ols_result_i = ols_i.fit()
ols_result_f = ols_f.fit()

# --- In-sample predictions on the training design matrices ---
y_pred_col = ols_result_col.predict(X_col)
y_pred = ols_result.predict(X)
y_pred_p = ols_result_p.predict(X_p)
y_pred_a = ols_result_a.predict(X_a)
y_pred_i = ols_result_i.predict(X_i)
y_pred_f = ols_result_f.predict(X_f)

# --- Cross-validated RMSE (scikit-learn) ---
# The splitter is seeded: each cross_val_score call below asks `cv` for fresh
# splits, so without a seed the scores would change on every run and the models
# would be compared on different random folds.
CV_SEED = 0
model = LinearRegression()
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=CV_SEED)

# NOTE(review): the design matrices still carry the statsmodels 'const' column,
# while LinearRegression fits its own intercept by default — the column looks
# redundant for these calls; confirm before removing it.
mse_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
mse_scores_p = cross_val_score(model, X_p, y, scoring='neg_mean_squared_error', cv=cv)
mse_scores_a = cross_val_score(model, X_a, y, scoring='neg_mean_squared_error', cv=cv)
mse_scores_i = cross_val_score(model, X_i, y, scoring='neg_mean_squared_error', cv=cv)
mse_scores_f = cross_val_score(model, X_f, y, scoring='neg_mean_squared_error', cv=cv)

# The scorer returns negated MSE (higher is better), so flip the sign before the root.
rmse_scores = np.sqrt(-mse_scores)
rmse_scores_p = np.sqrt(-mse_scores_p)
rmse_scores_a = np.sqrt(-mse_scores_a)
rmse_scores_i = np.sqrt(-mse_scores_i)
rmse_scores_f = np.sqrt(-mse_scores_f)