In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy import stats
import json
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import joblib

In [6]:
# Display names of the regions analysed in this notebook, in a fixed order.
regions = [
    "New England",
    "Mid Atlantic",
    "South",
    "Midwest",
    "Southwest",
    "West",
]

In [11]:
# Read the six cleaned regional CSV files in a fixed order and bind each
# resulting DataFrame to its own name. Paths are relative to the notebook's
# working directory.
(
    new_england_df,
    mid_atlantic_df,
    south_df,
    midwest_df,
    southwest_df,
    west_df,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/cleaned_new_england_covid.csv",
        "../data/cleaned_mid_atlantic_covid.csv",
        "../data/cleaned_south_covid.csv",
        "../data/cleaned_midwest_covid.csv",
        "../data/cleaned_southwest_covid.csv",
        "../data/cleaned_west_covid.csv",
    )
]

In [14]:
# Load the per-region lasso coefficients from JSON.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) so the
# result does not depend on the platform's default locale encoding.
with open('../data/coefs_region.json', encoding='utf-8') as json_file:
    coefs = json.load(json_file)

In [17]:
# For each region keep only the feature names, i.e. the keys of that region's
# entry in `coefs`; the coefficient values themselves are dropped.
region_features = {
    region_name: list(feature_coefs.keys())
    for region_name, feature_coefs in coefs.items()
}

In [23]:
# Region display name -> that region's DataFrame. Built from ordered pairs so
# that iteration order (and therefore result order) is explicit.
dfs_region_map = dict(
    [
        ("New England", new_england_df),
        ("Mid Atlantic", mid_atlantic_df),
        ("South", south_df),
        ("Midwest", midwest_df),
        ("Southwest", southwest_df),
        ("West", west_df),
    ]
)

In [39]:
def get_and_plot_best_accuracies_logreg(dfs_region_map, coefs):
    """Fit one linear regression per region and return its held-out score.

    NOTE: despite its name, this function neither plots anything nor fits a
    logistic regression. It trains a ``LinearRegression`` per region, so the
    value reported for each region is ``LinearRegression.score`` on the test
    split (the coefficient of determination, R^2), not a classification
    accuracy. The name is kept so existing callers keep working.

    Parameters
    ----------
    dfs_region_map : dict
        Maps a region name to the DataFrame for that region. Each DataFrame
        must contain the selected feature columns and the target column
        ``critical_staffing_shortage_today_yes``.
    coefs : dict
        Maps a region name to the feature columns to use for that region.
        The value may be a list of column names or a mapping keyed by column
        name (only the keys are used).

    Returns
    -------
    dict
        Maps each region name in ``dfs_region_map`` to the test-set score of
        its fitted model. Empty if ``dfs_region_map`` is empty.

    Raises
    ------
    KeyError
        If a region is missing from ``coefs`` or a required column is missing
        from that region's DataFrame.
    """
    best_accs_region = {}
    for region, df in dfs_region_map.items():
        # list() accepts both a list of names and a name-keyed mapping.
        feature_names = list(coefs[region])
        X = df[feature_names]
        y = df["critical_staffing_shortage_today_yes"]

        # Fixed seed so the 80/20 split is reproducible across runs.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=1
        )

        # Fit the scaler on the training rows only, then apply the same
        # transform to the test rows, so no test statistics leak into training.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        LR_model = LinearRegression()
        LR_model.fit(X_train, y_train)
        best_accs_region[region] = LR_model.score(X_test, y_test)

    return best_accs_region

In [41]:
# Score every region. The `coefs` parameter receives the per-region feature
# name lists (`region_features`), not the raw coefficient values.
best_accs_region = get_and_plot_best_accuracies_logreg(
    dfs_region_map=dfs_region_map,
    coefs=region_features,
)

In [44]:
# Display the per-region test-set scores. These come from
# LinearRegression.score, i.e. R^2 values, not classification accuracies
# (the "accs" in the name is a leftover).
best_accs_region

{'New England': 0.8033521877317962,
 'Mid Atlantic': 0.8280235923928767,
 'South': 0.76655961777935,
 'Midwest': 0.8392429084363228,
 'Southwest': 0.9586491053431411,
 'West': 0.9374093818603041}