I want to figure out which machine learning model will best predict counties with high COVID fatality rates. I will train separate models for year one and year two. Hopefully the same model performs well on both years.

First, let's get all the data:

In [1]:
import sys
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import roc_auc_score

class config:
    USE_CACHE = True
    CACHE_DIR = "/Users/caseydurfee/msds/data_mining_final_project/cache"

from COVID_data import all_data
data = all_data.get_all_data(config)

Now, we'll get training data for year 1, with a min population of 50,000.

In [6]:
from COVID_data import prepare_model_data

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
import pprint

test_models = [RandomForestClassifier(), 
                SVC(), 
                LogisticRegression(max_iter=10000), 
                RidgeClassifierCV(), 
                AdaBoostClassifier(), 
                BaggingClassifier(),
                GradientBoostingClassifier(),
                LinearSVC(max_iter=10000)]

roc_auc_scorer = make_scorer(roc_auc_score)

ITERS = 5

for year in [1,2]:
    X, y = prepare_model_data.make_train_test(data, year=year, split=False)

    roc_scores = {}
    max_roc = 0.0
    best_model = None
    for throwaway in range(ITERS):
        for model in test_models:
            model_name = repr(model.__class__)

            mean_score = cross_val_score(model, X, y, scoring=roc_auc_scorer).mean()

            if model_name not in roc_scores:
                roc_scores[model_name] = 0.0
            roc_scores[model_name] += mean_score
            if mean_score > max_roc:
                best_model = model
                max_roc = mean_score

    sz = pd.Series(roc_scores)
    print(f">>>> mean scores for year {year}")
    print(sz / ITERS)

dropping ['DEATHS_FIRST_YEAR', 'DEATHS', 'DEATHS_SECOND_YEAR', 'DEATHS_ALPHA', 'DEATHS_DELTA', 'DEATHS_OMICRON', 'DEATH_RATE', 'DEATH_RATE_FIRST_YEAR', 'DEATH_RATE_SECOND_YEAR', 'DEATH_RATE_ALPHA', 'DEATH_RATE_DELTA', 'DEATH_RATE_OMICRON']
{"<class 'sklearn.ensemble._bagging.BaggingClassifier'>": 1.757034653498293,
 "<class 'sklearn.ensemble._forest.RandomForestClassifier'>": 1.7867964790319193,
 "<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>": 1.8428082605938505,
 "<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>": 1.9333207994196684,
 "<class 'sklearn.linear_model._logistic.LogisticRegression'>": 1.902638449039896,
 "<class 'sklearn.linear_model._ridge.RidgeClassifierCV'>": 1.8407350070782547,
 "<class 'sklearn.svm._classes.LinearSVC'>": 1.9090079938163913,
 "<class 'sklearn.svm._classes.SVC'>": 1.7657030622363483}


Looks like SVC is our big winner.

In [3]:
best_model.feature_importances_

AttributeError: 'AdaBoostClassifier' object has no attribute 'estimators_'

now, let's use permutation importance to figure out what really matters.

In [4]:
from sklearn.inspection import permutation_importance

result = permutation_importance(best_model, X, y, 
            n_repeats=100)

forest_importances = pd.Series(result.importances_mean, index=X.columns)

NotFittedError: This AdaBoostClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
disp = forest_importances.reindex(forest_importances.abs().sort_values().index).to_string()

print(disp)

it seems suspicious that the best model is the only one that tunes itself. Let's try grid search on SVC and see if we can up the success.

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

parameters = {
    'C': [2**x for x in range(3, 13)],
    'gamma': [2**x for x in range(-15,-5)]
}



grid = GridSearchCV(SVC(), parameters).fit(x_train, y_train)

In [None]:

scores = [x for x in grid.cv_results_["mean_test_score"]]
scores = np.array(scores).reshape(len(grid.param_grid["C"]), len(grid.param_grid["gamma"]))

print(scores)

In [None]:
final_model = SVC(C=2**9, gamma=2**-11).fit(x_train, y_train)
