In [27]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

df = pd.read_csv('multivariate_eng_df.csv', index_col = 'Date', parse_dates = True)
df.head()

Unnamed: 0_level_0,Unnamed: 0,STATE,PROVINCE_STATE_NAME,Completeness_pct,Administered_Dose1_Recip,Administered_Dose1_Recip_5Plus,Administered_Dose1_Recip_12Plus,Administered_Dose1_Recip_18Plus,Administered_Dose1_Recip_65Plus,Series_Complete_Yes,...,vaccination_rate_complete,booster_rate,prop_age04,prop_age59,prop_age6064,prop_age8084,prop_age85plus,AGE04_29_TOT,AGE30_59_TOT,AGE60PLUS_TOT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-01,3160,6,California,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.744418,6.165309,5.818187,1.71647,1.860398,13077768,15818572,8169914
2020-03-02,3161,6,California,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.744418,6.165309,5.818187,1.71647,1.860398,13077768,15818572,8169914
2020-03-03,3162,6,California,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.744418,6.165309,5.818187,1.71647,1.860398,13077768,15818572,8169914
2020-03-04,3163,6,California,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.744418,6.165309,5.818187,1.71647,1.860398,13077768,15818572,8169914
2020-03-05,3164,6,California,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.744418,6.165309,5.818187,1.71647,1.860398,13077768,15818572,8169914


In [28]:
#backwards selection
#separate target and features
X = df.drop(['PEOPLE_POSITIVE_NEW_CASES_COUNT','STATE', 'PROVINCE_STATE_NAME'] , axis=1)
y = df['PEOPLE_POSITIVE_NEW_CASES_COUNT']

#timeseries split for evaluation
tscv = TimeSeriesSplit(n_splits=5)  #adjust based on dataset size and timeframe

def backward_feature_selection(X, y, initial_features, tscv):                                                                                                                                                                       
    features = initial_features[:]
    best_features = features[:]
    best_score = float('inf')
    
    while len(features) > 0:
        print(f"Trying {len(features)} features")
        scores = []
        
        for train_index, test_index in tscv.split(X):
            X_train, X_test = X.iloc[train_index][features], X.iloc[test_index][features]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            
            model = XGBRegressor(n_estimators=75, learning_rate=0.5, objective='reg:squarederror')
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = mean_absolute_error(y_test, y_pred)
            scores.append(score)
                                            
        avg_score = np.mean(scores)
        print(f"Average MAE: {avg_score}")
        
        if avg_score < best_score:
            best_score = avg_score
            best_features = features[:]
            if len(features) > 1:
                # Remove the least important feature
                feature_importances = model.feature_importances_
                least_important = np.argmin(feature_importances)
                removed_feature = features.pop(least_important)
                print(f"Removed {removed_feature}, Continuing...")
            else:
                break
        else:
            print("No improvement, stopping.")
            break
            
    return best_features

initial_features = X.columns.tolist()
optimal_features = backward_feature_selection(X, y, initial_features, tscv)

print("Optimal number of features:", len(optimal_features))


Trying 178 features
Average MAE: 2479.2429117615507
Removed Booster_Doses_5Plus, Continuing...
Trying 177 features
Average MAE: 2479.2429117615507
No improvement, stopping.
Optimal number of features: 178


In [29]:
#forwards selection
def forward_feature_selection(X, y, tscv):
    remaining_features = X.columns.tolist()
    selected_features = []
    best_score = float('inf')
    
    while len(remaining_features) > 0:
        score_improvement = False
        best_feature_to_add = None
        
        for feature in remaining_features:
            trial_features = selected_features + [feature]
            print(f"Trying features: {trial_features}")
            scores = []
            
            for train_index, test_index in tscv.split(X):
                X_train, X_test = X.iloc[train_index][trial_features], X.iloc[test_index][trial_features]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                
                model = XGBRegressor(n_estimators=20, learning_rate=0.6, objective='reg:squarederror')
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                score = mean_absolute_error(y_test, y_pred)
                scores.append(score)
            
            avg_score = np.mean(scores)
            print(f"Average MAE with {feature}: {avg_score}")
            
            if avg_score < best_score:
                best_score = avg_score
                best_feature_to_add = feature
                score_improvement = True
        
        if score_improvement:
            selected_features.append(best_feature_to_add)
            remaining_features.remove(best_feature_to_add)
            print(f"Added {best_feature_to_add} to the model, Continuing...")
        else:
            print("No further improvement, stopping.")
            break
            
    return selected_features

optimal_features = forward_feature_selection(X, y, tscv)

print("Optimal features:", optimal_features)
print("Optimal number of features:", len(optimal_features))

Trying features: ['Unnamed: 0']
Average MAE with Unnamed: 0: 5297.8790776122305
Trying features: ['Completeness_pct']
Average MAE with Completeness_pct: 5676.434950636658
Trying features: ['Administered_Dose1_Recip']
Average MAE with Administered_Dose1_Recip: 5267.7333837630895
Trying features: ['Administered_Dose1_Recip_5Plus']
Average MAE with Administered_Dose1_Recip_5Plus: 6710.043825783772
Trying features: ['Administered_Dose1_Recip_12Plus']
Average MAE with Administered_Dose1_Recip_12Plus: 5174.264489004025
Trying features: ['Administered_Dose1_Recip_18Plus']
Average MAE with Administered_Dose1_Recip_18Plus: 5361.197387741692
Trying features: ['Administered_Dose1_Recip_65Plus']
Average MAE with Administered_Dose1_Recip_65Plus: 5651.062001543132
Trying features: ['Series_Complete_Yes']
Average MAE with Series_Complete_Yes: 5014.09559128596
Trying features: ['Series_Complete_5Plus']
Average MAE with Series_Complete_5Plus: 6326.889921393583
Trying features: ['Series_Complete_5to17']