In [1]:
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Root data directory (relative path); the cells below read from and write to its `processed/` subfolder.
DATA = Path('../data/')

In [2]:
# Load the four processed feature tables; every file is tab-separated.
# countyFIPS is parsed through `str` in the stationary table only.
stationary_features = pd.read_csv(
    DATA / 'processed/abridged_stationary_features.tsv',
    delimiter='\t',
    converters={'countyFIPS': str},
)
mobility_features = pd.read_csv(
    DATA / 'processed/mobility_time_varying_features.tsv',
    delimiter='\t',
)
cases_deaths_features = pd.read_csv(
    DATA / 'processed/abridged_time_varying_features.tsv',
    delimiter='\t',
)
lockdown_features = pd.read_csv(
    DATA / 'processed/abridged_time_based_features.tsv',
    delimiter='\t',
)

## Patterns in missing values

## Imputation 

### Stationary features imputation

In [3]:
# Drop columns which have too many missing values (more than MAX_MISSING_FRACTION of rows).
# The frame is reassigned instead of mutated in place so that re-running this cell is safe.
MAX_MISSING_FRACTION = 0.5
prop_missing = stationary_features.isnull().mean(axis=0)
stationary_features = stationary_features.drop(
    columns=prop_missing[prop_missing > MAX_MISSING_FRACTION].index
)

# Drop redundant lat lon column (these have missing values whereas POP ones don't).
# errors='ignore' keeps this from raising KeyError when lat/lon were already removed,
# either by the sparsity rule above or by an earlier run of this cell.
stationary_features = stationary_features.drop(columns=['lat', 'lon'], errors='ignore')

# Add an indicator if the value is missing.
# Indicators are built from the non-indicator columns only, so re-running the cell
# rebuilds them instead of stacking MISSING_MISSING_* columns.
# NOTE(review): assumes no raw feature name starts with 'MISSING_' - confirm against the input file.
value_columns = [col for col in stationary_features.columns if not col.startswith('MISSING_')]
missing_indicators = stationary_features[value_columns].isnull().add_prefix('MISSING_')
stationary_features = pd.concat([stationary_features[value_columns], missing_indicators], axis=1)

#### Median Imputation

In [4]:
# Median imputation: fill each missing numeric value with the median of its state,
# falling back to the overall column median when the state has no observed value.

# Medians are only defined for numeric columns; string columns (and the boolean
# MISSING_* indicators) are left untouched.
numeric_cols = stationary_features.select_dtypes(include='number').columns

# Calculate state-wise and general median (NaNs are skipped column by column).
# Both are kept as plain dicts: {state: {column: median}} and {column: median}.
state_median = stationary_features.groupby('StateName')[numeric_cols].median().T.to_dict()
# The general median uses every observed value of a column, not only rows that are
# complete in all columns - the complete-case subset can be tiny or even empty.
general_median = stationary_features[numeric_cols].median().to_dict()

# Row-aligned state medians: one row per county, holding the medians of that county's state.
# Entries are NaN where a value is missing for the whole state (or the state itself is missing).
state_median_per_row = stationary_features.groupby('StateName')[numeric_cols].transform('median')

stationary_features_median_imputed = stationary_features.copy()
stationary_features_median_imputed[numeric_cols] = (
    stationary_features[numeric_cols]
    .fillna(state_median_per_row)  # first choice: state median
    .fillna(general_median)        # fallback: only values still missing get the column median
)

stationary_features_median_imputed.to_csv(DATA/'processed/abridged_stationary_features_median_imputed.tsv',
                                          sep='\t', index=False)

### Time varying features imputation