In [1]:
import pandas as pd
# Allow up to 350 rows to be rendered before pandas truncates a displayed table.
pd.set_option('display.max_rows', 350)

In [2]:
# Load the county data table; the first CSV column becomes the row index.
county_csv = '../county-data.csv'
master = pd.read_csv(county_csv, index_col=0)

In [3]:
# Keep only the rows that report an opiate death count; that column is
# pulled out separately because it is the value the models are fit against.
has_target = master['CM_deaths_opiates'].notnull()
data = master[has_target]
deaths = master.loc[has_target, 'CM_deaths_opiates']

In [4]:
# Remove the target plus the other death-count / rate columns that look like
# data leaks, one-hot encode the state, then discard the raw county and
# state columns.
leak_columns = [
    'DM_deaths_drug', 'DM_crude_rate_drug', 'DM_age_adj_rate_drug',
    'CM_deaths_opiates', 'CM_crude_rate_opiates', 'CM_age_adj_rate_opiates',
]
state_dummies = pd.get_dummies(data.state)
data = (
    data
    .drop(leak_columns, axis=1)
    .join(state_dummies)
    .drop(['county', 'state'], axis=1)
)

# Fill the remaining gaps with each column's mean.
# NOTE(review): the means are taken over every row, before the train/test
# split further down -- confirm that is acceptable for the evaluation.
data = data.fillna(data.mean())

Unnamed: 0_level_0,housing_units,area_land,area_water,lat,long,votes_clinton,votes_trump,votes_total,pop,pop_male,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,22135,594.436,9.952,32.536382,-86.644490,5908.0,18110.0,24661.0,55416.0,26994.0,...,0,0,0,0,0,0,0,0,0,0
1003,104061,1589.784,437.527,30.659218,-87.746067,18409.0,72780.0,94090.0,208563.0,101241.0,...,0,0,0,0,0,0,0,0,0,0
1007,8981,622.582,3.587,33.015893,-87.127148,1874.0,6733.0,8748.0,22643.0,12145.0,...,0,0,0,0,0,0,0,0,0,0
1009,23887,644.776,5.852,33.977448,-86.567246,2150.0,22808.0,25384.0,57704.0,28498.0,...,0,0,0,0,0,0,0,0,0,0
1015,53289,605.868,6.419,33.771706,-85.822513,13197.0,32803.0,47376.0,114611.0,55138.0,...,0,0,0,0,0,0,0,0,0,0
1019,16267,553.700,46.278,34.069515,-85.654242,1524.0,8809.0,10503.0,25725.0,12769.0,...,0,0,0,0,0,0,0,0,0,0
1021,19278,692.854,7.948,32.854059,-86.726627,2909.0,15068.0,18255.0,43941.0,21605.0,...,0,0,0,0,0,0,0,0,0,0
1031,22330,678.972,1.523,31.402183,-85.989201,4194.0,15825.0,20513.0,51226.0,25319.0,...,0,0,0,0,0,0,0,0,0,0
1033,25758,592.619,29.510,34.703112,-87.801457,7296.0,16718.0,24626.0,54216.0,26158.0,...,0,0,0,0,0,0,0,0,0,0
1043,37054,734.841,20.180,34.131923,-86.869267,3730.0,32734.0,37278.0,82471.0,40731.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Number of rows that have a target value.
deaths.shape[0]

1755

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoLars, LassoLarsCV, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [7]:
# Hold out part of the data for a final check. No test_size is passed, so
# scikit-learn's default proportion applies; the fixed seed keeps the split
# repeatable.
split_parts = train_test_split(data, deaths, random_state=1337)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Alternative estimators, kept for reference (disabled).
#model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=1337)
#model = Ridge(random_state=1337)
#model = Lasso(random_state=1337, max_iter=50)
#model = ElasticNet(random_state=1337, max_iter=100000, alpha=0.1)

# Active estimator: LassoLarsCV, using cv=10 for its own internal
# cross-validation.
model = LassoLarsCV(cv=10, max_iter=100, n_jobs=-1)

# 10-fold cross-validated scores computed over the full data set
# (not just the training portion of the hold-out split).
cross_val_score(estimator=model, X=data, y=deaths, cv=10, n_jobs=-1)

In [9]:
# Fit the chosen estimator on the training portion only.
model.fit(X=X_train, y=y_train)

LassoLarsCV(copy_X=True, cv=10, eps=2.2204460492503131e-16,
      fit_intercept=True, max_iter=100, max_n_alphas=1000, n_jobs=-1,
      normalize=True, positive=False, precompute='auto', verbose=False)

In [10]:
# Predictions for the held-out rows, used by the metric cells that follow.
pred = model.predict(X=X_test)

In [11]:
# R^2 of the hold-out predictions against the true values.
r2_score(y_true=y_test, y_pred=pred)

0.76781893182509686

In [12]:
# Mean squared error of the hold-out predictions.
mean_squared_error(y_true=y_test, y_pred=pred)

26378.579887216631

In [13]:
# Mean absolute error of the hold-out predictions.
mean_absolute_error(y_true=y_test, y_pred=pred)

56.088830825327207

In [14]:
# Pair each feature name with its fitted weight, keeping only the
# features whose coefficient was not set to exactly zero.
coeffs = [
    (feature, weight)
    for feature, weight in zip(data.columns, model.coef_)
    if weight != 0
]

In [15]:
# Copied from https://pandas.pydata.org/pandas-docs/stable/style.html
def color_negative_red(val):
    """
    Choose a CSS text color for one scalar cell value.

    Returns the string `'color: red'` when `val` is below zero and
    `'color: black'` for any other value.
    """
    if val < 0:
        shade = 'red'
    else:
        shade = 'black'
    return 'color: ' + shade

# Rank the surviving coefficients from largest to smallest, index the table
# by variable name, and render it with negative weights shown in red.
ranked = sorted(coeffs, key=lambda pair: pair[1], reverse=True)
df = pd.DataFrame(ranked, columns=['variable', 'coefficient']).set_index('variable')
df.style.applymap(color_negative_red)

Unnamed: 0_level_0,coefficient
variable,Unnamed: 1_level_1
MD,195.865
NV,109.129
CT,109.051
MA,59.3665
NM,47.5064
NC,46.0092
AZ,41.9551
UT,41.3255
RI,21.7171
WY,17.9562
