In [62]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV, ElasticNetCV, ElasticNet
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error, SCORERS

In [3]:
df = pd.read_csv('./data/medicare_partd_20162020_filled_generic.csv')

 #### Try a simple model with fewer attributes

In [4]:
X=df[['Avg_Spnd_Per_Bene_2016','Avg_Spnd_Per_Bene_2017','Avg_Spnd_Per_Bene_2018','Avg_Spnd_Per_Bene_2019']]
y=df['Avg_Spnd_Per_Bene_2020']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=20)

In [6]:
lr_simp = LinearRegression()

In [7]:
lr_simp.fit(X_train, y_train)

LinearRegression()

In [8]:
simp_preds = lr_simp.predict(X_test)

In [9]:
lr_simp.score(X_test, y_test), lr_simp.score(X_train, y_train)

(0.9589527678842404, 0.9486655000331241)

In [10]:
mean_squared_error(y_test, simp_preds, squared=False)

9493.950303374058

#### add in generic status

In [11]:
X=df[['Avg_Spnd_Per_Bene_2016','Avg_Spnd_Per_Bene_2017','Avg_Spnd_Per_Bene_2018','Avg_Spnd_Per_Bene_2019', 'generic']]
y=df['Avg_Spnd_Per_Bene_2020']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=20)

In [13]:
lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression()

In [14]:
preds = lr.predict(X_test)

In [15]:
lr.score(X_test, y_test), lr.score(X_train, y_train)

(0.9588795427354823, 0.9487997636573491)

In [16]:
mean_squared_error(y_test, preds, squared=False)

9502.414773686553

Adding in the generic flag caused the model to perform more poorly

#### add more features back in, but exclude the non-numeric features

In [17]:
X=df.drop(columns=['Avg_Spnd_Per_Bene_2020','Brnd_Name', 'Gnrc_Name','Mftr_Name'])
y=df['Avg_Spnd_Per_Bene_2020']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=20)

In [19]:
lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression()

In [20]:
preds = lr.predict(X_test)

In [21]:
lr.score(X_test, y_test), lr.score(X_train, y_train)

(0.9689957303940668, 0.9681843407559321)

In [22]:
mean_squared_error(y_test, preds, squared=False)

8251.174422497505

This model is performing better than the others

In [23]:
pd.DataFrame(list(zip(X.columns,lr.coef_)), columns=['Factor', 'Weight']).sort_values(by='Weight')

Unnamed: 0,Factor,Weight
16,Outlier_Flag_2017,-1611.068
39,Outlier_Flag_2020,-1288.807
42,generic,-512.4748
41,CAGR_Avg_Spnd_Per_Dsg_Unt_16_20,-96.66333
8,Outlier_Flag_2016,-79.17036
40,Chg_Avg_Spnd_Per_Dsg_Unt_19_20,-48.86394
29,Avg_Spnd_Per_Dsg_Unt_Wghtd_2019,-3.601446
22,Avg_Spnd_Per_Clm_2018,-1.232836
30,Avg_Spnd_Per_Clm_2019,-1.227964
5,Avg_Spnd_Per_Dsg_Unt_Wghtd_2016,-1.226884


#### regularization

In [29]:
lasso = Lasso(alpha=.01)

In [30]:
lasso.fit(X=X_train, y=y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=0.01)

In [31]:
lasso.score(X_test, y_test)

0.9689970558758515

In [32]:
preds_lasso= lasso.predict(X_test)

In [33]:
mean_squared_error(y_test, preds_lasso, squared=False)

8250.99804520445

The lasso model is performing almost exactly as well as the regular linear regression model

In [34]:
lasso.

<bound method BaseEstimator.get_params of Lasso(alpha=0.01)>

In [35]:
lasso_cv = LassoCV(cv=10).fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [39]:
lasso_cv.alpha_

766503303.9971256

In [40]:
lasso_better = Lasso(alpha=766503303.9971256)

In [41]:
lasso_better.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=766503303.9971256)

In [42]:
lasso_better.score(X_test, y_test)

0.7390438152273723

In [43]:
preds_better = lasso_better.predict(X_test)

In [44]:
mean_squared_error(y_test, preds_better, squared=False)

23938.052359442234

In [45]:
en = ElasticNet()

In [46]:
en.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


ElasticNet()

In [47]:
en.score(X_test, y_test)

0.9689996110634017

In [49]:
mean_squared_error(y_test, en.predict(X_test), squared=False)

8250.658024562947

#### Grid Search

In [56]:
params = {'alpha':( .01, .1, 1, 10)}

In [73]:
gs = GridSearchCV(
    Lasso(),
    param_grid=params,
    scoring='neg_root_mean_squared_error'
)

In [74]:
gs.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(estimator=Lasso(), param_grid={'alpha': (0.01, 0.1, 1, 10)},
             scoring='neg_root_mean_squared_error')

In [76]:
gs.best_params_

{'alpha': 1}

In [77]:
lasso_gs = Lasso(alpha=1)
lasso_gs.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=1)

In [71]:
lasso_gs.score(X_test, y_test), lasso_gs.score(X_train, y_train)

(0.9690014023824276, 0.968184142396436)

In [72]:
mean_squared_error(y_test, lasso_gs.predict(X_test), squared= False)

8250.419644098803

In [None]:
sorted(SCORERS.keys())

In [79]:
params2 = {'alpha':( .01, .1, 1, 10),
           'l1_ratio':(0, .5, 1)}
           

In [80]:
gs2 = GridSearchCV(
    ElasticNet(),
    param_grid = params2,
    n_jobs=-1
)

In [81]:
gs2.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

GridSearchCV(estimator=ElasticNet(), n_jobs=-1,
             param_grid={'alpha': (0.01, 0.1, 1, 10), 'l1_ratio': (0, 0.5, 1)})

In [84]:
gs2.best_params_

{'alpha': 0.01, 'l1_ratio': 0}

In [85]:
en_best = ElasticNet(alpha=.01, l1_ratio=0)

In [86]:
en_best.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


ElasticNet(alpha=0.01, l1_ratio=0)

In [87]:
en_best.score(X_train, y_train), en_best.score(X_test, y_test)

(0.9681788811877671, 0.9690135459334523)

In [89]:
mean_squared_error(y_test, en_best.predict(X_test), squared=False)

8248.803454759212

### Try this with encoding

In [96]:
X=df.drop(columns = ['Avg_Spnd_Per_Bene_2020','Avg_Spnd_Per_Clm_2020','Avg_Spnd_Per_Dsg_Unt_Wghtd_2020','Tot_Spndng_2020'])
y=df['Avg_Spnd_Per_Bene_2020']

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=20)

In [99]:
transf = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', sparse=False),['Brnd_Name', 'Gnrc_Name','Mftr_Name']),
    remainder='passthrough',
    n_jobs=-1,
    verbose_feature_names_out=False
)

In [100]:
transf.fit(X_train)

ColumnTransformer(n_jobs=-1, remainder='passthrough',
                  transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['Brnd_Name', 'Gnrc_Name', 'Mftr_Name'])],
                  verbose_feature_names_out=False)

In [103]:
enc_column_names = transf.get_feature_names_out()
enc_column_names

array(['Brnd_Name_1st Tier Unifine Pentips',
       'Brnd_Name_1st Tier Unifine Pentips Plus', 'Brnd_Name_Abacavir',
       ..., 'Chg_Avg_Spnd_Per_Dsg_Unt_19_20',
       'CAGR_Avg_Spnd_Per_Dsg_Unt_16_20', 'generic'], dtype=object)

In [104]:
X_train_enc = pd.DataFrame(transf.transform(X_train), columns= enc_column_names)
X_test_enc = pd.DataFrame(transf.transform(X_test), columns= enc_column_names)

In [106]:
gs2.fit(X_train_enc, y_train) #this overwhelmed my computer so I forced it to stop

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

KeyboardInterrupt: 

### this is where I ran out of time in the hackathon

In [None]:
gs2.best_params_

In [None]:
en_allfeats = ElasticNet()