# Lasso 
### Feature Importance

<hr>

In [1]:
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


# loading dataframe
df_state_demo = pd.read_csv('state_demographics.csv')

# regression target; every other numeric column is used as a feature
y_feat = "Ethnicities.Black Alone"

# only including int/float columns: the "State" label (and any other
# non-numeric column) cannot be scaled or passed to the regression
numeric_cols = df_state_demo.select_dtypes(include="number").columns
x_feat_list = [col for col in numeric_cols if col not in ("State", y_feat)]

# scale normalization: divide each feature by its sample standard deviation so
# the Lasso penalty is applied to coefficients on a comparable scale
# (features are scaled only, not centered)
for feat in x_feat_list:
    feat_std = df_state_demo[feat].std()
    if feat_std == 0:
        # constant column: dividing by 0 would fill it with NaN/inf and make
        # the later fit fail, so leave it unscaled
        continue
    df_state_demo[feat] = df_state_demo[feat] / feat_std

# setting the x and y values
x = df_state_demo.loc[:, x_feat_list].values
y = df_state_demo.loc[:, y_feat].values

# initialization of models
# alpha is the strength of the L1 penalty (larger values drive more
# coefficients to exactly zero); 0.1 is the value used for this analysis
lreg = linear_model.Lasso(alpha=0.1)
# shuffle rows before splitting; the fixed seed keeps the fold assignment,
# and therefore the reported MSE / R2, reproducible between runs
skfold = KFold(n_splits=20, shuffle=True, random_state=0)

# to store predictions (every index is filled exactly once, because the
# test folds of a K-fold split partition the rows)
y_pred = np.empty(y.shape)

# NOTE(review): the features were scaled using the standard deviation of the
# full dataset before this split, so the held-out folds are not completely
# unseen by the preprocessing step.

# cross validation
for train_idx, test_idx in skfold.split(x, y):
    # split into train and test sets
    x_train = x[train_idx, :]
    y_train = y[train_idx]
    x_test = x[test_idx, :]

    # fit Lasso on training set
    lreg.fit(x_train, y_train)

    # predict only the held-out rows, so each prediction is out-of-sample
    y_pred[test_idx] = lreg.predict(x_test)

# out-of-sample error metrics computed over all rows at once
mse = np.mean((y_pred - y) ** 2)
r2 = r2_score(y, y_pred)

#-----------------------------------------------------------------------------

# Refit on every row so the coefficients below are estimated from the
# whole dataset rather than from a single cross-validation fold
lreg.fit(x, y)

# fitted coefficients, in the same order as x_feat_list
lreg_coefs = list(lreg.coef_)

# feature -> coefficient, keeping only the features whose coefficient
# is not exactly 0
answer_dict = {
    feat_name: coef
    for coef, feat_name in zip(lreg_coefs, x_feat_list)
    if coef != 0
}

# cross-validated metrics computed earlier in the script
print(f"MSE: {mse}")
print(f"R2 Score: {r2}")

MSE: 6.148544453343199
R2 Score: 0.9452636692140831
