### Goal: Perform lasso regression on the scaled data frame using the agregated income data, the behavioral data, and the combination of these.


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [5]:
MM_df = pd.read_csv("../data/MaternalMortalityDF_scaled.csv")

In [6]:
MM_df

Unnamed: 0,State,Year,Maternal_Mortality_Rate,"$10,000 to $14,999","$100,000 to $149,999","$15,000 to $24,999","$150,000 to $199,999","$200,000 or more","$25,000 to $34,999","$35,000 to $49,999",...,UnknownInitialCare,NotReportedInitialCare,20-27Weeks,28-31Weeks,32-35Weeks,36Weeks,37-39Weeks,40Weeks,41Weeks,42PlusWeeks
0,Arizona,2011,17.535041,0.050941,0.120274,0.113698,0.039829,0.035108,0.109472,0.148437,...,0.000000,1.0,0.006383,0.007224,0.039992,0.039208,0.638813,0.211309,0.052395,0.003752
1,Arizona,2012,28.921461,0.049908,0.126118,0.108871,0.041107,0.036075,0.109403,0.148753,...,0.000000,1.0,0.005553,0.007959,0.040212,0.038061,0.641189,0.211300,0.051376,0.003135
2,Arizona,2013,26.869159,0.048638,0.124151,0.108188,0.044261,0.039363,0.107901,0.146377,...,0.000000,1.0,0.005105,0.007488,0.040070,0.037593,0.638633,0.211063,0.055923,0.003049
3,Arizona,2014,27.622084,0.048948,0.128419,0.100804,0.044806,0.044947,0.106101,0.144143,...,0.018725,0.0,0.005455,0.007642,0.039592,0.036899,0.637403,0.206245,0.062012,0.003338
4,Arizona,2015,23.432649,0.042673,0.131472,0.098472,0.050628,0.046791,0.103092,0.141929,...,0.019777,0.0,0.004874,0.007862,0.039707,0.037656,0.640590,0.203641,0.060890,0.003351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Washington,2015,12.360939,0.033038,0.169348,0.071090,0.071309,0.075715,0.081155,0.123342,...,0.048309,0.0,0.004989,0.007675,0.036903,0.031206,0.560850,0.241982,0.107529,0.007799
167,Washington,2016,13.258936,0.033063,0.177253,0.068122,0.077832,0.084730,0.073801,0.118044,...,0.048417,0.0,0.004044,0.007281,0.037125,0.032816,0.566996,0.237158,0.106160,0.007491
168,Washington,2017,27.409150,0.030409,0.184129,0.063782,0.084553,0.094392,0.069118,0.110217,...,0.058359,0.0,0.004123,0.006921,0.038122,0.034421,0.581097,0.226034,0.101482,0.006395
169,Washington,2018,24.394494,0.026178,0.186673,0.061550,0.092450,0.106915,0.065511,0.109749,...,0.077726,0.0,0.004728,0.006819,0.036975,0.034199,0.588976,0.224987,0.096068,0.005483


Aggrigate Income Data:

In [7]:
Agg_MM = True
if Agg_MM:
    MM_df['sub_75k_income'] = MM_df["Less than $10,000"]+ MM_df["$10,000 to $14,999"]+MM_df["$15,000 to $24,999"]+MM_df["$25,000 to $34,999"]+MM_df["$35,000 to $49,999"]+MM_df["$50,000 to $74,999"]
    MM_df['between_75k_150k_income'] = MM_df['$75,000 to $99,999']+MM_df["$100,000 to $149,999"]
    MM_df['above_150k_income'] = MM_df['$150,000 to $199,999']+MM_df['$200,000 or more']

Set predictors and desired output variable:

In [8]:
x = MM_df[['sub_75k_income',	'between_75k_150k_income',	'above_150k_income']]
y = MM_df['Maternal_Mortality_Rate']

Perform a train test split:

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

In [12]:
x.describe()

Unnamed: 0,sub_75k_income,between_75k_150k_income,above_150k_income
count,171.0,171.0,171.0
mean,0.604384,0.272257,0.12337
std,0.078967,0.029583,0.052394
min,0.401168,0.20015,0.047882
25%,0.559625,0.249544,0.083896
50%,0.615257,0.273129,0.111248
75%,0.667547,0.293484,0.150131
max,0.752493,0.331787,0.282649


Perform lasso regression:

In [13]:
lasso = Lasso(alpha=0.1)

In [14]:
lasso.fit(xtrain, ytrain)


In [15]:
from sklearn.metrics import mean_squared_error

In [16]:
ypred = lasso.predict(xtest)

In [17]:
mse = mean_squared_error(ytest, ypred)

print(f"The Lasso Regression Mean Squared Error: {mse}")

The Lasso Regression Mean Squared Error: 164.96356889626688


This is only slightly better than the mean squared error for linear regression.

### Let's continue with ridge regression on the scaled behavioral data.

There seem to be some values in the behavioral data that are showing up as NaN. So I will first remove those rows.

In [18]:
cleaned_MM_df = MM_df.dropna()

In [19]:
cleaned_MM_df

Unnamed: 0,State,Year,Maternal_Mortality_Rate,"$10,000 to $14,999","$100,000 to $149,999","$15,000 to $24,999","$150,000 to $199,999","$200,000 or more","$25,000 to $34,999","$35,000 to $49,999",...,28-31Weeks,32-35Weeks,36Weeks,37-39Weeks,40Weeks,41Weeks,42PlusWeeks,sub_75k_income,between_75k_150k_income,above_150k_income
0,Arizona,2011,17.535041,0.050941,0.120274,0.113698,0.039829,0.035108,0.109472,0.148437,...,0.007224,0.039992,0.039208,0.638813,0.211309,0.052395,0.003752,0.685861,0.240198,0.074937
1,Arizona,2012,28.921461,0.049908,0.126118,0.108871,0.041107,0.036075,0.109403,0.148753,...,0.007959,0.040212,0.038061,0.641189,0.211300,0.051376,0.003135,0.677035,0.245300,0.077182
2,Arizona,2013,26.869159,0.048638,0.124151,0.108188,0.044261,0.039363,0.107901,0.146377,...,0.007488,0.040070,0.037593,0.638633,0.211063,0.055923,0.003049,0.669731,0.246781,0.083624
3,Arizona,2014,27.622084,0.048948,0.128419,0.100804,0.044806,0.044947,0.106101,0.144143,...,0.007642,0.039592,0.036899,0.637403,0.206245,0.062012,0.003338,0.658680,0.251632,0.089753
4,Arizona,2015,23.432649,0.042673,0.131472,0.098472,0.050628,0.046791,0.103092,0.141929,...,0.007862,0.039707,0.037656,0.640590,0.203641,0.060890,0.003351,0.644614,0.257747,0.097419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,Washington,2015,12.360939,0.033038,0.169348,0.071090,0.071309,0.075715,0.081155,0.123342,...,0.007675,0.036903,0.031206,0.560850,0.241982,0.107529,0.007799,0.540799,0.312515,0.147025
167,Washington,2016,13.258936,0.033063,0.177253,0.068122,0.077832,0.084730,0.073801,0.118044,...,0.007281,0.037125,0.032816,0.566996,0.237158,0.106160,0.007491,0.518502,0.318333,0.162562
168,Washington,2017,27.409150,0.030409,0.184129,0.063782,0.084553,0.094392,0.069118,0.110217,...,0.006921,0.038122,0.034421,0.581097,0.226034,0.101482,0.006395,0.495483,0.324429,0.178944
169,Washington,2018,24.394494,0.026178,0.186673,0.061550,0.092450,0.106915,0.065511,0.109749,...,0.006819,0.036975,0.034199,0.588976,0.224987,0.096068,0.005483,0.476336,0.324416,0.199364


In [20]:
x = cleaned_MM_df[['Percent_Overweight',	'Percent_Obese',	'Percent_Overweight_or_Obese',	'Percent_No_Activity']]
y = cleaned_MM_df['Maternal_Mortality_Rate']

In [21]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [22]:
x.describe()

Unnamed: 0,Percent_Overweight,Percent_Obese,Percent_Overweight_or_Obese,Percent_No_Activity
count,170.0,170.0,170.0,170.0
mean,29.519412,30.413529,59.932941,27.811176
std,1.122302,3.805399,3.734008,3.691433
min,25.9,21.8,52.3,18.0
25%,28.9,27.6,57.5,25.825
50%,29.6,30.7,60.3,27.95
75%,30.2,33.1,62.5,29.85
max,33.0,39.4,68.8,37.9


In [23]:
lasso = Lasso(alpha=0.1)

In [24]:
lasso.fit(xtrain, ytrain)

In [25]:
ypred = lasso.predict(xtest)

In [26]:
mse = mean_squared_error(ytest, ypred)

print(f"The Lasso Regression Mean Squared Error: {mse}")

The Lasso Regression Mean Squared Error: 110.71393271801271


This is a good mean squared error compared to that found using the income data.

### Let's perform ridge regression using both the behavioral data and the aggregated income data.

In [27]:
x = cleaned_MM_df[['Percent_Overweight',	'Percent_Obese',	'Percent_Overweight_or_Obese',	'Percent_No_Activity', 'sub_75k_income',	'between_75k_150k_income',	'above_150k_income']]
y = cleaned_MM_df['Maternal_Mortality_Rate']

In [28]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [29]:
lasso = Lasso(alpha=0.1)

In [30]:
lasso.fit(xtrain, ytrain)

In [31]:
ypred = lasso.predict(xtest)

In [32]:
mse = mean_squared_error(ytest, ypred)

print(f"The Lasso Regression Mean Squared Error: {mse}")

The Lasso Regression Mean Squared Error: 110.71393271801271


Let's examine the natality data.

In [33]:
cleaned_MM_df.columns

Index(['State', 'Year', 'Maternal_Mortality_Rate', '$10,000 to $14,999',
       '$100,000 to $149,999', '$15,000 to $24,999', '$150,000 to $199,999',
       '$200,000 or more', '$25,000 to $34,999', '$35,000 to $49,999',
       '$50,000 to $74,999', '$75,000 to $99,999', 'Less than $10,000',
       'Percent_Overweight', 'Percent_Obese', 'Percent_Overweight_or_Obese',
       'Percent_No_Activity', 'Births', 'ChronicHypertensionBirths',
       'DiabetesBirths', 'EnclampsiaBirths',
       'Pregnancy-AssociatedHypertensionBirths', 'TobaccoUseBirths',
       'HospitalBirths', 'ResidenceBirths', 'MD_Delivery', 'DO_Delivery',
       'CNM_Delivery', 'Other_Delivery_Attendant', 'Vaginal_Delivery',
       'Cesarean_Delivery', 'Married', 'Unmarried', 'NoPrenatalCare',
       '1stMonthInitialCare', '2ndMonthInitialCare', '3rdMonthInitialCare',
       '4thMonthInitialCare', '5thMonthInitialCare', '6thMonthInitialCare',
       '7thMonthInitialCare', '8thMonthInitialCare', '9thMonthInitialCare',
    