Predict Health Insurance Premiums

Dataset was retrieved from the Center for Disease Control and Prevention (CDC) website, specifically the National Health Interview Surveys (NHIS), Sample Adult Interview (https://www.cdc.gov/nchs/nhis/2022nhis.htm)

In [63]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
     

In [64]:
df = pd.read_csv('/Users/hp/Desktop/Database Systems/FINAL PROJECT/adult22csv/adult22.csv')
df

Unnamed: 0,URBRRL,RATCAT_A,INCTCFLG_A,IMPINCFLG_A,SHOTTYPE1_A,CEVOTELC_A,CEMMETNG_A,CEVOLUN2_A,CEVOLUN1_A,HITTEST_A,...,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_MON,RECTYPE,IMPNUM_A,WTFA_A,HHX,POVRATTC_A
0,2,7,0,0,,,,,,,...,,,1,1,1,10,1,4548.583,H059086,1.92
1,4,14,0,0,,,,,,,...,,,1,1,1,10,1,7087.431,H054049,10.30
2,4,14,0,0,,,,,,,...,,,1,1,1,10,1,8125.516,H055201,9.36
3,4,11,0,0,,,,,,,...,,,1,1,1,10,1,7837.390,H044893,3.66
4,1,2,0,1,,,,,,,...,2.0,1.0,3,1,1,10,1,10234.356,H043149,0.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27646,4,11,0,0,,1.0,2.0,2.0,2.0,,...,,,1,1,12,10,1,3599.659,H015345,3.54
27647,4,10,0,0,3.0,2.0,2.0,,1.0,2.0,...,,,1,1,12,10,1,3220.314,H061055,3.35
27648,4,13,0,0,1.0,1.0,2.0,2.0,2.0,2.0,...,,,1,1,12,10,1,3198.866,H031575,4.88
27649,4,8,0,0,,1.0,1.0,2.0,2.0,2.0,...,,,1,1,12,10,1,6207.867,H015667,2.30


In [65]:
#clean variables for linear regression: age, gender, body mass index, number of children, smoking habits, and geolocation
#reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9265373/

'''
SEX_A Sex of Sample Adult 
AGEP_A Age of SA (top coded)
HEIGHTTC_A Height without shoes (inches), Public Use
WEIGHTLBTC_A Weight without shoes (pounds), Public Use
BMICAT_A Categorical Body Mass Index, Public Use
PCNTKIDS_A Number of children in Sample Adult family, top-coded 3+
SMKNOW_A Smoking status, now
No geolocation data

variables that can be used for insurance premium
NOTCOV_A Insurance coverage status
HICOSTR1_A Out-of-pocket premium cost - plan 2 -> there are 14039 missing values in this variable. So I tried 
dropping NOTCOV_A==2 (not covered by insurance)
HICOSTR2_A Out-of-pocket premium cost - plan 2 -> don't use this variable; mostly missing
'''

data = df[['SEX_A', 'AGEP_A', 'HEIGHTTC_A', 'WEIGHTLBTC_A', 'BMICAT_A', 'PCNTKIDS_A', 'SMKNOW_A', 'HICOSTR1_A', 'NOTCOV_A']]
data

Unnamed: 0,SEX_A,AGEP_A,HEIGHTTC_A,WEIGHTLBTC_A,BMICAT_A,PCNTKIDS_A,SMKNOW_A,HICOSTR1_A,NOTCOV_A
0,1,85,68,148,2,0,3.0,,2
1,1,64,74,235,4,0,3.0,99999.0,2
2,2,37,69,218,4,0,,,2
3,2,72,64,240,4,0,,,2
4,2,84,66,183,3,0,,,2
...,...,...,...,...,...,...,...,...,...
27646,1,64,70,180,3,0,1.0,,1
27647,2,67,66,142,2,0,,,2
27648,2,69,60,125,2,0,,8316.0,2
27649,1,30,75,250,4,2,3.0,4608.0,2


In [66]:
#check missing data
data.isnull().sum()

SEX_A               0
AGEP_A              0
HEIGHTTC_A          0
WEIGHTLBTC_A        0
BMICAT_A            0
PCNTKIDS_A          0
SMKNOW_A        17754
HICOSTR1_A      14039
NOTCOV_A            0
dtype: int64

In [67]:
#SMKNOW_A was given to currents mokers. NaN values are non smokers
data['SMKNOW_A'].fillna(0, inplace=True)

#there are 14039 missing values in HICOSTR1_A. Drop people who are not covered by insurance
data['NOTCOV_A'].value_counts()
data = data[data['NOTCOV_A'] == 2]
data = data.reset_index()
data = data.drop(['NOTCOV_A'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [68]:
#SHOULD WE DROP PEOPLE WITHOUT HEALTH INSURANCE PREMIUM FOR SUPERVISED LEARNING? 
#create a new dataframe with only the rows with health insurance premium cost
drop_data = data.dropna(subset=['HICOSTR1_A'])
drop_data

#drop HICOSTR1_A == 99999 (People who don't know)
drop_data = drop_data[drop_data['HICOSTR1_A'] != 99999]
drop_data = drop_data.reset_index()
drop_data

Unnamed: 0,level_0,index,SEX_A,AGEP_A,HEIGHTTC_A,WEIGHTLBTC_A,BMICAT_A,PCNTKIDS_A,SMKNOW_A,HICOSTR1_A
0,8,8,2,42,68,997,9,0,3.0,6600.0
1,12,12,1,69,75,245,4,0,0.0,2600.0
2,17,18,1,27,67,160,3,0,0.0,3250.0
3,19,20,1,38,71,200,3,0,0.0,15600.0
4,21,22,1,46,71,173,2,2,0.0,5400.0
...,...,...,...,...,...,...,...,...,...,...
10468,25474,27638,1,42,69,200,3,1,3.0,936.0
10469,25478,27642,1,50,69,168,2,1,2.0,20400.0
10470,25482,27648,2,69,60,125,2,0,0.0,8316.0
10471,25483,27649,1,30,75,250,4,2,3.0,4608.0


In [69]:
# dividing dataset into train and test
x = drop_data[['SEX_A', 'AGEP_A', 'HEIGHTTC_A', 'WEIGHTLBTC_A', 'BMICAT_A', 'PCNTKIDS_A', 'SMKNOW_A']]
y = drop_data[['HICOSTR1_A']]

#rename columns
x.rename(columns={'SEX_A':'sex', 'AGEP_A':'age', 'HEIGHTTC_A':'height', 'WEIGHTLBTC_A':'weight', 'BMICAT_A':'bmi', 'PCNTKIDS_A':'children', 'SMKNOW_A':'smoke'}, inplace=True)
y.rename(columns={'HICOSTR1_A': 'premium'}, inplace=True)

# Split 20% with test_size=0.2
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [70]:
#Linear regression

model = LinearRegression()
model.fit(X_train, y_train)
train_pred = model.predict(X_train)

# calculate the accuracy of the model by computing the R2 score between predicted and real values
r2_train = metrics.r2_score(y_train, y_prediction)
print('R squared vale : ', r2_train)

R squared vale :  0.005569307322922157


In [71]:
# prediction on test data
test_pred =model.predict(X_test)

# R squared value
r2_test = metrics.r2_score(y_test, test_pred)
print('R squared vale : ', r2_test)

R squared vale :  0.0012517529192317411


In [None]:
#predictive system: if user inputs client's data used for prediction, returns predicted insurance cost
input_data = (2, 31,25.74,0,1,0)

# changing input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = regressor.predict(input_data_reshaped)
print(prediction)

print('The insurance cost is USD ', prediction[0])
     