In [9]:
import pandas as pd
import statsmodels.formula.api as sm  # this library will be used to demonstrate Indicator features.

# The medical costs dataset "insurance.csv" can be downloaded from Github.com and loaded into a Pandas DataFrame using this codeline. 
med_cost_df = pd.read_csv("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv")
# Alternatively, the medical costs dataset can be downloaded from the video lecture as a resource, moved/copied into your
# Python directory and loaded into a Pandas DataFrame with this code
# med_cost_DF = pd.read_csv("insurance_data.csv")
print(med_cost_df)

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]


In [10]:
med_cost_df2 = pd.get_dummies(med_cost_df, columns=['region'], dtype=bool)
print(med_cost_df2)

      age     sex     bmi  children smoker      charges  region_northeast  \
0      19  female  27.900         0    yes  16884.92400             False   
1      18    male  33.770         1     no   1725.55230             False   
2      28    male  33.000         3     no   4449.46200             False   
3      33    male  22.705         0     no  21984.47061             False   
4      32    male  28.880         0     no   3866.85520             False   
...   ...     ...     ...       ...    ...          ...               ...   
1333   50    male  30.970         3     no  10600.54830             False   
1334   18  female  31.920         0     no   2205.98080              True   
1335   18  female  36.850         0     no   1629.83350             False   
1336   21  female  25.800         0     no   2007.94500             False   
1337   61  female  29.070         0    yes  29141.36030             False   

      region_northwest  region_southeast  region_southwest  
0             

In [11]:
# instead of true and false, 1 and 0 integers are more useful as they work better with standard models allowing
# for calculation with parameters and weights, yielding 0*value=0 and 1*value=value.
med_cost_df3 = pd.get_dummies(med_cost_df, columns=['region'], dtype=int)
print(med_cost_df3)

      age     sex     bmi  children smoker      charges  region_northeast  \
0      19  female  27.900         0    yes  16884.92400                 0   
1      18    male  33.770         1     no   1725.55230                 0   
2      28    male  33.000         3     no   4449.46200                 0   
3      33    male  22.705         0     no  21984.47061                 0   
4      32    male  28.880         0     no   3866.85520                 0   
...   ...     ...     ...       ...    ...          ...               ...   
1333   50    male  30.970         3     no  10600.54830                 0   
1334   18  female  31.920         0     no   2205.98080                 1   
1335   18  female  36.850         0     no   1629.83350                 0   
1336   21  female  25.800         0     no   2007.94500                 0   
1337   61  female  29.070         0    yes  29141.36030                 0   

      region_northwest  region_southeast  region_southwest  
0             

In [12]:
# Now that indicators are integers we can apply statistics against indicator features

regression_model = sm.ols(formula='charges ~ region_northeast + region_northwest + region_southeast + region_southwest', data=med_cost_df3).fit()
print(regression_model.params)
print(regression_model.summary())

Intercept           1.478242e+15
region_northeast   -1.478242e+15
region_northwest   -1.478242e+15
region_southeast   -1.478242e+15
region_southwest   -1.478242e+15
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.217
Date:                Sun, 21 Sep 2025   Prob (F-statistic):             0.0651
Time:                        16:08:16   Log-Likelihood:                -14473.
No. Observations:                1338   AIC:                         2.896e+04
Df Residuals:                    1333   BIC:                         2.898e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                       coef   

In [13]:
med_cost_df4 = med_cost_df3.loc[med_cost_df3['smoker'] == 'yes']

regression_model2 = sm.ols(formula='charges ~ region_northeast + region_northwest + region_southeast + region_southwest', data=med_cost_df4).fit()
print(regression_model2.params)
print(regression_model2.summary())

Intercept           25395.919994
region_northeast     4277.616478
region_northwest     4796.083188
region_southeast     9449.076829
region_southwest     6873.143499
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.025
Method:                 Least Squares   F-statistic:                     3.316
Date:                Sun, 21 Sep 2025   Prob (F-statistic):             0.0205
Time:                        16:08:21   Log-Likelihood:                -2946.2
No. Observations:                 274   AIC:                             5900.
Df Residuals:                     270   BIC:                             5915.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef   

In [15]:
med_cost_df5 = med_cost_df3.loc[med_cost_df3['smoker'] == 'no']

regression_model3 = sm.ols(formula='charges ~ region_northeast + region_northwest + region_southeast + region_southwest', data=med_cost_df5).fit()
print(regression_model3.params)
print(regression_model3.summary())

Intercept           6754.699242
region_northeast    2410.832430
region_northwest    1801.764473
region_southeast    1277.517067
region_southwest    1264.585271
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     2.155
Date:                Sun, 21 Sep 2025   Prob (F-statistic):             0.0917
Time:                        16:16:23   Log-Likelihood:                -10761.
No. Observations:                1064   AIC:                         2.153e+04
Df Residuals:                    1060   BIC:                         2.155e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std 

In [16]:
med_cost_df6 = pd.get_dummies(med_cost_df, columns=['smoker'], dtype=int)
print(med_cost_df6)

      age     sex     bmi  children     region      charges  smoker_no  \
0      19  female  27.900         0  southwest  16884.92400          0   
1      18    male  33.770         1  southeast   1725.55230          1   
2      28    male  33.000         3  southeast   4449.46200          1   
3      33    male  22.705         0  northwest  21984.47061          1   
4      32    male  28.880         0  northwest   3866.85520          1   
...   ...     ...     ...       ...        ...          ...        ...   
1333   50    male  30.970         3  northwest  10600.54830          1   
1334   18  female  31.920         0  northeast   2205.98080          1   
1335   18  female  36.850         0  southeast   1629.83350          1   
1336   21  female  25.800         0  southwest   2007.94500          1   
1337   61  female  29.070         0  northwest  29141.36030          0   

      smoker_yes  
0              1  
1              0  
2              0  
3              0  
4              0

In [18]:
regression_model4 = sm.ols(formula='charges ~ smoker_yes', data=med_cost_df6).fit()
print(regression_model4.params)
print(regression_model4.summary())

Intercept      8434.268298
smoker_yes    23615.963534
dtype: float64
                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.620
Model:                            OLS   Adj. R-squared:                  0.619
Method:                 Least Squares   F-statistic:                     2178.
Date:                Sun, 21 Sep 2025   Prob (F-statistic):          8.27e-283
Time:                        16:23:19   Log-Likelihood:                -13831.
No. Observations:                1338   AIC:                         2.767e+04
Df Residuals:                    1336   BIC:                         2.768e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------