In [1]:
# * * * * * * * * * * * * * * * * * * * * * *
# Author      : Robert Meza
# Cohort      : UC Berkeley ML/AI — March 2025
# Description : Capstone Project for CalPERS
# File        : capstone_2_of_2.ipynb
# * * * * * * * * * * * * * * * * * * * * * *

In [3]:
# <- ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !
# This is part two (capstone_2_of_2.ipynb), 
# the previous steps were completed in 
# part 1 (capstone_1_of_2.ipynb)
# <- ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !

In [4]:
# start

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import time
import warnings

from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.pipeline import Pipeline

warnings.filterwarnings("ignore")

In [8]:
# read the final dataset
df = pd.read_csv("data/final_data.csv")

In [11]:
# sample down for performance
df = df.iloc[:400000]

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   YEAR                       400000 non-null  int64 
 1   AGE                        400000 non-null  int64 
 2   CARRIER                    400000 non-null  object
 3   PREMIUM_MTHLY_AMT          400000 non-null  int64 
 4   DEP_COUNT                  400000 non-null  int64 
 5   HIRE_YEAR                  400000 non-null  int64 
 6   LOV_HLTH_PLAN_TYPE_CD_ASC  400000 non-null  int64 
 7   LOV_HLTH_PLAN_TYPE_CD_EPO  400000 non-null  int64 
 8   LOV_HLTH_PLAN_TYPE_CD_HMO  400000 non-null  int64 
 9   LOV_HLTH_PLAN_TYPE_CD_PPO  400000 non-null  int64 
dtypes: int64(9), object(1)
memory usage: 30.5+ MB


In [15]:
df.head()

Unnamed: 0,YEAR,AGE,CARRIER,PREMIUM_MTHLY_AMT,DEP_COUNT,HIRE_YEAR,LOV_HLTH_PLAN_TYPE_CD_ASC,LOV_HLTH_PLAN_TYPE_CD_EPO,LOV_HLTH_PLAN_TYPE_CD_HMO,LOV_HLTH_PLAN_TYPE_CD_PPO
0,2021,30,Anthem Blue Cross,2208,4,2018,0,0,0,1
1,2022,46,Blue Shield of California,1958,5,1999,1,0,0,0
2,2021,37,Anthem Blue Cross,1371,2,2011,0,0,0,1
3,2020,29,UnitedHealthcare Services Inc.,1454,1,2015,0,0,1,0
4,2024,40,UnitedHealthcare Services Inc.,2296,3,2018,0,0,1,0


In [17]:
# * * * * * * * * * * * * * * * * * * * * * *
# 5. Classification Modeling
# * * * * * * * * * * * * * * * * * * * * * *

In [19]:
target = "CARRIER"
X = df.drop(columns=[target])

le = LabelEncoder()
y = le.fit_transform(df[target])

In [21]:
# test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6625, stratify=y)

In [23]:
# scale the data after test train split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# build logistic regression, and decision tree
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier()
}

In [27]:
results = []
for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()
    
    train_time = end - start
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))
    
    results.append([name, train_time, train_acc, test_acc])

In [28]:
df_results = pd.DataFrame(results, columns=["Model", "Train Time (s)", "Train Accuracy", "Test Accuracy"])
print(df_results)

                 Model  Train Time (s)  Train Accuracy  Test Accuracy
0  Logistic Regression        7.228339        0.466353        0.46635
1        Decision Tree        0.515594        0.999956        0.99970


In [29]:
# attempt to improve model performance

In [30]:
models = {
    "Logistic Regression": Pipeline([
        ("s", StandardScaler()),
        ("lr", LogisticRegression(max_iter=1000, random_state=6625))
    ]),
    "Decision Tree": DecisionTreeClassifier(random_state=6625) 
}

In [31]:
results2 = []
for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()
    
    train_time = end - start
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))
    
    results2.append([name, train_time, train_acc, test_acc])

In [32]:
df_results2 = pd.DataFrame(results2, columns=["Model", "Train Time (s)", "Train Accuracy", "Test Accuracy"])
print(df_results2)

                 Model  Train Time (s)  Train Accuracy  Test Accuracy
0  Logistic Regression        4.579377        0.686306       0.686262
1        Decision Tree        0.487670        0.999956       0.999713


In [33]:
# finding --> in decision tree, there is a feature that can easily identify the carrier 

In [34]:
# continue to tune the logistic regression model and add randon Forest...

In [35]:
# end