In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
df = pd.read_csv('../HealthInsurancePredictor/data/customer_test_masked.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,custid,sex,is_employed,income,marital_status,health_ins,housing_type,num_vehicles,age,state_of_res,code_column,gas_usage,rooms,recent_move_b
0,4523,001115999_01,Male,,28900.0,Married,,Homeowner free and clear,1.0,82,Arkansas,653,20.0,6,F
1,58780,000566299_01,Male,True,40000.0,Never married,,Rented,1.0,40,New Mexico,404,40.0,5,T
2,18628,001397329_01,Female,True,203000.0,Married,,Homeowner with mortgage/loan,3.0,54,Colorado,1291,80.0,2,F
3,11525,000843100_01,Female,,0.0,Married,,Homeowner free and clear,1.0,64,California,8962,30.0,2,F
4,56266,000260071_03,Male,True,40000.0,Married,,Homeowner with mortgage/loan,4.0,35,New Jersey,2059,150.0,1,F


In [3]:
# Drop unnecessary columns
df = df.drop(columns=['Unnamed: 0', 'code_column'], errors='ignore')

df.set_index('custid', inplace=True)

df.head()

Unnamed: 0_level_0,sex,is_employed,income,marital_status,health_ins,housing_type,num_vehicles,age,state_of_res,gas_usage,rooms,recent_move_b
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
001115999_01,Male,,28900.0,Married,,Homeowner free and clear,1.0,82,Arkansas,20.0,6,F
000566299_01,Male,True,40000.0,Never married,,Rented,1.0,40,New Mexico,40.0,5,T
001397329_01,Female,True,203000.0,Married,,Homeowner with mortgage/loan,3.0,54,Colorado,80.0,2,F
000843100_01,Female,,0.0,Married,,Homeowner free and clear,1.0,64,California,30.0,2,F
000260071_03,Male,True,40000.0,Married,,Homeowner with mortgage/loan,4.0,35,New Jersey,150.0,1,F


In [4]:
# Print the number of null values for each column
print("\nNull values per column:")
print(df.isnull().sum())


Null values per column:
sex                 0
is_employed       259
income              0
marital_status      0
health_ins        804
housing_type       34
num_vehicles       34
age                 0
state_of_res        0
gas_usage          34
rooms               0
recent_move_b      34
dtype: int64


In [5]:
# Check for missing rows in the four columns
missing_housing = df['housing_type'].isnull()
missing_vehicles = df['num_vehicles'].isnull()
missing_gas = df['gas_usage'].isnull()
missing_recent_move = df['recent_move_b'].isnull()

# Combine the masks using logical AND
combined_missing = missing_housing & missing_vehicles & missing_gas & missing_recent_move

# Verify if the counts match and print results
print("Total missing in housing_type:", missing_housing.sum())
print("Total missing in num_vehicles:", missing_vehicles.sum())
print("Total missing in gas_usage:", missing_gas.sum())
print("Total missing in recent_move_b:", missing_recent_move.sum())
print("Rows missing in all four columns:", combined_missing.sum())

Total missing in housing_type: 34
Total missing in num_vehicles: 34
Total missing in gas_usage: 34
Total missing in recent_move_b: 34
Rows missing in all four columns: 34


In [6]:
# Create a copy of the dataset to store transformations
df_processed = df.copy()

# Convert NaN values in 'is_employed' to 'Not in Workforce'
df_processed['is_employed'] = df_processed['is_employed'].fillna('Not in Workforce')

df_processed.head()

Unnamed: 0_level_0,sex,is_employed,income,marital_status,health_ins,housing_type,num_vehicles,age,state_of_res,gas_usage,rooms,recent_move_b
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
001115999_01,Male,Not in Workforce,28900.0,Married,,Homeowner free and clear,1.0,82,Arkansas,20.0,6,F
000566299_01,Male,True,40000.0,Never married,,Rented,1.0,40,New Mexico,40.0,5,T
001397329_01,Female,True,203000.0,Married,,Homeowner with mortgage/loan,3.0,54,Colorado,80.0,2,F
000843100_01,Female,Not in Workforce,0.0,Married,,Homeowner free and clear,1.0,64,California,30.0,2,F
000260071_03,Male,True,40000.0,Married,,Homeowner with mortgage/loan,4.0,35,New Jersey,150.0,1,F


In [7]:
# Print the number of null values for each column
print("\nNull values per column:")
print(df_processed.isnull().sum())


Null values per column:
sex                 0
is_employed         0
income              0
marital_status      0
health_ins        804
housing_type       34
num_vehicles       34
age                 0
state_of_res        0
gas_usage          34
rooms               0
recent_move_b      34
dtype: int64


In [8]:
# Impute NaN values using median or mode from the training dataset
df_processed['housing_type'] = df_processed['housing_type'].fillna(df_processed['housing_type'].mode()[0])
df_processed['num_vehicles'] = df_processed['num_vehicles'].fillna(df_processed['num_vehicles'].median())
df_processed['gas_usage'] = df_processed['gas_usage'].fillna(df_processed['gas_usage'].median())
df_processed['recent_move_b'] = df_processed['recent_move_b'].fillna(df_processed['recent_move_b'].mode()[0])

In [9]:
# Print the number of null values for each column
print("\nNull values per column:")
print(df_processed.isnull().sum())


Null values per column:
sex                 0
is_employed         0
income              0
marital_status      0
health_ins        804
housing_type        0
num_vehicles        0
age                 0
state_of_res        0
gas_usage           0
rooms               0
recent_move_b       0
dtype: int64


In [10]:
df_processed.head()

Unnamed: 0_level_0,sex,is_employed,income,marital_status,health_ins,housing_type,num_vehicles,age,state_of_res,gas_usage,rooms,recent_move_b
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
001115999_01,Male,Not in Workforce,28900.0,Married,,Homeowner free and clear,1.0,82,Arkansas,20.0,6,F
000566299_01,Male,True,40000.0,Never married,,Rented,1.0,40,New Mexico,40.0,5,T
001397329_01,Female,True,203000.0,Married,,Homeowner with mortgage/loan,3.0,54,Colorado,80.0,2,F
000843100_01,Female,Not in Workforce,0.0,Married,,Homeowner free and clear,1.0,64,California,30.0,2,F
000260071_03,Male,True,40000.0,Married,,Homeowner with mortgage/loan,4.0,35,New Jersey,150.0,1,F


In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Create a copy of the dataset to store transformations
df_transformed = df_processed.copy()

# Identify categorical columns
categorical_cols = df_transformed.select_dtypes(include=['object']).columns

# Identify binary columns
binary_cols = df_transformed.select_dtypes(include=['bool']).columns

# Convert binary columns to strings
for column in binary_cols:
    df_transformed[column] = df_transformed[column].astype(str)

# Convert categorical columns to strings
for column in categorical_cols:
    df_transformed[column] = df_transformed[column].astype(str)

# Initialize LabelEncoder for binary columns
label_encoders = {}
for column in binary_cols:
    le = LabelEncoder()
    df_transformed[column] = le.fit_transform(df_transformed[column])
    label_encoders[column] = le

# Initialize OneHotEncoder for categorical columns
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Apply OneHotEncoder to categorical columns
df_encoded = pd.DataFrame(onehot_encoder.fit_transform(df_transformed[categorical_cols]), columns=onehot_encoder.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded columns
df_transformed = df_transformed.drop(columns=categorical_cols).reset_index(drop=True)
df_transformed = pd.concat([df_transformed, df_encoded], axis=1)

df_transformed.index = df_processed.index
df_transformed.index.name = 'custid'

In [12]:
pd.set_option('display.max_columns', None)
df_transformed.head()

Unnamed: 0_level_0,income,health_ins,num_vehicles,age,gas_usage,rooms,sex_Male,is_employed_Not in Workforce,is_employed_True,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented,state_of_res_Alaska,state_of_res_Arizona,state_of_res_Arkansas,state_of_res_California,state_of_res_Colorado,state_of_res_Connecticut,state_of_res_Delaware,state_of_res_District of Columbia,state_of_res_Florida,state_of_res_Georgia,state_of_res_Idaho,state_of_res_Illinois,state_of_res_Indiana,state_of_res_Iowa,state_of_res_Kansas,state_of_res_Kentucky,state_of_res_Louisiana,state_of_res_Maine,state_of_res_Maryland,state_of_res_Massachusetts,state_of_res_Michigan,state_of_res_Minnesota,state_of_res_Mississippi,state_of_res_Missouri,state_of_res_Montana,state_of_res_Nebraska,state_of_res_Nevada,state_of_res_New Hampshire,state_of_res_New Jersey,state_of_res_New Mexico,state_of_res_New York,state_of_res_North Carolina,state_of_res_North Dakota,state_of_res_Ohio,state_of_res_Oklahoma,state_of_res_Oregon,state_of_res_Pennsylvania,state_of_res_Rhode Island,state_of_res_South Carolina,state_of_res_South Dakota,state_of_res_Tennessee,state_of_res_Texas,state_of_res_Utah,state_of_res_Virginia,state_of_res_Washington,state_of_res_West Virginia,state_of_res_Wisconsin,state_of_res_Wyoming,recent_move_b_T
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
001115999_01,28900.0,,1.0,82,20.0,6,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000566299_01,40000.0,,1.0,40,40.0,5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
001397329_01,203000.0,,3.0,54,80.0,2,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000843100_01,0.0,,1.0,64,30.0,2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000260071_03,40000.0,,4.0,35,150.0,1,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
df_transformed.shape

(804, 64)

In [14]:
import os
# Save the df_transformed dataset to a CSV file in the directory 'data/preprocessed' (create the directory if it does not exist)
output_dir = 'data/preprocessed'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'customer_test_transformed.csv')
df_transformed.to_csv(output_file, index=True)

In [15]:
# Load the training dataset
train_file = 'data/preprocessed/customer_transformed.csv'
train_df = pd.read_csv(train_file)

# Load the test dataset
test_file = 'data/preprocessed/customer_test_transformed.csv'
test_df = pd.read_csv(test_file)

In [16]:
train_df.set_index('custid', inplace=True)
test_df.set_index('custid', inplace=True)

In [17]:
X_train = train_df.drop(columns=['health_ins'])
y_train = train_df['health_ins']

In [18]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
lr = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Add missing columns to the test dataset and set their values to 0
missing_columns = ['state_of_res_Hawaii', 'state_of_res_Vermont']
for col in missing_columns:
    if col not in test_df.columns:
        test_df[col] = 0

In [20]:
# Ensure the test dataset has the same column order as the training dataset
test_df = test_df[X_train.columns]

# Make predictions using the trained model
predictions = lr.predict(test_df)

In [21]:
test_df.head()

Unnamed: 0_level_0,income,num_vehicles,age,gas_usage,rooms,sex_Male,is_employed_Not in Workforce,is_employed_True,marital_status_Married,marital_status_Never married,marital_status_Widowed,housing_type_Homeowner with mortgage/loan,housing_type_Occupied with no rent,housing_type_Rented,state_of_res_Alaska,state_of_res_Arizona,state_of_res_Arkansas,state_of_res_California,state_of_res_Colorado,state_of_res_Connecticut,state_of_res_Delaware,state_of_res_District of Columbia,state_of_res_Florida,state_of_res_Georgia,state_of_res_Hawaii,state_of_res_Idaho,state_of_res_Illinois,state_of_res_Indiana,state_of_res_Iowa,state_of_res_Kansas,state_of_res_Kentucky,state_of_res_Louisiana,state_of_res_Maine,state_of_res_Maryland,state_of_res_Massachusetts,state_of_res_Michigan,state_of_res_Minnesota,state_of_res_Mississippi,state_of_res_Missouri,state_of_res_Montana,state_of_res_Nebraska,state_of_res_Nevada,state_of_res_New Hampshire,state_of_res_New Jersey,state_of_res_New Mexico,state_of_res_New York,state_of_res_North Carolina,state_of_res_North Dakota,state_of_res_Ohio,state_of_res_Oklahoma,state_of_res_Oregon,state_of_res_Pennsylvania,state_of_res_Rhode Island,state_of_res_South Carolina,state_of_res_South Dakota,state_of_res_Tennessee,state_of_res_Texas,state_of_res_Utah,state_of_res_Vermont,state_of_res_Virginia,state_of_res_Washington,state_of_res_West Virginia,state_of_res_Wisconsin,state_of_res_Wyoming,recent_move_b_T
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1
001115999_01,28900.0,1.0,82,20.0,6,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
000566299_01,40000.0,1.0,40,40.0,5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0
001397329_01,203000.0,3.0,54,80.0,2,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
000843100_01,0.0,1.0,64,30.0,2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
000260071_03,40000.0,4.0,35,150.0,1,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
from datetime import datetime
# Add predictions to the test dataset
test_df['health_ins'] = predictions

# Convert boolean predictions to string format (TRUE/FALSE)
test_df['health_ins'] = test_df['health_ins'].apply(lambda x: 'TRUE' if x else 'FALSE')

# Generate the current timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Format the file name with the timestamp
output_file = f'data/submissions/submission_{timestamp}.csv'
os.makedirs('data/submissions', exist_ok=True)

# Ensure 'custid' is a column in the submission DataFrame
test_df.reset_index(inplace=True)
submission = test_df[['custid', 'health_ins']]

submission.head()

submission.to_csv(output_file, index=False, header=True)
