In [1]:

# Final Decision Tree Model Script

import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

# Load dataset

# The CSV location can be overridden with the MEDICAL_INSURANCE_CSV environment
# variable so the notebook can run on machines other than the author's; the
# original absolute path is kept as the default for backward compatibility.
path = os.environ.get(
    "MEDICAL_INSURANCE_CSV",
    R"C:\Users\ninah\OneDrive\Desktop\DR DENIS\Midterm_Project\dataset\medical_insurance.csv",
)

# Columns with infinite VIF (high multicollinearity)
drop_inf_cols = [
    'diabetes', 'liver_disease', 'arthritis', 'mental_health', 'asthma',
    'copd', 'cardiovascular_disease', 'cancer_history', 'kidney_disease',
    'monthly_premium'
]

# Preprocessing
#
# One chain, so re-running the cell always starts from the raw CSV:
#   1. drop the identifier column
#   2. fill missing alcohol_freq values with the explicit category 'Unknown'
#   3. replace the target with its log1p transform (the log column is created
#      first and then renamed, so the target ends up as the last column)
#   4. drop the multicollinear columns listed above
df = (
    pd.read_csv(path)
    .drop(columns=['person_id'])
    .assign(alcohol_freq=lambda d: d['alcohol_freq'].fillna('Unknown'))
    .assign(annual_medical_cost_log=lambda d: np.log1p(d['annual_medical_cost']))
    .drop(columns=['annual_medical_cost'])
    .rename(columns={'annual_medical_cost_log': 'annual_medical_cost'})
    .drop(columns=drop_inf_cols)
)

# Feature Columns

target = 'annual_medical_cost'

categorical_columns = df.columns[df.dtypes == 'object'].tolist()
# Every non-object column except the target, kept in DataFrame column order
# (a set difference would give an arbitrary order on each run).
numerical_columns = [
    col for col in df.columns
    if col not in categorical_columns and col != target
]

features = numerical_columns + categorical_columns


# Convert data to Dict format for DictVectorizer

df_dict = df[features].to_dict(orient='records')

# Initialize vectorizer
dv = DictVectorizer(sparse=False)
X_full = dv.fit_transform(df_dict)

# Target variable (log1p scale; predictions must be mapped back with expm1)
y_full = df[target].values


# Train final Decision Tree model

final_dt = DecisionTreeRegressor(
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42
)

final_dt.fit(X_full, y_full)


# Save combined model (vectorizer + model) using pickle

output_file = "final_dt_model.bin"

# The fitted vectorizer is stored together with the tree as one tuple so that
# both can be restored with a single pickle.load call.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, final_dt), f_out)

print(f"Model and vectorizer saved to {output_file}")
  

Model and vectorizer saved to final_dt_model.bin


In [2]:
import pandas as pd
import numpy as np
import pickle

def predict_medical_cost(customer_data, model_path="final_dt_model.bin"):
    """
    Predict annual medical costs for new customers.

    Parameters:
    customer_data: A single dictionary, a list of dictionaries, or a
        DataFrame with one row per customer. A DataFrame passed in is never
        modified.
    model_path: Path of the pickle file holding the (vectorizer, model)
        tuple. Defaults to "final_dt_model.bin".

    Returns:
    np.array: Predicted annual medical costs in dollars, one value per
        customer (an empty array when no customers are supplied).

    Raises:
    ValueError: If customer_data is not a dictionary, list or DataFrame.
    """
    # Validate and normalise the input first, so a bad argument fails fast
    # without touching the disk.
    if isinstance(customer_data, dict):
        # Single customer as dictionary
        customer_df = pd.DataFrame([customer_data])
    elif isinstance(customer_data, list):
        # Multiple customers as list of dictionaries
        customer_df = pd.DataFrame(customer_data)
    elif isinstance(customer_data, pd.DataFrame):
        # Already a DataFrame
        customer_df = customer_data
    else:
        raise ValueError("Input must be a dictionary, list of dictionaries, or DataFrame")

    print(f"📊 Processing {len(customer_df)} customer(s)")

    # Nothing to score: return an empty result instead of letting the
    # vectorizer fail on an empty sample sequence.
    if len(customer_df) == 0:
        return np.array([], dtype=float)

    # Mirror the training-time preprocessing: missing alcohol_freq values were
    # replaced with the category 'Unknown' before the vectorizer was fitted.
    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    if "alcohol_freq" in customer_df.columns:
        alcohol_freq = customer_df["alcohol_freq"].fillna("Unknown")
    else:
        alcohol_freq = "Unknown"
    customer_df = customer_df.assign(alcohol_freq=alcohol_freq)

    # Load model
    # NOTE: pickle.load can execute arbitrary code; only point model_path at
    # files you created yourself.
    with open(model_path, 'rb') as f_in:
        dv, model = pickle.load(f_in)

    # Transform features and predict
    X = dv.transform(customer_df.to_dict(orient='records'))
    y_pred_log = model.predict(X)

    # Predictions are on the log1p scale; expm1 maps them back.
    return np.expm1(y_pred_log)

In [3]:
# Example customer record used to try out the prediction function.
# Keys are kept in their original order so the DataFrame columns are unchanged.
customer_profile = {
    # Demographics
    'age': 45,
    'sex': 'male',
    'region': 'northwest',
    'urban_rural': 'urban',
    'income': 75000.0,
    'education': 'bachelor',
    'marital_status': 'married',
    'employment_status': 'employed',
    'household_size': 3,
    'dependents': 1,
    # Lifestyle and health-care usage
    'bmi': 27.5,
    'smoker': 'no',
    'alcohol_freq': 'never',
    'visits_last_year': 2,
    'hospitalizations_last_3yrs': 0,
    'days_hospitalized_last_3yrs': 0,
    'medication_count': 1,
    # Clinical measurements
    'systolic_bp': 120.0,
    'diastolic_bp': 80.0,
    'ldl': 110.0,
    'hba1c': 5.4,
    # Insurance plan
    'plan_type': 'HMO',
    'network_tier': 'Silver',
    'deductible': 500,
    'copay': 25,
    'policy_term_years': 3,
    'policy_changes_last_2yrs': 0,
    'provider_quality': 4.5,
    'risk_score': 0.2,
    'annual_premium': 5000.0,
    # Claims history
    'claims_count': 2,
    'avg_claim_amount': 300.0,
    'total_claims_paid': 600.0,
    # Conditions and procedures
    'chronic_count': 0,
    'hypertension': 0,
    'proc_imaging_count': 0,
    'proc_surgery_count': 0,
    'proc_physio_count': 0,
    'proc_consult_count': 1,
    'proc_lab_count': 1,
    'is_high_risk': 0,
    'had_major_procedure': 0,
}

# Single-row DataFrame: one row per customer, one column per field.
new_customer = pd.DataFrame([customer_profile])

In [5]:
# Score the example customer; the result holds one cost estimate per input row.
prediction = predict_medical_cost(customer_data=new_customer)
prediction

📊 Processing 1 customer(s)


array([48289.68555035])