In [18]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# Read the preprocessed cancer-patient table into a DataFrame.
DATA_PATH = "../data/processed/kaggle_cancer_patients_processed.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Print a structural summary of the frame: column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30104 entries, 0 to 30103
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 30104 non-null  int64  
 1   Gender              30104 non-null  object 
 2   Country_Region      30104 non-null  object 
 3   Genetic_Risk        30104 non-null  float64
 4   Air_Pollution       30104 non-null  float64
 5   Alcohol_Use         30104 non-null  float64
 6   Smoking             30104 non-null  float64
 7   Obesity_Level       30104 non-null  float64
 8   Cancer_Type         30104 non-null  object 
 9   Cancer_Stage        30104 non-null  int64  
 10  Treatment_Cost_USD  30104 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 2.5+ MB


In [4]:
# Display the first five rows as a quick sanity check of the loaded values.
df.head()

Unnamed: 0,Age,Gender,Country_Region,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD
0,71,Male,UK,6.4,2.8,9.5,0.9,8.7,Lung,3,62913.44
1,40,Male,UK,1.7,2.9,4.8,3.5,2.7,Colon,1,67446.25
2,43,Female,Brazil,5.1,2.8,2.3,6.7,0.5,Skin,3,77977.12
3,22,Male,Germany,9.5,6.4,3.3,3.9,5.1,Cervical,4,33468.99
4,41,Male,Canada,5.1,8.2,0.3,3.7,2.1,Cervical,0,9790.83


In [None]:
# Separate the regression target from the predictors, then hold out 30% of
# the rows for testing (fixed seed so the split is reproducible).
TARGET_COL = 'Treatment_Cost_USD'
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Column groups consumed by the preprocessing step below.
categorical_cols = ['Gender', 'Country_Region', 'Cancer_Type']
numerical_cols = ['Age', 'Genetic_Risk', 'Air_Pollution', 'Alcohol_Use', 'Smoking', 'Obesity_Level', 'Cancer_Stage']

# One-hot encode the categorical columns and min-max scale the numeric ones.
# handle_unknown='ignore' keeps prediction from failing on a category that
# was not present in the training split.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', MinMaxScaler(), numerical_cols)
])

# Bundle preprocessing and the regressor so the saved artifact accepts raw
# feature frames directly.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(max_depth=5, random_state=42))
])

pipeline.fit(X_train, y_train)

# Check generalisation on the held-out split before persisting the model;
# for a regressor, Pipeline.score returns R^2.
test_r2 = pipeline.score(X_test, y_test)
print(f"Held-out R^2: {test_r2:.3f}")

# joblib.dump does not create missing directories, so make sure the target
# folder exists (e.g. on a fresh checkout) instead of failing after training.
MODEL_PATH = '../models/treatment_cost_model.pkl'
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)