In [18]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the pre-processed cancer-patient dataset (path is relative to this notebook).
processed_csv_path = "../data/processed/kaggle_cancer_patients_processed.csv"
df = pd.read_csv(processed_csv_path)

In [3]:
# Summarise the frame: per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30104 entries, 0 to 30103
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 30104 non-null  int64  
 1   Gender              30104 non-null  object 
 2   Country_Region      30104 non-null  object 
 3   Genetic_Risk        30104 non-null  float64
 4   Air_Pollution       30104 non-null  float64
 5   Alcohol_Use         30104 non-null  float64
 6   Smoking             30104 non-null  float64
 7   Obesity_Level       30104 non-null  float64
 8   Cancer_Type         30104 non-null  object 
 9   Cancer_Stage        30104 non-null  int64  
 10  Treatment_Cost_USD  30104 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 2.5+ MB


In [4]:
# Preview the first rows to sanity-check column contents.
df.head()

Unnamed: 0,Age,Gender,Country_Region,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD
0,71,Male,UK,6.4,2.8,9.5,0.9,8.7,Lung,3,62913.44
1,40,Male,UK,1.7,2.9,4.8,3.5,2.7,Colon,1,67446.25
2,43,Female,Brazil,5.1,2.8,2.3,6.7,0.5,Skin,3,77977.12
3,22,Male,Germany,9.5,6.4,3.3,3.9,5.1,Cervical,4,33468.99
4,41,Male,Canada,5.1,8.2,0.3,3.7,2.1,Cervical,0,9790.83


In [5]:
# One-hot encode the categorical features. handle_unknown='ignore' means
# categories not seen during fitting are ignored at transform time instead of
# raising an error.
categorical_column = ['Gender', 'Country_Region', 'Cancer_Type']
categorical_raw = df[categorical_column].values

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(categorical_raw)

# The encoder produces a sparse matrix; densify it so it can be joined with
# the numeric features later on.
categorical_values = encoder.transform(categorical_raw).toarray()

# Persist the fitted encoder so the same encoding can be reapplied elsewhere.
joblib.dump(encoder, '../models/encoder.pkl')

# Show the generated one-hot column names, in output order.
encoder.get_feature_names_out(categorical_column)

array(['Gender_Female', 'Gender_Male', 'Gender_Other',
       'Country_Region_Australia', 'Country_Region_Brazil',
       'Country_Region_Canada', 'Country_Region_Germany',
       'Country_Region_UK', 'Country_Region_USA', 'Cancer_Type_Breast',
       'Cancer_Type_Cervical', 'Cancer_Type_Colon',
       'Cancer_Type_Leukemia', 'Cancer_Type_Liver', 'Cancer_Type_Lung',
       'Cancer_Type_Prostate', 'Cancer_Type_Skin'], dtype=object)

In [6]:
# Min-max scale the numeric features.
numerical_column = ['Age', 'Genetic_Risk', 'Air_Pollution', 'Alcohol_Use', 'Smoking', 'Obesity_Level', 'Cancer_Stage']
numerical_raw = df[numerical_column].values

scaler = MinMaxScaler()
scaler.fit(numerical_raw)
numerical_values = scaler.transform(numerical_raw)

# Persist the fitted feature scaler so the same scaling can be reapplied elsewhere.
joblib.dump(scaler, '../models/scaler.pkl')

numerical_values

array([[0.73913043, 0.64      , 0.28      , ..., 0.09      , 0.87      ,
        0.75      ],
       [0.28985507, 0.17      , 0.29      , ..., 0.35      , 0.27      ,
        0.25      ],
       [0.33333333, 0.51      , 0.28      , ..., 0.67      , 0.05      ,
        0.75      ],
       ...,
       [0.1884058 , 0.4       , 0.5       , ..., 0.83      , 0.91      ,
        0.25      ],
       [0.28985507, 0.64      , 0.35      , ..., 0.9       , 0.98      ,
        0.25      ],
       [0.01449275, 0.4       , 0.65      , ..., 0.86      , 0.81      ,
        1.        ]])

In [7]:
# Build the feature matrix: one-hot columns first, scaled numeric columns after.
# Both inputs are 2-D, so stacking horizontally joins them column-wise.
x = np.hstack((categorical_values, numerical_values))
x.shape

(30104, 24)

In [16]:
# Scale the regression target (treatment cost in USD) with its OWN scaler.
#
# Calling `scaler.fit_transform` here would refit the feature scaler object on
# the single target column, leaving the in-memory `scaler` inconsistent with
# the features it was fitted on, and the target scaling would never be saved.
# A dedicated `target_scaler` keeps the two transformations independent.
target_scaler = MinMaxScaler()

# MinMaxScaler expects a 2-D input, hence the reshape to a single column.
y = target_scaler.fit_transform(df['Treatment_Cost_USD'].values.reshape(-1, 1))

# Persist the target scaler so scaled model predictions can be mapped back to
# USD with `target_scaler.inverse_transform(...)`.
joblib.dump(target_scaler, '../models/target_scaler.pkl')

y.shape

(30104, 1)

In [9]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Display the shapes of the four split arrays as a sanity check.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((21072, 24), (9032, 24), (21072, 1), (9032, 1))

In [24]:
# Train a depth-limited random forest; the fixed seed makes training reproducible.
model = RandomForestRegressor(max_depth=5, random_state=42)

# y_train is a column vector (n, 1); flatten it to 1-D for fitting.
model.fit(x_train, y_train.ravel())

# Persist the trained model.
joblib.dump(model, '../models/treatment_cost_model.pkl')

['../models/treatment_cost_model.pkl']