In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import warnings

In [57]:
warnings.filterwarnings("ignore")

In [58]:
# Read the CSV into a DataFrame. The path is relative, so the file must sit
# in the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='clean_cancer_cervical.csv')

In [59]:
# Original CSV header -> snake_case name used by the rest of the notebook.
# The first pair is the column that is later split off as the target.
column_mapping = dict([
    ('Dx:Cancer', 'dx_cancer'),
    ('Age', 'age'),
    ('Number of sexual partners', 'num_sexual_partners'),
    ('First sexual intercourse', 'first_sexual_intercourse'),
    ('Num of pregnancies', 'num_pregnancies'),
    ('Smokes (years)', 'smoking_years'),
    ('Hormonal Contraceptives (years)', 'hormonal_contraceptives_years'),
    ('IUD (years)', 'iud_years'),
    ('STDs (number)', 'num_stds'),
    ('STDs:condylomatosis', 'stds_condylomatosis'),
    ('STDs:cervical condylomatosis', 'stds_cervical_condylomatosis'),
    ('STDs:HIV', 'stds_hiv'),
    ('STDs:HPV', 'stds_hpv'),
    ('Dx:CIN', 'dx_cin'),
    ('Dx:HPV', 'dx_hpv'),
])

# rename() returns a new frame; `df` is rebound to the renamed copy.
df = df.rename(column_mapping, axis='columns')

In [60]:
# Separate the frame into the feature matrix (everything except the
# 'dx_cancer' column) and the label vector (that column alone).
x = df.drop(columns=['dx_cancer'])
y = df['dx_cancer']

In [61]:
# Display the column index of the feature frame as a sanity check that the
# target column is no longer part of it.
x.columns

Index(['age', 'num_sexual_partners', 'first_sexual_intercourse',
       'num_pregnancies', 'smoking_years', 'hormonal_contraceptives_years',
       'iud_years', 'num_stds', 'stds_condylomatosis',
       'stds_cervical_condylomatosis', 'stds_hiv', 'stds_hpv', 'dx_cin',
       'dx_hpv'],
      dtype='object')

In [62]:
# Hold out 20% of the rows as a test partition. Stratifying on the label keeps
# its class proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
# NOTE(review): X_test / y_test are not used by any cell visible here.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [63]:
# Two-step estimator: standardise the features, then fit a random forest.
# Wrapping both steps in a Pipeline means the scaler is re-fitted on whatever
# data is passed to fit(), and the step names ("scale", "model") are the
# prefixes used to address hyper-parameters from a parameter grid.
pipe = Pipeline([
    ("scale", StandardScaler()),
    # Seed the forest: without random_state the bootstrap samples and feature
    # subsets differ on every run, so neither the search results nor the saved
    # model could be reproduced.
    ("model", RandomForestClassifier(random_state=42)),
])


In [64]:
# Hyper-parameter grid for the "model" step of the pipeline (keys use the
# `<step>__<param>` convention).
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    # 'auto' is intentionally absent: for classifiers it was an alias of
    # 'sqrt' (so it only duplicated work), it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where every candidate using it
    # fails to fit.
    'model__max_features': ['sqrt', 'log2'],
    'model__class_weight': ['balanced', 'balanced_subsample', None]
}

# Exhaustive 3-fold cross-validated search over the grid. n_jobs=-1 spreads the
# 486 candidates x 3 folds over all CPU cores; it does not change the results.
# NOTE(review): no `scoring` is given, so the classifier's default score
# (accuracy) selects the winner - confirm that is the metric you want.
model = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, n_jobs=-1)

In [65]:
# Run the search on the training partition only. fit() returns the fitted
# search object, which is bound back to `model`.
model = model.fit(X=X_train, y=y_train)

In [66]:
# Persist the best pipeline found by the search to disk (relative path, so it
# lands in the current working directory). The call's return value - the list
# of written file names - is left as the cell output.
joblib.dump(model.best_estimator_, "cancer_model_best.joblib")

['cancer_model_best.joblib']