In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [4]:
ordinal_features = ['Risk','T','N','M','Stage','Response']
nominal_features = ['Gender', 'Smoking','Hx Smoking', 'Hx Radiothreapy','Focality','Thyroid Function','Physical Examination','Adenopathy','Pathology',]
numerical_features = ['Age']

In [5]:
ordinal_categories = {                
    'Risk': ['Low', 'Intermediate', 'High'], 
    'T': ['T1a','T1b','T2','T3a','T3b','T4a','T4b'],
    'N': ['N0', 'N1a', 'N1b'],
    'M': ['M0', 'M1'],
    'Stage': ['I','II','III','IVA','IVB'],
    'Response': ['Excellent', 'Biochemical Incomplete', 'Structural Incomplete', 'Indeterminate']
}

In [6]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories=[ordinal_categories[feat] for feat in ordinal_features]), 
         ordinal_features),
        ('nom', OneHotEncoder(drop='first', sparse_output=False), nominal_features),
        ('num', StandardScaler(), numerical_features)
    ])

In [8]:
X = df.drop('Recurred', axis=1)
y = df['Recurred']

In [9]:
X_preprocessed = preprocessor.fit_transform(X)

In [10]:
ordinal_encoded_names = ordinal_features
nominal_encoded_names = preprocessor.named_transformers_['nom'].get_feature_names_out(nominal_features)
feature_names = list(ordinal_encoded_names) + list(nominal_encoded_names) + numerical_features
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)

In [11]:
X_preprocessed_df.head()

Unnamed: 0,Risk,T,N,M,Stage,Response,Gender_M,Smoking_Yes,Hx Smoking_Yes,Hx Radiothreapy_Yes,...,Physical Examination_Single nodular goiter-right,Adenopathy_Extensive,Adenopathy_Left,Adenopathy_No,Adenopathy_Posterior,Adenopathy_Right,Pathology_Hurthel cell,Pathology_Micropapillary,Pathology_Papillary,Age
0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.917439
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.454315
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.718957
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.398184
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.398184


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [14]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [15]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ord', ...), ('nom', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Low', 'Intermediate', ...], ['T1a', 'T1b', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
y_pred = pipeline.predict(X_test)
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score

def evaluate_model(y_test, y_pred, y_prob, model_name):
    print(f"\n{model_name}:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, pos_label='Yes'))
    print("Recall:", recall_score(y_test, y_pred, pos_label='Yes'))
    print("F1 Score:", f1_score(y_test, y_pred, pos_label='Yes'))
    print("ROC-AUC:", roc_auc_score(y_test, y_prob))

evaluate_model(y_test, y_pred, pipeline.predict_proba(X_test)[:, 1], "Random Forest Classifier")


Random Forest Classifier:
Accuracy: 0.9739130434782609
Precision: 1.0
Recall: 0.90625
F1 Score: 0.9508196721311475
ROC-AUC: 0.995105421686747


In [17]:
import joblib

joblib.dump(pipeline, 'thyroid_model.pkl')

['thyroid_model.pkl']