In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import pickle



In [2]:
import pandas as pd

# dataset PATH
df = pd.read_csv("data/final_features.csv")

# Extract features and label
X = df.drop(["Patient ID", "Staging(Tumor Size)# [T]"], axis=1)
y = df["Staging(Tumor Size)# [T]"]


In [3]:
print(df.columns.tolist())


['Patient ID', 'Staging(Tumor Size)# [T]', 'Staging(Nodes)#(Nx replaced by -1)[N]', 'Staging(Metastasis)#(Mx -replaced by -1)[M]', 'Age at mammo (days)', 'Tumor Size (cm).1', 'TumorMajorAxisLength_mm', 'Tumor Location', 'Menopause (at diagnosis)', 'Volume_cu_mm_Tumor', 'Lymphadenopathy or Suspicious Nodes', 'Metastatic at Presentation (Outside of Lymph Nodes)', 'Peak_SER_tumor', 'Grouping_based_proportion_of_tumor_voxels_3D_tumor_Group_3', 'WashinRate_PC2', 'WashinRate_PC3', 'ser_pe_PC4', 'ser_pe_PC7', 'Autocorrelation_Tumor', 'Energy_Tumor', 'Variance_of_RGH_values_Tumor', 'Mean_norm_DLBP_max_timepoint_binsize_256_with_filling_Tumor', 'Echogenicity']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
selector = SelectKBest(score_func=f_classif, k=15)
X_train_reduced = selector.fit_transform(X_train, y_train)
X_test_reduced = selector.transform(X_test)

top_features = X.columns[selector.get_support()]
print("Top 15 features:\n", top_features)


Top 15 features:
 Index(['Staging(Nodes)#(Nx replaced by -1)[N]', 'TumorMajorAxisLength_mm',
       'Menopause (at diagnosis)', 'Volume_cu_mm_Tumor',
       'Lymphadenopathy or Suspicious Nodes',
       'Metastatic at Presentation (Outside of Lymph Nodes)', 'Peak_SER_tumor',
       'Grouping_based_proportion_of_tumor_voxels_3D_tumor_Group_3',
       'WashinRate_PC2', 'ser_pe_PC7', 'Autocorrelation_Tumor', 'Energy_Tumor',
       'Variance_of_RGH_values_Tumor',
       'Mean_norm_DLBP_max_timepoint_binsize_256_with_filling_Tumor',
       'Echogenicity'],
      dtype='object')


In [6]:
#Random Forest on FULL FEATURES
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
rf_score = model_rf.score(X_test, y_test)
print("Random Forest Accuracy:", rf_score)

# Save model
pickle.dump(model_rf, open("finalized_model_RF.model", 'wb'))


Random Forest Accuracy: 0.6324324324324324


In [7]:
#Logistical Regression on REDUCED FEATURES
model_lr = LogisticRegression(max_iter=5000)
model_lr.fit(X_train, y_train)
lr_score = model_lr.score(X_test, y_test)
print("Logistic Regression Accuracy (Top 15 features):", lr_score)

# Save model
pickle.dump(model_lr, open("finalized_model_LR.model", 'wb'))


Logistic Regression Accuracy (Top 15 features): 0.5891891891891892


In [8]:
# Support Vector Classifier
model_svc = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
)
model_svc.fit(X_train, y_train)
svc_score = model_svc.score(X_test, y_test)
print("SVC Accuracy:", svc_score)

# Save model
pickle.dump(model_svc, open("finalized_model_SVC.model", 'wb'))


SVC Accuracy: 0.5027027027027027


In [9]:
# K-Nearest Neighbors (KNN)
model_knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='minkowski'
)
model_knn.fit(X_train, y_train)
knn_score = model_knn.score(X_test, y_test)
print("KNN Accuracy:", knn_score)

# Save model
pickle.dump(model_knn, open("finalized_model_KNN.model", 'wb'))

KNN Accuracy: 0.5351351351351351
