### Installing & Importing all the necessary packages

Refresh the apt package index, install `libgl1`, and install the `openpyxl` library for reading Excel files.

In [None]:
from IPython.display import clear_output

!sudo apt-get update && apt-get install libgl1 -y
!pip install openpyxl

clear_output()

In [5]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             confusion_matrix, classification_report, roc_curve, 
                             roc_auc_score, f1_score, matthews_corrcoef, hamming_loss, 
                             precision_recall_curve)

# Import classifiers and utilities from scikit-learn for building and managing machine learning models:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier

# Image processing and augmentation
import os
from PIL import Image
import cv2
from tensorflow.keras.preprocessing.image import (load_img, img_to_array, ImageDataGenerator)
import albumentations as A
from skimage.segmentation import slic, mark_boundaries
import skimage.io

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz

# Machine learning and deep learning
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, BatchNormalization, Flatten, Conv2D, 
                                     MaxPooling2D, Dropout, GlobalAveragePooling2D, 
                                     LeakyReLU, Activation)
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam, Adamax, SGD, Adagrad, Adadelta, RMSprop, Nadam
from tensorflow.keras.callbacks import (TensorBoard, ModelCheckpoint, EarlyStopping, 
                                        History, ReduceLROnPlateau, CSVLogger, LearningRateScheduler)
from tensorflow.keras.applications import (VGG19, ResNet152V2, ResNet50V2, ResNet101V2, 
                                           InceptionV3, InceptionResNetV2, MobileNetV2, 
                                           DenseNet169, NASNetMobile, EfficientNetB7, ConvNeXtBase)

# Explanation and visualization tools
from lime import lime_image
import shap
# NOTE(review): this rebinds `Image`, shadowing the earlier `from PIL import Image`
# in this same cell; after this line `Image` is IPython.display.Image.
from IPython.display import Image, display

# Miscellaneous
import warnings
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# IPython magics: inline matplotlib rendering, then load (and reload) the TensorBoard extension
%matplotlib inline
%load_ext tensorboard
%reload_ext tensorboard

# Set plot parameters
# Default figure size, legend font size / handle length and axes title size for all plots.
params = {'figure.figsize': (16, 8),
          'legend.fontsize': 16,
          'legend.handlelength': 2,
          'axes.titlesize': 'large'}
# The seaborn theme is set first; the custom rcParams are applied after it.
sns.set_theme(style="white")
plt.rcParams.update(params)

### Google Drive operations (optional)

In [9]:
#%cd "/content/drive/MyDrive/PCOS_TL_ML"
# !zip -r save_filename.extension . i folder_location_to_zip
#!zip -r folder.zip . i folder_to_zip
# !ls

In [10]:
#import os
#os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive"
# my kaggle.json file in /content/drive/MyDrive/ColabNotebooks

In [11]:
#!pip install kaggle
#!kaggle datasets init -p /content/drive/MyDrive/PCOS_TL_ML
# my dataset in Resized folder.

In [12]:
#!kaggle datasets create -p /content/drive/MyDrive/PCOS_TL_ML --dir-mode zip

### Data Loading and Cleaning

Load class labels from an Excel file, binary test labels from a CSV, clean the class labels DataFrame by removing empty rows and columns, and display the cleaned data.

In [None]:
# Read the training class-label spreadsheet and preview its first five rows.
train_label_file = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/train/class_label.xlsx'
la1 = pd.read_excel(train_label_file)
la1.head(5)

In [None]:
# Read the binary labels of the test set from CSV and preview the first five rows.
test_label_file = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/test_label_binary.csv'
la2 = pd.read_csv(test_label_file)
la2.head(5)

In [None]:
# Discard rows, then columns, that contain nothing but missing values.
df = (
    la1
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
print(df)

### Train-Validate Split

Split the DataFrame into training and validation sets based on specified percentages, using an optional random seed for reproducibility.

In [16]:
def train_validate_split(df, train_percent=.8, validate_percent=.2, seed=None):
    """Randomly partition the rows of ``df`` into a training and a validation frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Its index may hold arbitrary labels (for example the
        gaps left behind by ``dropna``); rows are selected by position.
    train_percent : float, default 0.8
        Fraction of the rows placed in the training frame (rounded down).
    validate_percent : float, default 0.2
        Fraction of the rows placed in the validation frame.
    seed : int or None, default None
        Seed for the shuffle. ``None`` gives a different split on every call.

    Returns
    -------
    (train, validate) : tuple of pandas.DataFrame
        Disjoint subsets of ``df`` in shuffled order. Rows beyond the two
        requested fractions are not returned.
    """
    m = len(df.index)

    # A private generator keeps the global NumPy random state untouched while
    # producing the same stream as the legacy `np.random.seed(seed)` call.
    rng = np.random.RandomState(seed)

    # Shuffle row *positions*, not index labels: `.iloc` is positional, so
    # feeding it labels fails or picks wrong rows when the index is not 0..m-1.
    perm = rng.permutation(m)

    train_end = int(train_percent * m)
    if abs(train_percent + validate_percent - 1.0) < 1e-9:
        # The fractions cover the whole frame: hand the remainder to the
        # validation set so that integer truncation never drops a row.
        validate_end = m
    else:
        validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    return train, validate

In [17]:
# Fixed seed so the train/validation partition, and every metric computed from
# it, is the same after each kernel restart.
SPLIT_SEED = 42
train, validate = train_validate_split(df, seed=SPLIT_SEED)

In [None]:
# Preview the training split returned by train_validate_split.
train

In [None]:
# Preview the validation split returned by train_validate_split.
validate

### Image and Label Preparation

Map labels, load and preprocess images for training, validation, and testing, and display shapes of image arrays and labels.

In [None]:
# Human-readable names for the values of the `Healthy` column.
label_mapping = {1: "Healthy", 0: "Unhealthy"}
# Adds a `Label` column to `df` in place; values of `Healthy` missing from the mapping become NaN.
# NOTE(review): `train` / `validate` were sliced from `df` in an earlier cell, so they
# presumably do not receive this column - confirm if `Label` is needed there.
df['Label'] = df['Healthy'].map(label_mapping)
print(df)

In [22]:
# Targets: the `Healthy` column, kept as a 2-D column array.
train_labels = train[["Healthy"]].values

# Full path of every training image.
# NOTE: update this directory when moving to the GPU machine (same for the validate and test cells).
train_image_dir = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/train/images'
train_image_paths = [os.path.join(train_image_dir, filename) for filename in train['imagePath']]

# Load each image at 300x300 and scale its pixel values into the 0-1 range.
# NOTE: the target size is meant to become 320x320 when running on the GPU machine.
train_images = []
for train_image_path in train_image_paths:
    pil_image = load_img(train_image_path, target_size=(300, 300))
    image = img_to_array(pil_image) / 255.0
    train_images.append(image)
train_images = np.array(train_images, dtype=np.float32)

In [None]:
# Dimensions of the stacked training image array.
train_images.shape

In [None]:
# Dimensions of the training label array.
train_labels.shape

In [25]:
# Targets: the `Healthy` column of the validation split, kept as a 2-D column array.
validate_labels = validate[["Healthy"]].values

# Validation images live in the same folder as the training images.
validate_image_dir = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/train/images'
validate_image_paths = [os.path.join(validate_image_dir, filename) for filename in validate['imagePath']]

# Load each image at 300x300 and scale its pixel values into the 0-1 range.
# NOTE: the target size is meant to change when running on the GPU machine.
validate_images = []
for validate_image_path in validate_image_paths:
    pil_image = load_img(validate_image_path, target_size=(300, 300))
    image = img_to_array(pil_image) / 255.0
    validate_images.append(image)
validate_images = np.array(validate_images, dtype=np.float32)

In [None]:
# Dimensions of the stacked validation image array.
validate_images.shape

In [None]:
# Dimensions of the validation label array.
validate_labels.shape

In [28]:
# Targets: the `Healthy` column of the test label table, kept as a 2-D column array.
test_labels = la2[["Healthy"]].values

# Full path of every test image.
test_image_dir = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/test/images'
test_image_paths = [os.path.join(test_image_dir, filename) for filename in la2['imagePath']]

# Load each image at 300x300 and scale its pixel values into the 0-1 range.
# NOTE: the target size is meant to change when running on the GPU machine.
test_images = []
for test_image_path in test_image_paths:
    pil_image = load_img(test_image_path, target_size=(300, 300))
    image = img_to_array(pil_image) / 255.0
    test_images.append(image)
test_images = np.array(test_images, dtype=np.float32)

In [None]:
# Dimensions of the stacked test image array.
test_images.shape

In [None]:
# Dimensions of the test label array.
test_labels.shape

### Data Reshaping for Classifier

Reshape training, validation, and test images into flat arrays for classifier input and print their shapes.

In [None]:
# Training portion (80%) used to fit the classifiers:
# every image is flattened into a single feature row.
x_train = train_images.reshape((len(train_images), -1))
y_train = train_labels
print(x_train.shape, y_train.shape, sep="\n")

In [None]:
# Validation portion (20%) used for classifier prediction ("internal" test set):
# every image is flattened into a single feature row.
x_test_internal = validate_images.reshape((len(validate_images), -1))
y_test_internal = validate_labels
print(x_test_internal.shape, y_test_internal.shape, sep="\n")

In [None]:
# "External" test set: every test image is flattened into a single feature row.
x_test_external = test_images.reshape((len(test_images), -1))
y_test_external = test_labels
print(x_test_external.shape, y_test_external.shape, sep="\n")

### Model Training and Evaluation

Train multiple classifiers on the training data, then evaluate each one and print its accuracy, balanced accuracy, weighted F1 score, weighted recall, weighted precision, and weighted Jaccard score on both the internal (validation) and external test datasets.

In [None]:
# Shallow random forest (depth 2) fitted on the flattened training images,
# then scored on the internal validation set.
clf0 = RandomForestClassifier(max_depth=2, random_state=0)
clf0.fit(x_train, y_train)
ypred0 = clf0.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred0, **options) * 100)

In [None]:
# Score the fitted random forest (clf0) on the external test set.
ypred1 = clf0.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred1, **options) * 100)

In [None]:
# Ridge classifier with default settings, fitted on the flattened training
# images and scored on the internal validation set.
clf1 = RidgeClassifier()
clf1.fit(x_train, y_train)
ypred2 = clf1.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred2, **options) * 100)


In [None]:
# Score the fitted ridge classifier (clf1) on the external test set.
ypred3 = clf1.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred3, **options) * 100)


In [None]:
# Bagging ensemble built on a single extremely-randomized tree as base learner,
# fitted on the flattened training images and scored on the internal validation set.
extra_tree = ExtraTreeClassifier(random_state=0)
clf2 = BaggingClassifier(extra_tree, random_state=0)
clf2.fit(x_train, y_train)
ypred4 = clf2.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred4, **options) * 100)

In [None]:
# Score the fitted bagging ensemble (clf2) on the external test set.
ypred5 = clf2.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred5, **options) * 100)

In [None]:
# Multi-layer perceptron (at most 300 iterations), fitted on the flattened
# training images and scored on the internal validation set.
clf3 = MLPClassifier(random_state=1, max_iter=300)
clf3.fit(x_train, y_train)
ypred6 = clf3.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred6, **options) * 100)

In [None]:
# Score the fitted multi-layer perceptron (clf3) on the external test set.
ypred7 = clf3.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred7, **options) * 100)

In [None]:
# 3-nearest-neighbours classifier, fitted on the flattened training images and
# scored on the internal validation set. `clf4` and `neigh` name the same estimator.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x_train, y_train)
clf4 = neigh
ypred8 = clf4.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred8, **options) * 100)


In [None]:
# Score the fitted nearest-neighbours classifier (clf4) on the external test set.
ypred9 = clf4.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred9, **options) * 100)

In [None]:
# Single decision tree, fitted on the flattened training images and scored on
# the internal validation set.
clf5 = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
ypred10 = clf5.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred10, **options) * 100)

In [None]:
# Score the fitted decision tree (clf5) on the external test set.
ypred11 = clf5.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred11, **options) * 100)

In [None]:
# AdaBoost with default hyper-parameters, fitted on the flattened training
# images and scored on the internal validation set.
# random_state is pinned, as for the other tree-based models in this notebook,
# so that repeated runs build the same ensemble and report the same scores.
clf6 = AdaBoostClassifier(random_state=0).fit(x_train, y_train)
ypred12 = clf6.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred12, **options) * 100)

In [None]:
# Score the fitted AdaBoost model (clf6) on the external test set.
ypred13 = clf6.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred13, **options) * 100)


In [None]:
# Gaussian naive Bayes, fitted on the flattened training images and scored on
# the internal validation set.
clf7 = GaussianNB()
clf7.fit(x_train, y_train)
ypred14 = clf7.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred14, **options) * 100)

In [None]:
# Score the fitted Gaussian naive Bayes model (clf7) on the external test set.
ypred15 = clf7.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred15, **options) * 100)

In [None]:
# Gradient-boosted trees (depth 3, learning rate 0.1, half of the rows sampled
# per tree), fitted on the flattened training images.
clf8 = XGBClassifier(max_depth=3, learning_rate=0.1, subsample=0.5).fit(x_train, y_train)
ypred16 = clf8.predict(x_test_internal)

# Evaluate on the internal validation set.
# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred16, **options) * 100)

In [None]:
# Score the fitted XGBoost model (clf8) on the external test set.
ypred17 = clf8.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred17, **options) * 100)

In [None]:
# LightGBM classifier with default hyper-parameters, fitted on the flattened
# training images and scored on the internal validation set.
clf9 = lgb.LGBMClassifier()
clf9.fit(x_train, y_train)
ypred18 = clf9.predict(x_test_internal)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_internal, ypred18, **options) * 100)

In [None]:
# Score the fitted LightGBM model (clf9) on the external test set.
ypred19 = clf9.predict(x_test_external)

# Each entry: printed caption, scorer, extra keyword arguments. Scores are shown as percentages.
for caption, scorer, options in [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": 'weighted'}),
    ("Weighted recall is", metrics.recall_score, {"average": 'weighted'}),
    ("Weighted precision is", metrics.precision_score, {"average": 'weighted'}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": 'weighted'}),
]:
    print(caption, scorer(y_test_external, ypred19, **options) * 100)