### Installing & Importing all the necessary packages

Update the system package index, install the `libgl1` system library, and install the required Python packages.

In [None]:
from IPython.display import clear_output

!sudo apt-get update && apt-get install libgl1 -y
!pip install openpyxl xgboost lightgbm

clear_output()

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             confusion_matrix, classification_report, roc_curve, 
                             roc_auc_score, f1_score, matthews_corrcoef, hamming_loss, 
                             precision_recall_curve)

# Import classifiers and utilities from scikit-learn for building and managing machine learning models:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier

# Image processing and augmentation
import os
from PIL import Image
import cv2
from tensorflow.keras.preprocessing.image import (load_img, img_to_array, ImageDataGenerator)
import albumentations as A
from skimage.segmentation import slic, mark_boundaries
import skimage.io

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz

# Machine learning and deep learning
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, BatchNormalization, Flatten, Conv2D, 
                                     MaxPooling2D, Dropout, GlobalAveragePooling2D, 
                                     LeakyReLU, Activation)
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam, Adamax, SGD, Adagrad, Adadelta, RMSprop, Nadam
from tensorflow.keras.callbacks import (TensorBoard, ModelCheckpoint, EarlyStopping, 
                                        History, ReduceLROnPlateau, CSVLogger, LearningRateScheduler)
from tensorflow.keras.applications import (VGG19, ResNet152V2, ResNet50V2, ResNet101V2, 
                                           InceptionV3, InceptionResNetV2, MobileNetV2, 
                                           DenseNet169, NASNetMobile, EfficientNetB7, ConvNeXtBase)

# Explanation and visualization tools
from lime import lime_image
import shap
# NOTE(review): this import rebinds the name `Image`, shadowing the earlier
# `from PIL import Image`. After this cell, `Image` refers to
# IPython.display.Image, so any PIL usage such as `Image.open(...)` would fail.
from IPython.display import Image, display

# Miscellaneous
import warnings
warnings.filterwarnings('ignore')

# IPython magic commands for TensorBoard
%matplotlib inline
%load_ext tensorboard
%reload_ext tensorboard

# Plot defaults: activate the seaborn theme first, then apply the Matplotlib
# overrides afterwards so they take effect on top of the theme.
sns.set_theme(style="white")
params = {
    'figure.figsize': (16, 8),
    'legend.fontsize': 16,
    'legend.handlelength': 2,
    'axes.titlesize': 'large',
}
plt.rcParams.update(params)

### Google Drive operations (optional)

In [None]:
#%cd "/content/drive/MyDrive/PCOS_TL_ML"
# !zip -r save_filename.extension . i folder_location_to_zip
#!zip -r folder.zip . i folder_to_zip
# !ls

In [None]:
#import os
#os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive"
# my kaggle.json file in /content/drive/MyDrive/ColabNotebooks

In [None]:
#!pip install kaggle
#!kaggle datasets init -p /content/drive/MyDrive/PCOS_TL_ML
# my dataset in Resized folder.

In [None]:
#!kaggle datasets create -p /content/drive/MyDrive/PCOS_TL_ML --dir-mode zip

### Data Loading and Cleaning

Load class labels from an Excel file, binary test labels from a CSV, clean the class labels DataFrame by removing empty rows and columns, and display the cleaned data.

In [None]:
# Training labels spreadsheet (columns shown in the preview: imagePath, Healthy).
# NOTE(review): absolute local path -- adjust when running on another machine.
class_label_path = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/train/class_label.xlsx'
la1 = pd.read_excel(class_label_path)
la1.head()

Unnamed: 0,imagePath,Healthy
0,100image13.jpg,1
1,100image2.jpg,1
2,100image65.jpg,0
3,100image71.jpg,0
4,100image83.jpg,0


In [None]:
# Binary labels for the external test set (columns shown in the preview:
# imagePath, Healthy).
# NOTE(review): absolute local path -- adjust when running on another machine.
test_label_path = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/test_label_binary.csv'
la2 = pd.read_csv(test_label_path)
la2.head()

Unnamed: 0,imagePath,Healthy
0,image10000.jpg,1
1,image10001.jpg,0
2,image10002.jpg,1
3,image10003.jpg,0
4,image10004.jpg,0


In [None]:
# Clean the training labels in two passes: first discard rows in which every
# cell is missing, then discard columns in which every cell is missing.
rows_cleaned = la1.dropna(axis=0, how='all')
df = rows_cleaned.dropna(axis=1, how='all')
print(df)

           imagePath  Healthy
0     100image13.jpg        1
1      100image2.jpg        1
2     100image65.jpg        0
3     100image71.jpg        0
4     100image83.jpg        0
...              ...      ...
3195       pco_4.jpg        1
3196       pco_5.jpg        0
3197       pco_6.jpg        0
3198       pco_7.jpg        0
3199       pco_8.jpg        0

[3200 rows x 2 columns]


### Train-Validate Split

Split the DataFrame into training and validation sets based on the specified percentages, using an optional random seed for reproducibility.

In [None]:
def train_validate_split(df, train_percent=.8, validate_percent=.2, seed=None):
    """Randomly split ``df`` into a training and a validation DataFrame.

    The rows are shuffled; the first ``int(train_percent * len(df))`` shuffled
    rows form the training set and the next ``int(validate_percent * len(df))``
    rows form the validation set. Both sizes are truncated to whole rows, so
    some rows may end up in neither set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Its index may be arbitrary (gaps, non-integer labels);
        rows are selected by position.
    train_percent : float
        Fraction of rows assigned to the training set.
    validate_percent : float
        Fraction of rows assigned to the validation set.
    seed : int or None
        Passed to ``np.random.seed``. Note that this reseeds NumPy's global
        random state; ``None`` reseeds it non-deterministically.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate)``, each keeping the original index labels of ``df``.
    """
    np.random.seed(seed)
    m = len(df.index)
    # Shuffle row *positions* (0..m-1) rather than index labels: the result is
    # used with ``.iloc``, which is position-based. Permuting ``df.index`` only
    # worked when the index was exactly 0..m-1 and otherwise selected wrong
    # rows or raised IndexError (e.g. after ``dropna`` removed a row).
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    # Rows at positions perm[validate_end:] are intentionally left unused;
    # they could serve as a held-out test split if one is ever needed.
    return train, validate

In [None]:
# Fix the seed so the train/validation partition, and every metric computed
# from it, is identical on each Restart & Run All. Without a seed the split
# changes on every run, so outputs recorded from an unseeded run will differ.
SPLIT_SEED = 42
train, validate = train_validate_split(df, seed=SPLIT_SEED)

In [None]:
train

Unnamed: 0,imagePath,Healthy
881,image0436.jpg,0
1439,image1044.jpg,0
1028,image0592.jpg,0
208,191image0.jpg,0
10,102image93.jpg,0
...,...,...
2920,image3581.jpg,1
23,108image0081.jpg,0
1288,image0874.jpg,0
2161,image2678.jpg,0


In [None]:
validate

Unnamed: 0,imagePath,Healthy
327,3309image24.jpg,0
247,217image26.jpg,0
2281,image2835.jpg,1
2752,image3386.jpg,0
366,518image25.jpg,0
...,...,...
1299,image0885.jpg,0
895,image0450.jpg,0
1347,image0939.jpg,0
1651,image1607.jpg,0


### Image and Label Preparation

Map labels, load and preprocess images for training, validation, and testing, and display shapes of image arrays and labels.

In [None]:
# Human-readable class names for the binary `Healthy` flag.
label_mapping = {
    1: "Healthy",
    0: "Unhealthy",
}
# Add the readable label as a new column on the cleaned label frame.
df['Label'] = df.Healthy.map(label_mapping)
print(df)

           imagePath  Healthy      Label
0     100image13.jpg        1    Healthy
1      100image2.jpg        1    Healthy
2     100image65.jpg        0  Unhealthy
3     100image71.jpg        0  Unhealthy
4     100image83.jpg        0  Unhealthy
...              ...      ...        ...
3195       pco_4.jpg        1    Healthy
3196       pco_5.jpg        0  Unhealthy
3197       pco_6.jpg        0  Unhealthy
3198       pco_7.jpg        0  Unhealthy
3199       pco_8.jpg        0  Unhealthy

[3200 rows x 3 columns]


In [None]:
# Labels as an (n, 1) column array, in the same row order as the images below.
train_labels = train[["Healthy"]].values

# NOTE: absolute local path -- change it (here and in the validation/test
# cells) when running elsewhere, e.g. on a GPU machine.
train_image_dir = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/train/images'
train_image_paths = [os.path.join(train_image_dir, filename) for filename in train['imagePath']]

# 300x300 for CPU runs; switch to 320x320 when running on GPU.
IMAGE_SIZE = (300, 300)

# Preallocate the final float32 batch and fill it in place. Collecting the
# images in a Python list and converting with np.array afterwards keeps two
# full copies of the dataset in memory at the same time.
train_images = np.empty((len(train_image_paths), *IMAGE_SIZE, 3), dtype=np.float32)
for idx, train_image_path in enumerate(train_image_paths):
    image = load_img(train_image_path, target_size=IMAGE_SIZE)
    image = img_to_array(image) / 255.0  # Normalize pixel values between 0 and 1
    train_images[idx] = image

In [None]:
train_images.shape

(2560, 300, 300, 3)

In [None]:
train_labels.shape

(2560, 1)

In [None]:
# Labels as an (n, 1) column array, in the same row order as the images below.
validate_labels = validate[["Healthy"]].values

# Validation images live in the same folder as the training images; only the
# file names differ. NOTE: absolute local path -- change it when running elsewhere.
validate_image_dir = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/train/images'
validate_image_paths = [os.path.join(validate_image_dir, filename) for filename in validate['imagePath']]

# 300x300 for CPU runs; change this when running on GPU.
IMAGE_SIZE = (300, 300)

# Preallocate the final float32 batch and fill it in place. Collecting the
# images in a Python list and converting with np.array afterwards keeps two
# full copies of the data in memory at the same time.
validate_images = np.empty((len(validate_image_paths), *IMAGE_SIZE, 3), dtype=np.float32)
for idx, validate_image_path in enumerate(validate_image_paths):
    image = load_img(validate_image_path, target_size=IMAGE_SIZE)
    image = img_to_array(image) / 255.0  # Normalize pixel values between 0 and 1
    validate_images[idx] = image

In [None]:
validate_images.shape

(640, 300, 300, 3)

In [None]:
validate_labels.shape

(640, 1)

In [None]:
# External test labels as an (n, 1) column array, in the same row order as
# the images below.
test_labels = la2[["Healthy"]].values

# NOTE: absolute local path -- change it when running elsewhere.
test_image_dir = 'C:/Users/anushka saini/OneDrive/Desktop/AutoPCOS_classification_challenge/dataset/test/images'
test_image_paths = [os.path.join(test_image_dir, filename) for filename in la2['imagePath']]

# 300x300 for CPU runs; change this when running on GPU.
IMAGE_SIZE = (300, 300)

# Preallocate the final float32 batch and fill it in place. Collecting the
# images in a Python list and converting with np.array afterwards keeps two
# full copies of the data in memory at the same time.
test_images = np.empty((len(test_image_paths), *IMAGE_SIZE, 3), dtype=np.float32)
for idx, test_image_path in enumerate(test_image_paths):
    image = load_img(test_image_path, target_size=IMAGE_SIZE)
    image = img_to_array(image) / 255.0  # Normalize pixel values between 0 and 1
    test_images[idx] = image

In [None]:
test_images.shape

(1468, 300, 300, 3)

In [None]:
test_labels.shape

(1468, 1)

### Data Reshaping for Classifier

Reshape training, validation, and test images into flat arrays for classifier input and print their shapes.

In [None]:
# Training split (80%) used to fit the classifiers: flatten every image into
# a single feature vector, one row per image.
x_train = train_images.reshape(len(train_images), -1)
y_train = train_labels
print(x_train.shape, y_train.shape, sep="\n")

(2560, 270000)
(2560, 1)


In [None]:
# Validation split (20%) used as the internal test set for classifier
# predictions: flatten every image into a single feature vector.
x_test_internal = validate_images.reshape(len(validate_images), -1)
y_test_internal = validate_labels
print(x_test_internal.shape, y_test_internal.shape, sep="\n")

(640, 270000)
(640, 1)


In [None]:
# External test set: flatten every image into a single feature vector.
x_test_external = test_images.reshape(len(test_images), -1)
y_test_external = test_labels
print(x_test_external.shape, y_test_external.shape, sep="\n")

(1468, 270000)
(1468, 1)


### Model Training and Evaluation

Train multiple classifiers on the training data, then evaluate and print accuracy, balanced accuracy, weighted F1 score, weighted recall, weighted precision, and weighted Jaccard score for both the internal and external test datasets.

In [34]:
# XGBoost on the flattened pixel features. Adjust hyperparameters as needed.
clf = XGBClassifier(max_depth=3, learning_rate=0.1, subsample=0.5)
clf.fit(x_train, y_train)
ypred = clf.predict(x_test_internal)

# Evaluate the model on the internal (validation) split. Each entry is
# (printed label, scorer, extra keyword arguments); scores are reported in %.
# Support-weighted recall is mathematically equal to plain accuracy, so those
# two lines always match.
weighted = {"average": "weighted"}
scorers = [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, weighted),
    ("Weighted recall is", metrics.recall_score, weighted),
    ("Weighted precision is", metrics.precision_score, weighted),
    ("Weighted Jaccard score is", metrics.jaccard_score, weighted),
]
for label, scorer, kwargs in scorers:
    print(label, scorer(y_test_internal, ypred, **kwargs) * 100)

Accuracy is 78.125
Balanced accuracy is 65.27162589173601
Weighted f1 score is 77.13293353229652
Weighted recall is 78.125
Weighted precision is 76.59622153209109
Weighted Jaccard score is 65.37600211378341


In [35]:
# Evaluate the XGBoost model fitted above (`clf`) on the external test set.
ypred17 = clf.predict(x_test_external)

# Each entry is (printed label, scorer, extra keyword arguments); scores are
# reported in %.
weighted = {"average": "weighted"}
scorers = [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, weighted),
    ("Weighted recall is", metrics.recall_score, weighted),
    ("Weighted precision is", metrics.precision_score, weighted),
    ("Weighted Jaccard score is", metrics.jaccard_score, weighted),
]
for label, scorer, kwargs in scorers:
    print(label, scorer(y_test_external, ypred17, **kwargs) * 100)

Accuracy is 86.37602179836512
Balanced accuracy is 79.41871051208508
Weighted f1 score is 85.71119108155597
Weighted recall is 86.37602179836512
Weighted precision is 86.23964533243871
Weighted Jaccard score is 75.86334489657233


In [36]:
# LightGBM with default hyperparameters, fitted on the flattened pixel features.
clf9 = lgb.LGBMClassifier()
clf9.fit(x_train, y_train)
ypred18 = clf9.predict(x_test_internal)

# Evaluate on the internal (validation) split. Each entry is
# (printed label, scorer, extra keyword arguments); scores are reported in %.
weighted = {"average": "weighted"}
scorers = [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, weighted),
    ("Weighted recall is", metrics.recall_score, weighted),
    ("Weighted precision is", metrics.precision_score, weighted),
    ("Weighted Jaccard score is", metrics.jaccard_score, weighted),
]
for label, scorer, kwargs in scorers:
    print(label, scorer(y_test_internal, ypred18, **kwargs) * 100)

[LightGBM] [Info] Number of positive: 756, number of negative: 1804
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 25.882862 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47025184
[LightGBM] [Info] Number of data points in the train set: 2560, number of used features: 270000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295312 -> initscore=-0.869720
[LightGBM] [Info] Start training from score -0.869720
Accuracy is 79.53125
Balanced accuracy is 66.66183714865255
Weighted f1 score is 78.42797681534816
Weighted recall is 79.53125
Weighted precision is 77.9773531303602
Weighted Jaccard score is 67.00127665538066


In [37]:
# Evaluate the LightGBM model fitted above (`clf9`) on the external test set.
ypred19 = clf9.predict(x_test_external)

# Each entry is (printed label, scorer, extra keyword arguments); scores are
# reported in %.
weighted = {"average": "weighted"}
scorers = [
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, weighted),
    ("Weighted recall is", metrics.recall_score, weighted),
    ("Weighted precision is", metrics.precision_score, weighted),
    ("Weighted Jaccard score is", metrics.jaccard_score, weighted),
]
for label, scorer, kwargs in scorers:
    print(label, scorer(y_test_external, ypred19, **kwargs) * 100)

Accuracy is 86.10354223433242
Balanced accuracy is 78.43278184303175
Weighted f1 score is 85.27710192055629
Weighted recall is 86.10354223433242
Weighted precision is 86.12935329683256
Weighted Jaccard score is 75.2991404862292
