### Installing & Importing all the necessary packages

Refresh the system package index, install `libgl1`, and install the `openpyxl` library for handling Excel files. The remaining packages installed in the cell below are:

- **`albumentations`**: Library for image augmentation to enhance training data diversity.
- **`sweetviz`**: Generates high-density visualizations of pandas DataFrames for quick data analysis.
- **`grad-cam`**: Visualizes important image regions for CNN predictions using Grad-CAM.
- **`lime`**: Provides local explanations for machine learning model predictions.
- **`pandas_profiling`**: Creates detailed reports of pandas DataFrames for exploratory data analysis.
- **`shap`**: Explains model predictions by attributing feature contributions using Shapley values.
- **`Keras-Preprocessing`**: Offers utilities for preprocessing data, including image and text transformations.

In [None]:
from IPython.display import clear_output

!sudo apt-get update && apt-get install libgl1 -y
!pip install openpyxl -q
!pip install albumentations sweetviz grad-cam lime pandas_profiling shap Keras-Preprocessing -q

clear_output()

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             confusion_matrix, classification_report, roc_curve, 
                             roc_auc_score, f1_score, matthews_corrcoef, hamming_loss, 
                             precision_recall_curve)

# Import classifiers and utilities from scikit-learn for building and managing machine learning models:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression

# Image processing and augmentation
import os
from PIL import Image
import cv2
from tensorflow.keras.preprocessing.image import (load_img, img_to_array, ImageDataGenerator)
import albumentations as A
from skimage.segmentation import slic, mark_boundaries
import skimage.io

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz

# Machine learning and deep learning
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, BatchNormalization, Flatten, Conv2D, 
                                     MaxPooling2D, Dropout, GlobalAveragePooling2D, 
                                     LeakyReLU, Activation)
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam, Adamax, SGD, Adagrad, Adadelta, RMSprop, Nadam
from tensorflow.keras.callbacks import (TensorBoard, ModelCheckpoint, EarlyStopping, 
                                        History, ReduceLROnPlateau, CSVLogger, LearningRateScheduler)
from tensorflow.keras.applications import (VGG19, ResNet152V2, ResNet50V2, ResNet101V2, 
                                           InceptionV3, InceptionResNetV2, MobileNetV2, 
                                           DenseNet169, NASNetMobile, EfficientNetB7, ConvNeXtBase)

# Explanation and visualization tools
from lime import lime_image
import shap
# NOTE(review): this import rebinds `Image`, shadowing the `PIL.Image` imported earlier
# in this cell; after this line `Image` refers to IPython.display.Image.
from IPython.display import Image, display

# Miscellaneous
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation
# and convergence warnings raised by the libraries used below).
warnings.filterwarnings('ignore')

# IPython magics: inline matplotlib figures and the TensorBoard notebook extension
%matplotlib inline
%load_ext tensorboard
%reload_ext tensorboard

# Plot styling: apply the seaborn "white" theme, then layer the notebook-wide
# matplotlib overrides (figure size, legend and title sizes) on top of it.
sns.set_theme(style="white")
params = dict([
    ('figure.figsize', (16, 8)),
    ('legend.fontsize', 16),
    ('legend.handlelength', 2),
    ('axes.titlesize', 'large'),
])
plt.rcParams.update(params)

### Data Loading and Cleaning

Load the train/validation class labels from an Excel file and the binary (0/1) test labels from a CSV file, clean the class-label DataFrame by removing rows and columns that are entirely empty, and display the cleaned data.

In [None]:
# Train/validation annotations: one row per image, one 0/1 column per finding.
train_val_label_file = '/workspace/anushka saini/train_val/multilabelpcos.xlsx'
la1 = pd.read_excel(train_val_label_file)
la1

Unnamed: 0,ImagePath,Round and Thin,Cumulus oophorous,Corpus luteum,Hemorrhagic ovarian cyst,Hemorrhagic corpus luteum,Endometrioma,serous cystadenoma,Serous cystadenocarcinoma,Mucinous cystadenoma,...,Foetus,Chocolate cyst,Cervix,Urinary bladder,Polyp,Cervical cyst,Adnexa,Vagina,Uterus,Ovary
0,100image65.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,100image71.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,100image83.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,100image87.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,100image94.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292,pco_3.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2293,pco_5.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2294,pco_6.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2295,pco_7.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Annotations of the separate (external) test set; note the image column is
# spelled `imagePath` here, not `ImagePath` as in the train/validation sheet.
test_label_file = '/workspace/anushka saini/test/test_label_multi.csv'
la2 = pd.read_csv(test_label_file)
la2

Unnamed: 0,imagePath,Round and Thin,Cumulus oophorous,Corpus luteum,Hemorrhagic ovarian cyst,Hemorrhagic corpus luteum,Endometrioma,serous cystadenoma,Serous cystadenocarcinoma,Mucinous cystadenoma,...,Thick hyperechoic margin,Vaginal ultrasound,Transvaginal ultrasound,Gestational sac,Foetus,Chocolate cyst,Cervix,Urinary bladder,Polyp,Cervical cyst
0,image10000.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,image10001.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,image10002.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,image10003.jpg,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,image10004.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463,image11463.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1464,image11464.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1465,image11465.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1466,image11466.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Discard rows that are completely empty, then columns that are completely empty.
non_empty_rows = la1.dropna(axis=0, how='all')
df = non_empty_rows.dropna(axis=1, how='all')
print(df)

           ImagePath  Round and Thin  Cumulus oophorous  Corpus luteum  \
0     100image65.jpg               0                  0              0   
1     100image71.jpg               0                  0              0   
2     100image83.jpg               0                  0              0   
3     100image87.jpg               0                  0              0   
4     100image94.jpg               0                  0              0   
...              ...             ...                ...            ...   
2292       pco_3.jpg               0                  0              0   
2293       pco_5.jpg               0                  0              0   
2294       pco_6.jpg               0                  0              0   
2295       pco_7.jpg               0                  0              0   
2296       pco_8.jpg               0                  0              0   

      Hemorrhagic ovarian cyst  Hemorrhagic corpus luteum  Endometrioma  \
0                            0      

### Train-Validate Split

Split the DataFrame into training and validation sets according to the specified fractions, using an optional random seed for reproducibility.

In [None]:
def train_validate_split(df, train_percent=.8, validate_percent=.2, seed=None):
    """Randomly partition the rows of ``df`` into a training and a validation frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Any index is accepted (it does not have to be 0..n-1).
    train_percent : float
        Fraction of rows assigned to the training frame.
    validate_percent : float
        Fraction of rows assigned to the validation frame.
    seed : int or None
        Seed for the shuffle. ``None`` gives a different shuffle on every call.

    Returns
    -------
    (train, validate) : tuple of pandas.DataFrame
        Disjoint row subsets of ``df`` that keep their original index labels.

    Notes
    -----
    Both sizes are truncated to whole rows independently, so a few rows can remain
    unassigned even when the fractions sum to 1 (e.g. 2297 rows -> 1837 + 459).
    """
    # A private generator keeps the shuffle reproducible without reseeding NumPy's
    # global random state as a side effect.
    rng = np.random.RandomState(seed)
    m = len(df.index)
    # Shuffle row *positions* (0..m-1), because the selection below uses .iloc;
    # shuffling index labels would pick wrong rows (or fail) on a non-default index.
    perm = rng.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    # Rows at positions perm[validate_end:] are intentionally left out of both frames.
    return train, validate

In [None]:
# A fixed seed makes the 80/20 partition, and every score computed from it further
# down, repeatable across kernel restarts.
SPLIT_SEED = 42
train, validate = train_validate_split(df, seed=SPLIT_SEED)
train

Unnamed: 0,ImagePath,Round and Thin,Cumulus oophorous,Corpus luteum,Hemorrhagic ovarian cyst,Hemorrhagic corpus luteum,Endometrioma,serous cystadenoma,Serous cystadenocarcinoma,Mucinous cystadenoma,...,Foetus,Chocolate cyst,Cervix,Urinary bladder,Polyp,Cervical cyst,Adnexa,Vagina,Uterus,Ovary
2205,image3742.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
543,image0252.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,104image0086.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
178,202image106.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1621,image2260.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,image0581.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1809,image2722.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1141,image0957.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
370,89image0098.jpg,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Show the validation portion returned by train_validate_split.
validate

Unnamed: 0,ImagePath,Round and Thin,Cumulus oophorous,Corpus luteum,Hemorrhagic ovarian cyst,Hemorrhagic corpus luteum,Endometrioma,serous cystadenoma,Serous cystadenocarcinoma,Mucinous cystadenoma,...,Foetus,Chocolate cyst,Cervix,Urinary bladder,Polyp,Cervical cyst,Adnexa,Vagina,Uterus,Ovary
951,image0724.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
415,95image93.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
290,523image18.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1279,image1117.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
249,32image0025.jpg,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2226,image3769.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1666,image2417.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1690,image2467.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1878,image3115.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Image and Label Preparation

Map labels, load and preprocess images for training, validation, and testing, and display shapes of image arrays and labels.

In [None]:
# The 33 finding columns that form the label matrix, in the order used throughout.
train_label_columns = [
    "Round and Thin", "Cumulus oophorous", "Corpus luteum",
    "Hemorrhagic ovarian cyst", "Hemorrhagic corpus luteum", "Endometrioma",
    "serous cystadenoma", "Serous cystadenocarcinoma", "Mucinous cystadenoma",
    "Mucinous cystadenocarcinoma", "Dermoid cyst", "Dermoid plug",
    "Rokitansky nodule", "Dermoid mesh", "Dot dash pattern",
    "Floating balls sign", "Ovarian fibroma", "Ovarian thecoma",
    "Metastasis", "Para ovarian cyst", "Polycystic ovary",
    "Ovarian hyperstimulation syndrome", "Ovarian torsion", "Thick hyperechoic margin",
    "Vaginal ultrasound", "Transvaginal ultrasound", "Gestational sac",
    "Foetus", "Chocolate cyst", "Cervix",
    "Urinary bladder", "Polyp", "Cervical cyst",
]
train_labels = train[train_label_columns].values

# Image folder for the train/validation data.
# TODO: adjust this path when running on the GPU machine (same for validation and test).
train_image_dir = '/workspace/anushka saini/train_val/images'
train_image_paths = [os.path.join(train_image_dir, filename) for filename in train['ImagePath']]

# Load every image resized to 300x300 and scale pixel values into [0, 1].
# TODO: switch the target size to 320x320 when running on GPU.
train_images = []
for train_image_path in train_image_paths:
    image = img_to_array(load_img(train_image_path, target_size=(300, 300))) / 255.0
    train_images.append(image)
train_images = np.array(train_images, dtype=np.float32)

In [None]:
# Sanity check: one entry per training image, each resized to 300x300 on load.
train_images.shape

(1837, 300, 300, 3)

In [None]:
# Sanity check: one row per training image, one column per selected finding label.
train_labels.shape

(1837, 33)

In [None]:
# Same 33 finding columns as for the training labels, in the same order.
validate_label_columns = [
    "Round and Thin", "Cumulus oophorous", "Corpus luteum",
    "Hemorrhagic ovarian cyst", "Hemorrhagic corpus luteum", "Endometrioma",
    "serous cystadenoma", "Serous cystadenocarcinoma", "Mucinous cystadenoma",
    "Mucinous cystadenocarcinoma", "Dermoid cyst", "Dermoid plug",
    "Rokitansky nodule", "Dermoid mesh", "Dot dash pattern",
    "Floating balls sign", "Ovarian fibroma", "Ovarian thecoma",
    "Metastasis", "Para ovarian cyst", "Polycystic ovary",
    "Ovarian hyperstimulation syndrome", "Ovarian torsion", "Thick hyperechoic margin",
    "Vaginal ultrasound", "Transvaginal ultrasound", "Gestational sac",
    "Foetus", "Chocolate cyst", "Cervix",
    "Urinary bladder", "Polyp", "Cervical cyst",
]
validate_labels = validate[validate_label_columns].values

# Validation images live in the same folder as the training images.
validate_image_dir = '/workspace/anushka saini/train_val/images'
validate_image_paths = [os.path.join(validate_image_dir, filename) for filename in validate['ImagePath']]

# Load every image resized to 300x300 and scale pixel values into [0, 1].
# TODO: change the target size when running on GPU.
validate_images = []
for validate_image_path in validate_image_paths:
    image = img_to_array(load_img(validate_image_path, target_size=(300, 300))) / 255.0
    validate_images.append(image)
validate_images = np.array(validate_images, dtype=np.float32)

In [None]:
# Sanity check: one entry per validation image, each resized to 300x300 on load.
validate_images.shape

(459, 300, 300, 3)

In [None]:
# Sanity check: one row per validation image, one column per selected finding label.
validate_labels.shape

(459, 33)

In [None]:
# Same 33 finding columns as for the train/validation labels, read from the test CSV.
test_label_columns = [
    "Round and Thin", "Cumulus oophorous", "Corpus luteum",
    "Hemorrhagic ovarian cyst", "Hemorrhagic corpus luteum", "Endometrioma",
    "serous cystadenoma", "Serous cystadenocarcinoma", "Mucinous cystadenoma",
    "Mucinous cystadenocarcinoma", "Dermoid cyst", "Dermoid plug",
    "Rokitansky nodule", "Dermoid mesh", "Dot dash pattern",
    "Floating balls sign", "Ovarian fibroma", "Ovarian thecoma",
    "Metastasis", "Para ovarian cyst", "Polycystic ovary",
    "Ovarian hyperstimulation syndrome", "Ovarian torsion", "Thick hyperechoic margin",
    "Vaginal ultrasound", "Transvaginal ultrasound", "Gestational sac",
    "Foetus", "Chocolate cyst", "Cervix",
    "Urinary bladder", "Polyp", "Cervical cyst",
]
test_labels = la2[test_label_columns].values

# Test images live in their own folder; the CSV column is `imagePath` (lower-case i).
test_image_dir = '/workspace/anushka saini/test/images'
test_image_paths = [os.path.join(test_image_dir, filename) for filename in la2['imagePath']]

# Load every image resized to 300x300 and scale pixel values into [0, 1].
# TODO: change the target size when running on GPU.
test_images = []
for test_image_path in test_image_paths:
    image = img_to_array(load_img(test_image_path, target_size=(300, 300))) / 255.0
    test_images.append(image)
test_images = np.array(test_images, dtype=np.float32)

In [None]:
# Sanity check: one entry per test image, each resized to 300x300 on load.
test_images.shape

(1468, 300, 300, 3)

In [None]:
# Sanity check: one row per test image, one column per selected finding label.
test_labels.shape

(1468, 33)

### Data Reshaping for Classifier

Reshape training, validation, and test images into flat arrays for classifier input and print their shapes.

In [None]:
# Training portion (80%) used to fit the classifiers: every image is flattened
# into a single feature row.
# NOTE(review): y_train is the label matrix built from the 33 selected finding columns,
# whereas the output saved under this cell reports a single label column - the saved
# outputs look like they come from an earlier version of the notebook; please confirm.
n_train_samples = train_images.shape[0]
x_train = train_images.reshape((n_train_samples, -1))
y_train = train_labels
print(x_train.shape, y_train.shape, sep="\n")

(2560, 270000)
(2560, 1)


In [None]:
# Validation portion (20%), used as the internal test set for classifier predictions:
# every image is flattened into a single feature row.
n_internal_samples = validate_images.shape[0]
x_test_internal = validate_images.reshape((n_internal_samples, -1))
y_test_internal = validate_labels
print(x_test_internal.shape, y_test_internal.shape, sep="\n")

(640, 270000)
(640, 1)


In [None]:
# Separate test set, used as the external test set: every image is flattened
# into a single feature row.
n_external_samples = test_images.shape[0]
x_test_external = test_images.reshape((n_external_samples, -1))
y_test_external = test_labels
print(x_test_external.shape, y_test_external.shape, sep="\n")

(1468, 270000)
(1468, 1)


### Model Training and Evaluation

Train multiple classifiers on the training data, then evaluate each one and print its accuracy, balanced accuracy, weighted F1 score, weighted recall, weighted precision, and weighted Jaccard score on both the internal (validation) and external test datasets. All scores are printed as percentages.

In [1]:
# Random forest: fit on the flattened training pixels, then score the internal
# validation split. random_state is fixed so repeated runs build the same forest;
# tree depth is left at the library default.
# NOTE(review): balanced_accuracy_score is documented for 1-D (binary/multiclass) targets,
# while the label-preparation cells select 33 columns - confirm the intended target format.
clf0 = RandomForestClassifier(random_state=0).fit(x_train, y_train)
ypred0 = clf0.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
# Every score is printed as a percentage.
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred0, **options) * 100)

Accuracy is 79.73856209150327
Balanced accuracy is 26.612548240059002
Weighted f1 score is 77.54581688085524
Weighted recall is 79.73856209150327
Weighted precision is 77.15338406092705
Weighted Jaccard score is 67.56828376571656


In [4]:
# Random forest (clf0) on the external test set; every score is printed as a percentage.
ypred1 = clf0.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred1, **options) * 100)

Accuracy is 34.468664850136236
Balanced accuracy is 12.659604467666558
Weighted f1 score is 25.618007168678204
Weighted recall is 34.468664850136236
Weighted precision is 35.98085173391296
Weighted Jaccard score is 16.046263574354093


In [5]:
# Ridge classifier: fit on the flattened training pixels, then score the internal
# validation split; every score is printed as a percentage.
clf1 = RidgeClassifier()
clf1.fit(x_train, y_train)
ypred2 = clf1.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred2, **options) * 100)

Accuracy is 76.68845315904139
Balanced accuracy is 26.243928934926064
Weighted f1 score is 75.64852643219727
Weighted recall is 76.68845315904139
Weighted precision is 75.27922417789323
Weighted Jaccard score is 64.92137402043848


In [6]:
# Ridge classifier (clf1) on the external test set; every score is printed as a percentage.
ypred3 = clf1.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred3, **options) * 100)

Accuracy is 38.76021798365122
Balanced accuracy is 13.276143577117521
Weighted f1 score is 33.68733008838638
Weighted recall is 38.76021798365122
Weighted precision is 52.68848268811881
Weighted Jaccard score is 21.69275389663793


In [7]:
# Bagging ensemble of extremely randomized trees (both seeded with 0): fit on the
# flattened training pixels, then score the internal validation split in percent.
extra_tree = ExtraTreeClassifier(random_state=0)
clf2 = BaggingClassifier(extra_tree, random_state=0)
clf2.fit(x_train, y_train)
ypred4 = clf2.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred4, **options) * 100)

Accuracy is 79.73856209150327
Balanced accuracy is 27.219725914130503
Weighted f1 score is 77.61691025559814
Weighted recall is 79.73856209150327
Weighted precision is 77.4796603673609
Weighted Jaccard score is 67.60931030146034


In [8]:
# Bagging ensemble (clf2) on the external test set; every score is printed as a percentage.
ypred5 = clf2.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred5, **options) * 100)

Accuracy is 36.10354223433242
Balanced accuracy is 12.34879580914639
Weighted f1 score is 27.312263526473984
Weighted recall is 36.10354223433242
Weighted precision is 37.7983403954412
Weighted Jaccard score is 17.182960123618987


In [9]:
# Multi-layer perceptron (seed 1, at most 300 iterations): fit on the flattened
# training pixels, then score the internal validation split in percent.
clf3 = MLPClassifier(random_state=1, max_iter=300)
clf3.fit(x_train, y_train)
ypred6 = clf3.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred6, **options) * 100)

Accuracy is 76.25272331154684
Balanced accuracy is 19.109245011325353
Weighted f1 score is 74.83530921514436
Weighted recall is 76.25272331154684
Weighted precision is 75.24111468799791
Weighted Jaccard score is 64.71210487884237


In [10]:
# Multi-layer perceptron (clf3) on the external test set; every score is printed as a percentage.
ypred7 = clf3.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred7, **options) * 100)

Accuracy is 41.55313351498637
Balanced accuracy is 9.636694787379692
Weighted f1 score is 36.886527211963646
Weighted recall is 41.55313351498637
Weighted precision is 44.20691903590792
Weighted Jaccard score is 24.065724696076565


In [11]:
# 3-nearest-neighbours classifier: fit on the flattened training pixels, then score
# the internal validation split in percent. `clf4` and `neigh` are the same object.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x_train, y_train)
clf4 = neigh
ypred8 = clf4.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred8, **options) * 100)

Accuracy is 77.77777777777779
Balanced accuracy is 29.151346123189743
Weighted f1 score is 76.81085215359379
Weighted recall is 77.77777777777779
Weighted precision is 76.45283918584155
Weighted Jaccard score is 66.19148488671269


In [12]:
# 3-nearest-neighbours classifier (clf4) on the external test set; every score is printed as a percentage.
ypred9 = clf4.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred9, **options) * 100)

Accuracy is 39.782016348773844
Balanced accuracy is 10.835645620187668
Weighted f1 score is 34.7147899367031
Weighted recall is 39.782016348773844
Weighted precision is 51.59520891135634
Weighted Jaccard score is 22.528437032668773


In [13]:
# Decision tree: fit on the flattened training pixels, then score the internal
# validation split. random_state=0 is applied to the estimator that is actually
# fitted, so repeated runs grow the same tree.
clf5 = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
ypred10 = clf5.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
# Every score is printed as a percentage.
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred10, **options) * 100)

Accuracy is 71.24183006535948
Balanced accuracy is 20.820005456841898
Weighted f1 score is 71.65513606198131
Weighted recall is 71.24183006535948
Weighted precision is 72.2432822875197
Weighted Jaccard score is 61.10887235790856


In [14]:
# Decision tree (clf5) on the external test set; every score is printed as a percentage.
ypred11 = clf5.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred11, **options) * 100)

Accuracy is 39.16893732970027
Balanced accuracy is 12.343522413953243
Weighted f1 score is 34.841820616604274
Weighted recall is 39.16893732970027
Weighted precision is 43.44099565192504
Weighted Jaccard score is 22.743032979340676


In [15]:
# Support vector classifier (RBF kernel, gamma='auto'): fit on the flattened
# training pixels, then score the internal validation split in percent.
clf6 = SVC(kernel='rbf', gamma='auto')
clf6.fit(x_train, y_train)
ypred12 = clf6.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred12, **options) * 100)

Accuracy is 74.50980392156863
Balanced accuracy is 10.093641114982578
Weighted f1 score is 66.1475107149274
Weighted recall is 74.50980392156863
Weighted precision is 62.99207855185085
Weighted Jaccard score is 56.577454736683954


In [16]:
# Support vector classifier (clf6) on the external test set; every score is printed as a percentage.
ypred13 = clf6.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred13, **options) * 100)

Accuracy is 34.67302452316076
Balanced accuracy is 6.21337890625
Weighted f1 score is 17.986411910697374
Weighted recall is 34.67302452316076
Weighted precision is 12.142673430819638
Weighted Jaccard score is 12.117807887957891


In [17]:
# Gaussian naive Bayes: fit on the flattened training pixels, then score the
# internal validation split; every score is printed as a percentage.
clf7 = GaussianNB()
clf7.fit(x_train, y_train)
ypred14 = clf7.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred14, **options) * 100)

Accuracy is 67.97385620915033
Balanced accuracy is 32.701816655367594
Weighted f1 score is 71.23079182221824
Weighted recall is 67.97385620915033
Weighted precision is 77.04567027072147
Weighted Jaccard score is 59.35858994966552


In [18]:
# Gaussian naive Bayes (clf7) on the external test set; every score is printed as a percentage.
ypred15 = clf7.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred15, **options) * 100)

Accuracy is 39.1008174386921
Balanced accuracy is 12.96374934277463
Weighted f1 score is 37.63952846134832
Weighted recall is 39.1008174386921
Weighted precision is 36.81535802833412
Weighted Jaccard score is 25.111850796264363


In [20]:
# Logistic regression with library defaults: fit on the flattened training pixels,
# then score the internal validation split; every score is printed as a percentage.
clf8 = LogisticRegression()
clf8.fit(x_train, y_train)
ypred16 = clf8.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred16, **options) * 100)

Accuracy is 78.64923747276688
Balanced accuracy is 29.261952279348264
Weighted f1 score is 77.63794822148702
Weighted recall is 78.64923747276688
Weighted precision is 77.22650379224774
Weighted Jaccard score is 67.43175895505331


In [21]:
# Logistic regression (clf8) on the external test set; every score is printed as a percentage.
ypred17 = clf8.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred17, **options) * 100)

Accuracy is 38.48773841961853
Balanced accuracy is 11.428545286808937
Weighted f1 score is 32.78842936739933
Weighted recall is 38.48773841961853
Weighted precision is 38.635339814047164
Weighted Jaccard score is 21.040473838833332


In [22]:
# AdaBoost with library defaults: fit on the flattened training pixels, then score
# the internal validation split; every score is printed as a percentage.
clf9 = AdaBoostClassifier()
clf9.fit(x_train, y_train)
ypred18 = clf9.predict(x_test_internal)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_internal, ypred18, **options) * 100)

Accuracy is 76.68845315904139
Balanced accuracy is 14.758309175487089
Weighted f1 score is 73.13478411808299
Weighted recall is 76.68845315904139
Weighted precision is 69.89660464371258
Weighted Jaccard score is 63.278094334682436


In [23]:
# AdaBoost (clf9) on the external test set; every score is printed as a percentage.
ypred19 = clf9.predict(x_test_external)
report_rows = (
    ("Accuracy is", metrics.accuracy_score, {}),
    ("Balanced accuracy is", metrics.balanced_accuracy_score, {}),
    ("Weighted f1 score is", metrics.f1_score, {"average": "weighted"}),
    ("Weighted recall is", metrics.recall_score, {"average": "weighted"}),
    ("Weighted precision is", metrics.precision_score, {"average": "weighted"}),
    ("Weighted Jaccard score is", metrics.jaccard_score, {"average": "weighted"}),
)
for caption, scorer, options in report_rows:
    print(caption, scorer(y_test_external, ypred19, **options) * 100)

Accuracy is 38.41961852861036
Balanced accuracy is 6.730921455202641
Weighted f1 score is 31.400568292172654
Weighted recall is 38.41961852861036
Weighted precision is 29.683852519685512
Weighted Jaccard score is 20.130225254093105
