## Part 1 - Data Analysis and Bayes Nets

### 1) Data Visualization and Exploration

Importing the required libraries and checking for the python version.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3,5)

In [2]:
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

In [3]:
# Common imports
import numpy as np
import pandas as pd
import os
import cv2  # OpenCV Python library for computer vision

# to make this notebook's output stable across runs
np.random.seed(42)

In [4]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [5]:
import seaborn as sns

In [6]:
import random

Loading only the training set for Part 1 of the coursework

In [7]:
CW_DATASET_PATH = "CW_dataset"

def load_train_data(dataset_path=CW_DATASET_PATH):
    x_train_all_path = os.path.join(dataset_path, "x_train_all.csv")
    y_train_all_path = os.path.join(dataset_path, "y_train_all.csv")
    x_test_all_path = os.path.join(dataset_path, "x_test_all.csv")
    y_test_all_path = os.path.join(dataset_path, "y_test_all.csv")

    x_train_all = pd.read_csv(x_train_all_path)
    y_train_all = pd.read_csv(y_train_all_path)
    x_test_all = pd.read_csv(x_test_all_path)
    y_test_all = pd.read_csv(y_test_all_path)

    return x_train_all, y_train_all, x_test_all,y_test_all

In [8]:
x_train, y_train, x_test, y_test = load_train_data()

FileNotFoundError: [Errno 2] No such file or directory: 'CW_dataset/x_test_all.csv'

Analysing the dataset

In [None]:
#Checking the shape of the data
x_train.shape

In [None]:
y_train.shape

In [None]:
#Displaying the first 5 rows of the dataset
x_train.head()

In [None]:
y_train.head()

In [None]:
#Checking for missing values
x_train.isnull().sum

In [None]:
y_train.isnull().sum

In [None]:
#To check if there are any missing values in the data frame
x_train.isna().any().any()

In [None]:
y_train.isna().any().any()

In [None]:
# Get a summary of the dataset using describe
x_train.describe()

In [None]:
y_train.describe()

In [None]:
label_counts = y_train['0'].value_counts().sort_index()
label_counts

Visualising the dataset using graphs

In [None]:
plt.figure(figsize=(8, 6))
sns.countplot(x='0', data=y_train)
plt.title("Distribution of Class Labels")
plt.xlabel("Class Labels")
plt.ylabel("Count")
plt.show()

In [None]:
# To display one image for each label

# Initialize a dictionary to store one image for each label
label_images = {}

# Iterate through the rows of the DataFrames and find one image for each label
for index, row in x_train.iterrows():
    label = y_train.iloc[index, 0]  
    
    if label not in label_images:
        # Store the first image for each unique label
        label_images[label] = row.values.reshape(48, 48)  
    
    # Break the loop if we have found one image for each unique label
    if len(label_images) == 10:
        break

# Display the images
fig, axs = plt.subplots(2, 5, figsize=(12, 6))
for i, (label, image) in enumerate(label_images.items()):
    r, c = divmod(i, 5)
    axs[r, c].imshow(image)
    axs[r, c].set_title(f'Label: {label}')
    axs[r, c].axis('off')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

unique_class_labels = y_train['0'].unique()

fig, axes = plt.subplots(1, len(unique_class_labels), figsize=(20, 5))

for i, class_label in enumerate(unique_class_labels):
    # Select a representative image for each class
    class_images = x_train[y_train['0'] == class_label]
    
    if not class_images.empty:
        representative_image = class_images.iloc[0, :-1].values
        
        # Plot the histogram for the representative image of each class
        axes[i].hist(representative_image, bins=100)
        axes[i].set_title(f'Class {class_label}')
        axes[i].set_xlabel("Pixel Value")
        axes[i].set_ylabel("Frequency")

plt.show()


In [23]:
# to be fixed

# # store image data for each label
# label_images = {}

# # just to check if all images are being plotted
# num_images_per_label = {}

# for index, row in x_train.iterrows():
#     label = y_train.iloc[index, 0]  
#     image_data = row.values.reshape(-1, 48, 48) 

#     if label not in label_images:
#         label_images[label] = []
    
#     # add image data to associated labels
#     label_images[label].append(image_data)

#     # just to check if all images are being plotted
#     num_images_per_label[label] = len(label_images[label])

# # get numpy array from dictionary values (image data)
# box_data = np.array(list(label_images.values()))

# # labels from keys
# labels = list(label_images.keys())

# # box plots for each label 
# plt.figure(figsize=(12, 6))
# plt.boxplot(box_data, labels=labels, vert=False)
# plt.title("Box Plots for each label")
# plt.xlabel("Pixel Values")
# plt.ylabel("Label")
# plt.show()


# # just to check if all images are being plotted
# for label, num_images in num_images_per_label.items():
#     print(f"Label {label}: {num_images} images")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

### 2) Preprocessing

To be completed

### 3) Running Naïve Bayes Classifier on Train Dataset

In [None]:

from sklearn.naive_bayes import GaussianNB

x_train = np.asarray(x_train)
train_images = x_train.reshape(x_train.shape[0], -1)
y_train = np.asarray(y_train).ravel()
clf = GaussianNB()
clf.fit(train_images, y_train)

predictions = clf.predict(x_train)

"""
#Using the stratified train test split to split the data into train and test sets
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
train_images = train_images.reshape(train_images.shape[0], -1)
X_train, X_test, Y_train, Y_test = train_test_split(train_images,y_train,test_size=0.33, random_state=42)
clf2 = GaussianNB()
clf2.fit(X_train, Y_train)
predictions2 = clf2.predict(X_test)
"""

### 4) Evaluation Metrics for the Naïve Bayes Classifier on Dataset

In [None]:
from sklearn.metrics import classification_report

report = classification_report(y_test, predictions)

print(report)

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(y_test, predictions)
confusion_matrix

In [None]:
tp = confusion_matrix.diagonal()
fn = np.sum(confusion_matrix, axis=1) - tp
fp = np.sum(confusion_matrix, axis=0) - tp
tn = np.sum(confusion_matrix) - (tp + fn + fp)

tp_rate = tp / (tp + fn)

fp_rate = fp / (fp + tn)

for class_label, tp, fp in zip(range(len(tp_rate)), tp_rate, fp_rate):
    print(f"Class {class_label}: \nTP Rate = {tp}, \nFP Rate = {fp}")

In [None]:
specificity = []
unique_labels = np.unique(y_test)

for i in range(len(unique_labels)):
    true_negative = np.sum(confusion_matrix) - np.sum(confusion_matrix[i, :]) - np.sum(confusion_matrix[:, i]) + confusion_matrix[i, i]
    total_negative = np.sum(confusion_matrix) - np.sum(confusion_matrix[i, :])
    print(f'Label {unique_labels[i]} specificity: {true_negative / total_negative}')

In [None]:
from sklearn.metrics import recall_score

sensitivity = recall_score(y_test, predictions, average=None)
for i in range(len(unique_labels)):
    print(f'Label {unique_labels[i]} sensitivity: {sensitivity[i]}')

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
def plot_roc_curves(probabilities,  y_test, unique_labels):
    roc_auc_scores = []
    plt.figure(figsize=(8, 6))

    for i in range(len(unique_labels)):
        fpr, tpr, _ = roc_curve((y_test == unique_labels[i]).astype(int), probabilities[:, i])
        roc_auc = auc(fpr, tpr)
        roc_auc_scores.append(roc_auc)
        plt.plot(fpr, tpr, lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))

    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')
    plt.show()
    
probabilities = clf.predict_proba(x_test)
plot_roc_curves(probabilities, y_test, unique_labels)

#### Evaluation Using Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, x_train, y_train, scoring="accuracy", cv=10)
scores

In [None]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(scores)

In [None]:
scores = cross_val_score(clf, x_train, y_train, scoring="neg_mean_squared_error", cv=10)

clf_rmse_scores = np.sqrt(-scores)

display_scores(clf_rmse_scores)

### 5) Top Correlating Features - Feature Selection

In [42]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

no_of_features_per_class = [5, 10, 20]
top_features_per_cd = {}

x_train_class_vs_rest = pd.DataFrame(x_train)

for class_label in range(10):  
    onevrsall_path = os.path.join("OnevrsAll", f"{class_label}_vrs_all")
    y_train_path = os.path.join(onevrsall_path, f"y_train_{class_label}.csv")
    y_train_class_vs_rest = pd.read_csv(y_train_path)
    
    top_features_per_class = {}
    # Train a logistic regression classifier for the current one-vs-rest classification task
    classifier = OneVsRestClassifier(LogisticRegression())
    classifier.fit(x_train_class_vs_rest, y_train_class_vs_rest)
    
    # Get the coefficients (weights) for the features
    feature_weights = classifier.estimators_[0].coef_[0]
    # Sort the features by their absolute weights and select the top features
    for no_f in no_of_features_per_class:
        top_feature_indices = np.argsort(np.abs(feature_weights))[::-1][:no_f]
        top_features = x_train_class_vs_rest.columns[top_feature_indices]
        top_features_per_class[no_f] = top_features.tolist()
    
    # Store the top features for the current class
    top_features_per_cd[class_label] = top_features_per_class

# Create the final datasets with selected features
final_datasets = {}
for no_f in no_of_features_per_class:
    dataset_name = f"Data set {no_f}"
    
    # Combine selected features for all classes
    selected_features = []
    for class_label, top_features in top_features_per_cd.items():
        selected_features.extend(top_features[no_f])
    
    # Create the final dataset with selected features
    final_datasets[dataset_name] = x_train_class_vs_rest[selected_features]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [43]:
dataset1 = final_datasets['Data set 5']
dataset2 = final_datasets['Data set 10']
dataset3 = final_datasets['Data set 20']

In [None]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
import numpy as np

class_labels = list(range(10))

def evaluate_multinomial_nb(X_train, X_test, y_train, y_test):
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)
    y_pred = nb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    cross_val_scores = cross_val_score(nb_model, X_train, y_train, scoring="accuracy", cv=10)
    return accuracy, precision, recall, f1, roc_auc, cross_val_scores

datasets = [dataset1, dataset2, dataset3]  
y_train_dict = {}
y_test_dict = {}
evaluation_metrics = []
dataset_number_features = [5, 10, 20]

for dataset_number, dataset in enumerate(datasets):
    X_train_fr = dataset
    x_test_int = x_test.astype('int64')
    x_test_int.columns = x_test_int.columns.astype('int64')
    X_test_fr = pd.DataFrame(x_test_int[dataset.columns], columns=dataset.columns)
    
    for class_label in class_labels:
        onevrsall_path = os.path.join("OnevrsAll", f"{class_label}_vrs_all")
        y_train_file = os.path.join(onevrsall_path, f"y_train_{class_label}.csv")
        y_test_file = os.path.join(onevrsall_path, f"y_test_{class_label}.csv")
        y_train_df = pd.read_csv(y_train_file).values.ravel()
        y_test_df = pd.read_csv(y_test_file).values.ravel()
    
        accuracy, precision, recall, f1, roc_auc, cross_val_scores = evaluate_multinomial_nb(X_train_fr, X_test_fr, y_train_df, y_test_df)

        evaluation_metrics.append({
            "Dataset": str(dataset_number_features[dataset_number]),
            "Class Label": class_label,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC AUC": roc_auc,
            "Cross Val Scores": np.mean(cross_val_scores) 
        })

df_evaluation_metrics = pd.DataFrame(evaluation_metrics)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

for metric in metrics:
    plt.figure(figsize=(4, 4)) 
    for dataset_number in [5, 10, 20]:
        dataset_metrics = df_evaluation_metrics[df_evaluation_metrics['Dataset'] == str(dataset_number)]
        x_values = dataset_metrics['Class Label']
        y_values = dataset_metrics[metric]
        plt.plot(x_values, y_values, marker='o', label=f'Dataset {dataset_number}')

    plt.title(f'{metric} by Class Label for Different Datasets')
    plt.xlabel('Class Label')
    plt.ylabel(metric)
    plt.legend()
    plt.xticks(x_values)  
    plt.xticks(rotation=45)
    plt.show()


In [None]:
df_evaluation_metrics