In [None]:
import pandas as pd

# Load the raw merged dataset.
merge_data = pd.read_csv("merge_data.csv")

# Encode Gender: male -> 1, female -> 0. Values that are missing or not one of
# those two labels come out of the mapping as NaN and are defaulted to 0 before
# the integer cast.
gender_clean = merge_data['Gender'].str.strip().str.lower()
merge_data['Gender'] = gender_clean.map({'male': 1, 'female': 0}).fillna(0).astype(int)

# Encode Familiarity Score: "never watched" -> 0, any other recorded answer -> 1.
# The text is lower-cased before it is inspected, so the comparison has to use
# the lower-case literal; a lookup keyed on 'Never watched' can never match
# and would collapse the whole column to 0. Missing answers are treated as
# "never watched" (0).
familiarity_clean = merge_data['Familiarity Score'].str.strip().str.lower()
has_watched = familiarity_clean.fillna('never watched').ne('never watched')
merge_data['Familiarity Score'] = has_watched.astype(int)

# Save the updated dataset to a new CSV file (row index is not written).
merge_data.to_csv("merge_update.csv", index=False)

print("Data updated and saved to merge_update.csv")


In [None]:
#
#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from itertools import cycle
from scipy import interp

# Read the encoded dataset from disk.
gsr_summary = pd.read_csv("merge_update.csv")

# "Emotion" is the prediction target; every remaining column is a feature.
y = gsr_summary["Emotion"]
X = gsr_summary.drop(columns=["Emotion"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate models, keyed by the display name used in the printed metrics and
# the plot legend. Insertion order is the order they are trained and reported.
# Every estimator that accepts a seed gets the same one.
RANDOM_STATE = 3
classifiers = dict([
    ('Decision Tree', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('Random Forest', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
    # probability=True so the SVM exposes predict_proba
    ('SVM', SVC(probability=True, random_state=RANDOM_STATE)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('AdaBoost', AdaBoostClassifier(random_state=RANDOM_STATE)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('MLP', MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)),
    ('Ridge Classifier', RidgeClassifier()),
])

# Result containers, each keyed by classifier display name.
accuracy_scores = {}
precision_scores = {}
recall_scores = {}
f1_scores = {}
roc_aucs = {}   # area under the averaged ROC curve stored in fprs/tprs
fprs = {}       # shared false-positive-rate grid of the averaged curve
tprs = {}       # class-averaged true-positive rate on that grid

# One-vs-rest indicator matrix of the labels: one column per class, in the
# sorted order returned by np.unique.
classes = np.unique(y)
y_train_bin = label_binarize(y_train, classes=classes)
y_test_bin = label_binarize(y_test, classes=classes)
if len(classes) == 2 and y_test_bin.shape[1] == 1:
    # For a two-class target label_binarize returns a single column (the
    # indicator of the second class). Expand it to one column per class so the
    # columns line up with the per-class score columns used below.
    y_train_bin = np.hstack([1 - y_train_bin, y_train_bin])
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Train and evaluate each classifier
for clf_name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    # Support-weighted metrics over all classes.
    # NOTE(review): zero_division=1 scores a class that is never predicted as
    # precision 1.0, which can flatter the weighted precision -- confirm this
    # is intended.
    accuracy_scores[clf_name] = accuracy_score(y_test, y_pred)
    precision_scores[clf_name] = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall_scores[clf_name] = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1_scores[clf_name] = f1_score(y_test, y_pred, average='weighted', zero_division=1)

    # Per-class scores for the ROC analysis: probabilities when available,
    # otherwise the raw decision values.
    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_test_scaled)
    else:
        y_score = clf.decision_function(X_test_scaled)
    y_score = np.asarray(y_score)
    if y_score.ndim == 1:
        # A binary decision_function yields one signed value per sample, where
        # larger means "more like the second class". Mirror it so there is one
        # score column per class.
        y_score = np.column_stack([-y_score, y_score])

    # One-vs-rest ROC curve for every class.
    fpr = {}
    tpr = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])

    # Average the per-class curves on the union of their FPR grid points.
    # np.interp is used directly; the scipy.interp alias is deprecated and
    # absent from recent SciPy releases.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    # Store the curve together with the area under that same curve, so the
    # number reported for a classifier describes the line that represents it.
    roc_aucs[clf_name] = auc(all_fpr, mean_tpr)
    fprs[clf_name] = all_fpr
    tprs[clf_name] = mean_tpr

# Report every metric table in turn: a heading line followed by one
# "name: value" line per classifier, rounded to four decimals.
metric_tables = [
    ("Accuracy Scores:", accuracy_scores),
    ("\nPrecision Scores:", precision_scores),
    ("\nRecall Scores:", recall_scores),
    ("\nF1 Scores:", f1_scores),
]
for heading, table in metric_tables:
    print(heading)
    for clf_name, value in table.items():
        print(f"{clf_name}: {value:.4f}")

# Draw the stored ROC curve of every classifier on one set of axes.
fig, ax = plt.subplots(figsize=(12, 9))
palette = cycle(['aqua', 'darkorange', 'cornflowerblue', 'blue', 'green', 'red', 'purple', 'yellow', 'cyan', 'magenta'])

for clf_name, color in zip(classifiers, palette):
    legend_text = f'{clf_name} (AUC = {roc_aucs[clf_name]:.2f})'
    ax.plot(fprs[clf_name], tprs[clf_name], color=color, lw=2, label=legend_text)

# Dashed diagonal: the chance-level reference line (no legend entry).
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")
ax.grid(True)

# Write the figure to disk in both vector (PDF) and raster (PNG) form
# before showing it.
fig.savefig('roc_curve_plot_with_user_data.pdf', format='pdf')
fig.savefig('roc_curve_plot.png', format='png')
plt.show()