In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


In [None]:
def evaluate_model_cv(model, features, labels, cv_splits=5):
    """
    Cross-validate ``model`` and return its mean accuracy, precision, recall, and F1-score.

    All four metrics are scored in a single ``cross_validate`` pass, so the
    model is fitted once per fold instead of once per fold for every metric.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier to evaluate.
    features : array-like
        Feature matrix passed to the estimator.
    labels : array-like
        Class labels; also used to stratify the folds.
    cv_splits : int, default=5
        Number of stratified folds.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``, each
        mapped to the mean test score across folds. Precision, recall and F1
        use ``average='weighted'`` with ``zero_division=0``.
    """
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='weighted', zero_division=0),
        'recall': make_scorer(recall_score, average='weighted', zero_division=0),
        'f1': make_scorer(f1_score, average='weighted', zero_division=0)
    }

    # Fixed seed keeps the shuffled fold assignment reproducible between runs.
    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)

    # cross_validate reports each metric's per-fold scores under "test_<name>".
    cv_results = cross_validate(model, features, labels, cv=skf, scoring=scoring)
    return {metric: cv_results[f"test_{metric}"].mean() for metric in scoring}



In [None]:
# Dataset location, given as a path relative to the current working directory
file_path = "English-40TOPICS.csv"

# Read the CSV only when it is present; otherwise stop with an explicit message
if os.path.exists(file_path):
    data = pd.read_csv(file_path)
else:
    raise FileNotFoundError(f"The dataset file '{file_path}' was not found. Please ensure it is in the correct directory.")

# Assumption: the final column holds the labels and every column before it is a feature
labels = data.iloc[:, -1].values
features = data.iloc[:, :-1].values



In [None]:
# Ensure labels are categorical (integer class ids)
if labels.dtype.kind in {'f', 'u'}:  # Check if labels are float or unsigned int
    # Casting NaN to int produces an arbitrary integer that would silently
    # become a class of its own, so reject missing labels up front.
    if labels.dtype.kind == 'f' and np.isnan(labels).any():
        raise ValueError("The label column contains missing values (NaN); drop or fill those rows before encoding.")
    labels = labels.astype(int)  # Convert to integer if numeric
elif labels.dtype.kind == 'O':  # Check if labels are object (e.g., strings)
    # LabelEncoder comes from the notebook's top-level import cell.
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)

# Apply Min-Max Scaling to ensure non-negative values for LDA
# NOTE(review): the scaler is fitted on every row before the cross-validation
# split, so held-out folds influence the scaling range — confirm this is acceptable.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)


In [None]:
# Apply LDA (Latent Dirichlet Allocation) for feature extraction
# NOTE(review): LDA is fitted on all rows before cross-validation, so the
# extracted features have seen the held-out folds — confirm this is acceptable.
lda = LatentDirichletAllocation(n_components=10, random_state=42)  # Adjust n_components as needed
features_lda = lda.fit_transform(features)

# Latent Dirichlet Allocation is a topic model, not a variance decomposition,
# so it has no "explained variance". The value reported here is the variance
# of each topic's feature weights (one row of components_ per topic).
# The variable name is kept unchanged in case later cells refer to it.
explained_variance = lda.components_.var(axis=1)
print("\nVariance of Feature Weights per LDA Topic:")
for i, var in enumerate(explained_variance):
    print(f"Topic {i + 1}: {var:.4f}")

# Initialize classifiers, keyed by the display name used when reporting results
classifiers = {
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB()
}


In [None]:
# Score every classifier on the LDA features, keyed by its display name
results = {
    name: evaluate_model_cv(clf, features_lda, labels)
    for name, clf in classifiers.items()
}

# Report the averaged metrics, one section per model
print("\nClassification Results:")
for model_name in results:
    print(f"\n{model_name}:")
    for metric, score in results[model_name].items():
        print(f"  {metric.capitalize()}: {score:.4f}")

# Show the column count going into LDA next to the count coming out
print("\nFeature Reduction:")
print(f"Original Feature Count: {features.shape[1]}")
print(f"Reduced Feature Count: {features_lda.shape[1]}")
