In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Function to load data from Excel
def load_data(text_file, tfidf_file):
    # Load text and labels
    text_data = pd.read_excel(text_file)
    
    # Load TF-IDF vectors
    tfidf_data = pd.read_excel(tfidf_file)
    
    # Extract features and labels
    X = tfidf_data.values  # TF-IDF vectors
    y = text_data['Label'].values  # Labels (questions, answers, statements)
    
    return X, y

# Function to perform hyperparameter tuning using RandomizedSearchCV
def tune_hyperparameters_perceptron(X_train, y_train):
    perceptron = Perceptron()
    param_distributions = {
        'penalty': [None, 'l2', 'l1', 'elasticnet'],
        'alpha': np.logspace(-4, 1, 10),
        'max_iter': [1000, 2000, 3000],
        'tol': [1e-4, 1e-3, 1e-2],
    }
    search = RandomizedSearchCV(perceptron, param_distributions, n_iter=10, cv=5, random_state=42)
    search.fit(X_train, y_train)
    
    return search.best_estimator_, search.best_params_

# Function to perform hyperparameter tuning for MLP
def tune_hyperparameters_mlp(X_train, y_train):
    mlp = MLPClassifier()
    param_distributions = {
        'hidden_layer_sizes': [(50,), (100,), (50,50), (100,100)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': np.logspace(-5, 3, 5),
        'learning_rate': ['constant', 'adaptive'],
    }
    search = RandomizedSearchCV(mlp, param_distributions, n_iter=10, cv=5, random_state=42)
    search.fit(X_train, y_train)
    
    return search.best_estimator_, search.best_params_

# Main program
if __name__ == "__main__":
    # Load your dataset files
    text_file = r"D:\ML\Final Viva Text.xlsx"  
    tfidf_file = r"D:\ML\tfidf_vectors.xlsx"
    
    X, y = load_data(text_file, tfidf_file)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Tune hyperparameters for Perceptron
    best_perceptron, best_perceptron_params = tune_hyperparameters_perceptron(X_train, y_train)
    print("Best Perceptron Hyperparameters:", best_perceptron_params)
    
    # Tune hyperparameters for MLP
    best_mlp, best_mlp_params = tune_hyperparameters_mlp(X_train, y_train)
    print("Best MLP Hyperparameters:", best_mlp_params)
    
    # Evaluate the models on the test set
    perceptron_predictions = best_perceptron.predict(X_test)
    mlp_predictions = best_mlp.predict(X_test)
    
    print("Perceptron Classification Report:\n", classification_report(y_test, perceptron_predictions))
    print("MLP Classification Report:\n", classification_report(y_test, mlp_predictions))


Best Perceptron Hyperparameters: {'tol': 0.0001, 'penalty': 'l2', 'max_iter': 2000, 'alpha': 0.0001}




Best MLP Hyperparameters: {'solver': 'adam', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (50,), 'alpha': 10.0, 'activation': 'tanh'}
Perceptron Classification Report:
               precision    recall  f1-score   support

      Answer       0.46      0.61      0.53        54
    Question       0.94      0.50      0.65        60
   Statement       0.41      0.48      0.44        58

    accuracy                           0.53       172
   macro avg       0.60      0.53      0.54       172
weighted avg       0.61      0.53      0.54       172

MLP Classification Report:
               precision    recall  f1-score   support

      Answer       0.58      0.61      0.59        54
    Question       0.74      0.82      0.78        60
   Statement       0.51      0.43      0.47        58

    accuracy                           0.62       172
   macro avg       0.61      0.62      0.61       172
weighted avg       0.61      0.62      0.62       172



In [21]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import catboost as cb
from sklearn.metrics import classification_report, accuracy_score

# Function to load data from Excel
def load_data(text_file, tfidf_file):
    # Load text and labels
    text_data = pd.read_excel(text_file)
    
    # Load TF-IDF vectors
    tfidf_data = pd.read_excel(tfidf_file)
    
    # Extract features and labels
    X = tfidf_data.values  # TF-IDF vectors
    y = text_data['Label'].values  # Labels (questions, answers, statements)
    
    return X, y

# Function to encode labels into numerical form
def encode_labels(y):
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)  # Convert labels like 'Question', 'Answer', 'Statement' into numeric values
    return y_encoded, le

# Function to evaluate classifiers and collect results
def evaluate_classifiers(X_train, X_test, y_train, y_test):
    classifiers = {
        'SVM': SVC(),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'AdaBoost': AdaBoostClassifier(),
        'XGBoost': xgb.XGBClassifier(),
        'CatBoost': cb.CatBoostClassifier(verbose=0),
        'Naive Bayes': GaussianNB(),
    }

    results = []

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        
        # Calculate performance metrics
        accuracy = accuracy_score(y_test, predictions)
        class_report = classification_report(y_test, predictions, output_dict=True)
        
        results.append({
            'Classifier': name,
            'Accuracy': accuracy,
            'Precision (Macro Avg)': class_report['macro avg']['precision'],
            'Recall (Macro Avg)': class_report['macro avg']['recall'],
            'F1-Score (Macro Avg)': class_report['macro avg']['f1-score']
        })
    
    return pd.DataFrame(results)

# Main program
if __name__ == "__main__":
    # Load your dataset files
    text_file = r"D:\ML\Final Viva Text.xlsx"  
    tfidf_file = r"D:\ML\tfidf_vectors.xlsx"
    
    X, y = load_data(text_file, tfidf_file)

    # Encode labels (e.g., 'Question' -> 0, 'Answer' -> 1, 'Statement' -> 2)
    y_encoded, label_encoder = encode_labels(y)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)
    
    # Evaluate classifiers and tabulate results
    results_df = evaluate_classifiers(X_train, X_test, y_train, y_test)
    
    # Display the tabulated results
    print(results_df)

    # Optional: Decode the predicted labels back to original form if necessary
    # predicted_labels = label_encoder.inverse_transform(predicted_numeric_labels)




      Classifier  Accuracy  Precision (Macro Avg)  Recall (Macro Avg)  \
0            SVM  0.593023               0.640211            0.591230   
1  Decision Tree  0.569767               0.584325            0.570605   
2  Random Forest  0.610465               0.611995            0.609919   
3       AdaBoost  0.505814               0.550043            0.506556   
4        XGBoost  0.622093               0.628632            0.622180   
5       CatBoost  0.651163               0.659094            0.650532   
6    Naive Bayes  0.296512               0.294907            0.294423   

   F1-Score (Macro Avg)  
0              0.600642  
1              0.572082  
2              0.607072  
3              0.512152  
4              0.623565  
5              0.653048  
6              0.293762  
