In [27]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Load data
# Reference annotations; passed to evaluate_model as the "true" frame.
ground_truth = pd.read_csv("../source_data/test_preprocess.csv")

# Display name -> predictions CSV. Insertion order is kept by the dict, so it
# is also the order in which the models are evaluated and concatenated.
prediction_files = {
    "SVM": "../evaluation/svm_predictions.csv",
    "BERT": "../evaluation/bert_predictions.csv",
    # zero-shot models
    "GPT-3.5 Turbo Zero-Shot": "../evaluation/zero_shot_gpt-3.5-turbo_predictions.csv",
    "GPT-4 Turbo Zero-Shot": "../evaluation/zero_shot_gpt-4-turbo_predictions.csv",
    "GPT-4o Zero-Shot": "../evaluation/zero_shot_gpt-4o_predictions.csv",
    # few-shot models
    "GPT-3.5 Turbo Few-Shot": "../evaluation/few_shot_gpt-3.5-turbo_predictions.csv",
    "GPT-4 Turbo Few-Shot": "../evaluation/few_shot_gpt-4-turbo_predictions.csv",
    "GPT-4o Few-Shot": "../evaluation/few_shot_gpt-4o_predictions.csv",
}
predictions = {
    display_name: pd.read_csv(csv_path)
    for display_name, csv_path in prediction_files.items()
}

# Configuration
# Aspect columns scored for every model. evaluate_model looks each one up as
# "<aspect>_true" / "<aspect>_pred" after merging, so every name listed here
# has to be a column of both the ground-truth and the prediction frames.
aspect_keys = [
    "ac",
    "air_panas",
    "bau",
    "general",
    "kebersihan",
    "linen",
    "service",
    "sunrise_meal",
    "tv",
    "wifi",
]

def evaluate_model(true_df, pred_df, model_name, labels=None):
    """Score one model's predictions against the ground truth, aspect by aspect.

    The two frames are inner-joined on the ``review`` text, so reviews that
    appear in only one of them are left out of the evaluation; the ``support``
    column of the result shows how many rows were actually scored.

    Parameters
    ----------
    true_df : pandas.DataFrame
        Ground truth with a ``review`` column and one label column per entry
        of the module-level ``aspect_keys``.
    pred_df : pandas.DataFrame
        Predictions with the same ``review`` and aspect columns.
    model_name : str
        Value written to the ``model`` column of every returned row.
    labels : list, optional
        Label values forwarded to ``classification_report(labels=...)``, in
        the order that should be named ``neg``, ``pos``, ``neut``. With the
        default ``None`` scikit-learn uses the sorted distinct labels found in
        the data, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per aspect with accuracy, macro/weighted averages, per-class
        precision/recall/F1 and the support of the weighted average.

    Raises
    ------
    KeyError
        If ``review`` or any aspect column is missing from either frame.
    pandas.errors.MergeError
        If a review text is not unique in either frame. Without this check
        duplicated texts would be cross-joined and counted several times.
    ValueError
        From scikit-learn when ``labels`` is None and an aspect does not have
        exactly three distinct label values.
    """
    # Keep only the columns that are scored: every aspect column then exists on
    # both sides and is guaranteed to receive its _true / _pred suffix.
    join_cols = ["review", *aspect_keys]
    merged = true_df[join_cols].merge(
        pred_df[join_cols],
        on="review",
        suffixes=('_true', '_pred'),
        validate="one_to_one",  # duplicate review texts would silently inflate the row count
    )

    # (output prefix, key in the classification_report dict)
    report_sections = (
        ('macro', 'macro avg'),
        ('weighted', 'weighted avg'),
        ('neg', 'neg'),
        ('pos', 'pos'),
        ('neut', 'neut'),
    )
    # (output suffix, metric name used by scikit-learn)
    report_metrics = (('precision', 'precision'), ('recall', 'recall'), ('f1', 'f1-score'))

    results = []
    for aspect in aspect_keys:
        y_true = merged[f"{aspect}_true"]
        y_pred = merged[f"{aspect}_pred"]

        # NOTE(review): target_names are attached positionally to the sorted
        # label values (or to `labels` when given). If the CSVs hold the strings
        # 'neg'/'neut'/'pos', or integers ordered neg < neut < pos, the names
        # 'pos' and 'neut' below are swapped — TODO confirm the label encoding.
        report = classification_report(
            y_true,
            y_pred,
            labels=labels,
            target_names=['neg', 'pos', 'neut'],
            output_dict=True,
            # Same numbers as the default ("warn" also yields 0), but stated
            # explicitly instead of relying on globally silenced warnings.
            zero_division=0,
        )

        row = {
            'model': model_name,
            'aspect': aspect,
            'accuracy': accuracy_score(y_true, y_pred),
        }
        for prefix, section in report_sections:
            for suffix, metric in report_metrics:
                row[f'{prefix}_{suffix}'] = report[section][metric]
        row['support'] = report['weighted avg']['support']
        results.append(row)

    return pd.DataFrame(results)

# Main evaluation pipeline
# Score every loaded model against the shared ground truth, one per-aspect
# results frame per model, announcing each model before it is evaluated.
all_results = []
for model_name in predictions:
    pred_df = predictions[model_name]
    print(f"Evaluating {model_name}...")
    model_results = evaluate_model(
        true_df=ground_truth,
        pred_df=pred_df,
        model_name=model_name,
    )
    all_results.append(model_results)

# Generate key outputs
# Stack the per-model frames into one long table with a fresh 0..n-1 index.
full_results = pd.concat(all_results, ignore_index=True)

Evaluating SVM...
Evaluating BERT...
Evaluating GPT-3.5 Turbo Zero-Shot...
Evaluating GPT-4 Turbo Zero-Shot...
Evaluating GPT-4o Zero-Shot...
Evaluating GPT-3.5 Turbo Few-Shot...
Evaluating GPT-4 Turbo Few-Shot...
Evaluating GPT-4o Few-Shot...


In [28]:
# 1. Model Comparison Table
# Metric columns averaged per model. Each model contributes one row per aspect
# to full_results, so this is an unweighted mean over aspects.
comparison_metrics = [
    'accuracy',
    'macro_precision', 'macro_recall', 'macro_f1',
    'weighted_precision', 'weighted_recall', 'weighted_f1',
    'neg_precision', 'neg_recall', 'neg_f1',
    'pos_precision', 'pos_recall', 'pos_f1',
    'neut_precision', 'neut_recall', 'neut_f1',
    'support',
]

model_comparison = (
    full_results
    .groupby('model')[comparison_metrics]
    .mean()
    .reset_index()
)

print("\n=== AVG Model Comparison ===")
model_comparison.round(4)


=== AVG Model Comparison ===


Unnamed: 0,model,accuracy,macro_precision,macro_recall,macro_f1,weighted_precision,weighted_recall,weighted_f1,neg_precision,neg_recall,neg_f1,pos_precision,pos_recall,pos_f1,neut_precision,neut_recall,neut_f1,support
0,BERT,0.9608,0.8129,0.8093,0.806,0.9595,0.9608,0.9595,0.8238,0.8362,0.8269,0.9787,0.9761,0.9774,0.6363,0.6155,0.6136,286.0
1,GPT-3.5 Turbo Few-Shot,0.9154,0.8143,0.8193,0.8058,0.9221,0.9154,0.916,0.7962,0.7544,0.7505,0.9399,0.9559,0.9469,0.7068,0.7475,0.72,286.0
2,GPT-3.5 Turbo Zero-Shot,0.915,0.7955,0.781,0.7814,0.917,0.915,0.914,0.7833,0.7069,0.7338,0.936,0.9612,0.948,0.6671,0.6748,0.6624,286.0
3,GPT-4 Turbo Few-Shot,0.9329,0.8414,0.8702,0.834,0.9505,0.9329,0.9362,0.8384,0.8315,0.8036,0.962,0.9543,0.9553,0.7236,0.8249,0.7431,286.0
4,GPT-4 Turbo Zero-Shot,0.9283,0.8276,0.8771,0.8307,0.9514,0.9283,0.9328,0.8327,0.8529,0.8118,0.9677,0.944,0.9514,0.6823,0.8345,0.7288,286.0
5,GPT-4o Few-Shot,0.9455,0.8424,0.8603,0.8391,0.9513,0.9455,0.9455,0.8549,0.8148,0.822,0.9625,0.9703,0.9654,0.7099,0.7959,0.7299,286.0
6,GPT-4o Zero-Shot,0.8909,0.7916,0.641,0.6777,0.898,0.8909,0.8826,0.8526,0.6123,0.6871,0.8947,0.9739,0.9307,0.6275,0.3367,0.4152,286.0
7,SVM,0.935,0.7,0.6718,0.6724,0.9293,0.935,0.9305,0.7453,0.745,0.7435,0.9583,0.9741,0.966,0.3962,0.2965,0.3077,286.0


In [29]:
# 2. Aspect-Level Performance
# Metric columns reported for every (model, aspect) pair.
# NOTE(review): 'accuracy' is not part of this table although the model-level
# comparison includes it — confirm the omission is intended.
aspect_metrics = [
    'macro_precision', 'macro_recall', 'macro_f1',
    'weighted_precision', 'weighted_recall', 'weighted_f1',
    'neg_precision', 'neg_recall', 'neg_f1',
    'pos_precision', 'pos_recall', 'pos_f1',
    'neut_precision', 'neut_recall', 'neut_f1',
    'support',
]

aspect_performance = (
    full_results
    .groupby(['model', 'aspect'])[aspect_metrics]
    .mean()
    .reset_index()
)

print("\n=== AVG Aspect-Level Performance ===")
aspect_performance.round(4)


=== AVG Aspect-Level Performance ===


Unnamed: 0,model,aspect,macro_precision,macro_recall,macro_f1,weighted_precision,weighted_recall,weighted_f1,neg_precision,neg_recall,neg_f1,pos_precision,pos_recall,pos_f1,neut_precision,neut_recall,neut_f1,support
0,BERT,ac,0.9258,0.9838,0.9510,0.9908,0.9895,0.9898,0.9773,0.9556,0.9663,1.0000,0.9957,0.9978,0.8000,1.0000,0.8889,286.0
1,BERT,air_panas,0.6084,0.6349,0.6209,0.9603,0.9650,0.9624,0.8333,0.9211,0.8750,0.9918,0.9837,0.9877,0.0000,0.0000,0.0000,286.0
2,BERT,bau,0.9378,0.9647,0.9503,0.9644,0.9615,0.9624,0.8261,0.9268,0.8736,0.9874,0.9672,0.9772,1.0000,1.0000,1.0000,286.0
3,BERT,general,0.7168,0.6873,0.6963,0.9404,0.9441,0.9419,0.3333,0.2000,0.2500,0.9671,0.9671,0.9671,0.8500,0.8947,0.8718,286.0
4,BERT,kebersihan,0.8854,0.9170,0.8996,0.9069,0.9056,0.9057,0.8830,0.8646,0.8737,0.9355,0.9177,0.9265,0.8378,0.9688,0.8986,286.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,SVM,linen,0.9279,0.6311,0.6601,0.9007,0.8986,0.8873,0.8784,0.8125,0.8442,0.9052,0.9695,0.9363,1.0000,0.1111,0.2000,286.0
76,SVM,service,0.8437,0.7924,0.8148,0.9000,0.9021,0.8996,0.6905,0.6744,0.6824,0.9442,0.9807,0.9621,0.8966,0.7222,0.8000,286.0
77,SVM,sunrise_meal,0.7525,0.7688,0.7521,0.9599,0.9580,0.9581,0.7273,0.5714,0.6400,0.9848,0.9848,0.9848,0.5455,0.7500,0.6316,286.0
78,SVM,tv,0.6359,0.6397,0.6378,0.9687,0.9790,0.9738,0.9231,0.9231,0.9231,0.9846,0.9961,0.9903,0.0000,0.0000,0.0000,286.0
