In [1]:
from scipy.stats import kendalltau
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from scipy.stats import spearmanr
import warnings
import csv

# Silence every warning for the rest of the kernel session: no category,
# message or module filter is given, so this applies to all libraries.
# NOTE(review): presumably meant to hide scikit-learn's undefined-metric
# warnings raised when a class is never predicted — confirm. A blanket filter
# also hides unrelated warnings (e.g. NumPy divide-by-zero).
warnings.filterwarnings("ignore")
def evaluate_file(file_name):
    """Print agreement metrics between machine and expert scores from a CSV file.

    The file is read as semicolon-delimited CSV. Column index 2 holds the
    machine score and column index 3 the expert score. A machine score of 0
    marks an invalid prediction: such rows are counted and excluded from
    every metric. Blank lines are skipped.

    Scores are parsed as integers so that class labels sort numerically and
    Kendall's tau ranks them by value instead of by string comparison.

    Printed, in order: the number of invalid predictions, the confusion
    matrix (rows = expert scores, columns = machine scores), the overall
    accuracy, the per-class ratio diagonal / row sum (labelled "Accuracy for
    each class"; this is the recall of each expert class), scikit-learn's
    classification report, and Kendall's tau with its p-value.

    Args:
        file_name: Path of the CSV file to evaluate.

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has fewer columns than required.
        ValueError: If a machine score, or the expert score of a valid row,
            is not an integer.
    """
    invalid_predictions = 0
    machine_scores = []
    expert_scores = []

    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(file_name, newline="") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=";")
        for row in csvreader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            try:
                machine_score = int(row[2])
                if machine_score == 0:
                    # Invalid prediction: count it and ignore the expert score.
                    invalid_predictions += 1
                    continue
                expert_score = int(row[3])
            except ValueError as error:
                raise ValueError(
                    "{}, line {}: scores must be integers ({})".format(
                        file_name, csvreader.line_num, error
                    )
                ) from error
            machine_scores.append(machine_score)
            expert_scores.append(expert_score)

    print("Results for", file_name)
    print("Invalid predictions:", invalid_predictions)

    if not machine_scores:
        # Nothing left to score; do not hand empty inputs to the metric functions.
        print("No valid predictions to evaluate.")
        return

    print("Confusion Matrix:")
    matrix = confusion_matrix(expert_scores, machine_scores)
    print(matrix)
    print()

    print("Accuracy: {}".format(accuracy_score(expert_scores, machine_scores)))
    print("Accuracy for each class:")
    # Rows are expert labels, so diagonal / row sum is the per-class recall.
    # A label that only the machine produced has a zero row sum and shows as nan.
    print(matrix.diagonal() / matrix.sum(axis=1))
    print()

    print("Classification Report:")
    # zero_division=0 reports 0.0 for undefined precision/recall without
    # relying on warnings being silenced elsewhere in the notebook.
    print(classification_report(expert_scores, machine_scores, zero_division=0))

    tau, p_value = kendalltau(expert_scores, machine_scores)
    print("Kendalls Tau:", tau)
    print("P-Value:", p_value)

In [2]:
# Evaluate the scores stored in llava-v1.5-7b_results.csv.
# NOTE(review): in the recorded output the first two confusion-matrix columns
# are all zeros, i.e. scores 1 and 2 are never predicted — hence the ~9% accuracy.
evaluate_file("llava-v1.5-7b_results.csv")

Results for llava-v1.5-7b_results.csv
Invalid predictions: 0
Confusion Matrix:
[[   0    0 3313    2]
 [   0    0 1675   16]
 [   0    0  470   39]
 [   0    0  234   73]]

Accuracy: 0.09326691858467881
Accuracy for each class:
[0.         0.         0.92337917 0.23778502]

Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00      3315
           2       0.00      0.00      0.00      1691
           3       0.08      0.92      0.15       509
           4       0.56      0.24      0.33       307

    accuracy                           0.09      5822
   macro avg       0.16      0.29      0.12      5822
weighted avg       0.04      0.09      0.03      5822

Kendalls Tau: 0.22999462527114223
P-Value: 7.597395016294397e-76


In [3]:
# Evaluate the scores stored in llava-v1.5-13b_results.csv.
# NOTE(review): in the recorded output the second confusion-matrix column is
# all zeros, i.e. score 2 is never predicted.
evaluate_file("llava-v1.5-13b_results.csv")

Results for llava-v1.5-13b_results.csv
Invalid predictions: 15
Confusion Matrix:
[[3029    0  271    0]
 [ 888    0  802    1]
 [  33    0  472    4]
 [   1    0  260   46]]

Accuracy: 0.6108145341828827
Accuracy for each class:
[0.91787879 0.         0.92730845 0.14983713]

Classification Report:
              precision    recall  f1-score   support

           1       0.77      0.92      0.84      3300
           2       0.00      0.00      0.00      1691
           3       0.26      0.93      0.41       509
           4       0.90      0.15      0.26       307

    accuracy                           0.61      5807
   macro avg       0.48      0.50      0.38      5807
weighted avg       0.51      0.61      0.52      5807

Kendalls Tau: 0.6212214725416733
P-Value: 0.0


In [4]:
# Evaluate the scores stored in llava-v1.6-vicuna-7b_results.csv.
# NOTE(review): in the recorded output the first confusion-matrix column is
# all zeros, i.e. score 1 is never predicted — hence the ~10% accuracy.
evaluate_file("llava-v1.6-vicuna-7b_results.csv")

Results for llava-v1.6-vicuna-7b_results.csv
Invalid predictions: 0
Confusion Matrix:
[[   0   10 3302    3]
 [   0    3 1651   37]
 [   0    0  395  114]
 [   0    0  117  190]]

Accuracy: 0.10099622122981793
Accuracy for each class:
[0.         0.0017741  0.77603143 0.61889251]

Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00      3315
           2       0.23      0.00      0.00      1691
           3       0.07      0.78      0.13       509
           4       0.55      0.62      0.58       307

    accuracy                           0.10      5822
   macro avg       0.21      0.35      0.18      5822
weighted avg       0.10      0.10      0.04      5822

Kendalls Tau: 0.38325152674062685
P-Value: 1.5047128164798846e-207


In [5]:
# Evaluate the scores stored in llava-v1.6-vicuna-13b_results.csv.
# NOTE(review): in the recorded output 105 rows are excluded as invalid
# predictions, and the ratio printed for class 2 is only about 0.02.
evaluate_file("llava-v1.6-vicuna-13b_results.csv")

Results for llava-v1.6-vicuna-13b_results.csv
Invalid predictions: 105
Confusion Matrix:
[[3152   14   50    0]
 [1225   39  421    0]
 [  97   11  394    7]
 [   1    1  244   61]]

Accuracy: 0.6377470701416827
Accuracy for each class:
[0.9800995  0.0231454  0.7740668  0.19869707]

Classification Report:
              precision    recall  f1-score   support

           1       0.70      0.98      0.82      3216
           2       0.60      0.02      0.04      1685
           3       0.36      0.77      0.49       509
           4       0.90      0.20      0.33       307

    accuracy                           0.64      5717
   macro avg       0.64      0.49      0.42      5717
weighted avg       0.65      0.64      0.54      5717

Kendalls Tau: 0.6201305483119269
P-Value: 0.0
