In [None]:
import pandas as pd
import numpy as np
from scipy.stats import kendalltau


from stories.llm_wrappers import OllamaWrapper
from stories.prompts import make_rank_evaluation_prompt

In [2]:
# Load the evaluation prompt that is sent to the LLM.
# The encoding is explicit so the text decodes identically on every platform
# (the default for open() depends on the locale).
with open("evaluation_prompt_0.txt", "r", encoding="utf-8") as fin:
    prompt = fin.read()

In [3]:
# Wrapper around the model used to produce the feature ranking.
# NOTE(review): temperature=0.0 is presumably meant to make generation
# deterministic — confirm against OllamaWrapper.
llm = OllamaWrapper(
    "llama3.1-greedy:latest",
    temperature=0.0,
)

In [4]:
# Ask the model to answer the prompt as JSON. Downstream cells treat the
# result as a mapping from rank position (string key, parsed with int())
# to feature name.
response = llm.generate_json_response(prompt)
# Display the parsed response.
response

{'0': 'Goal Scored',
 '1': 'Pass Accuracy %',
 '2': 'Ball Possession %',
 '3': 'Distance Covered (Kms)',
 '4': 'Free Kicks',
 '5': 'Corners',
 '6': 'Blocked shots'}

In [5]:
# Per-story feature explanations, indexed by story id so one story's
# rows can be selected with .loc.
explanations_df = pd.read_csv("explanations.csv", index_col="story_id")

In [34]:
# Ground-truth ranking for story 0: features ordered by absolute SHAP value,
# with rank 0 being the largest magnitude.
# - `.loc[[0]]` (list selector) always yields a DataFrame, even when the story
#   has a single row; plain `.loc[0]` would return a Series in that case and
#   `sort_values(by=...)` would fail.
# - The chain builds a fresh frame instead of mutating a `.loc` result in
#   place, so no column is assigned onto a possible copy.
true_rank = (
    explanations_df.loc[[0]]
    .sort_values(by="Shap value", key=lambda s: s.abs(), ascending=False)
    .reset_index(drop=True)
    .assign(rank=lambda frame: range(len(frame)))
    .set_index("Feature name")
)
true_rank

Unnamed: 0_level_0,Feature value,Shap value,Model coefficient,Average feature value,Feature description,rank
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Goal Scored,2,1.369433,2.603175,1.384615,Number of goals scored by the team during the ...,0
Offsides,0,-1.008332,0.969639,1.192308,Number of times the team was caught offside,1
Ball Possession %,59,-0.350419,-0.366783,49.538462,Percentage of ball possession by the team duri...,2
Pass Accuracy %,89,0.307003,0.305696,83.384615,Percentage of passes that successfully reached...,3
Distance Covered (Kms),97,0.260371,-0.323417,106.615385,Total distance covered by the team's players d...,4
Free Kicks,24,0.240641,0.112049,13.884615,Number of free kicks taken by the team,5
Corners,5,0.234285,1.307775,4.576923,Number of corner kicks taken by the team,6
Blocked,2,0.220169,-0.33351,3.538462,Number of shots that were blocked by the opponent,7


In [33]:
# NOTE(review): this cell's execution count (33) precedes the cell above (34),
# and its saved output still shows `story_id` as the index with
# "Feature name" as a regular column. Re-run the notebook top to bottom to
# refresh this output.
true_rank

Unnamed: 0_level_0,Feature name,Feature value,Shap value,Model coefficient,Average feature value,Feature description,rank
story_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Goal Scored,2,1.369433,2.603175,1.384615,Number of goals scored by the team during the ...,0
0,Offsides,0,-1.008332,0.969639,1.192308,Number of times the team was caught offside,1
0,Ball Possession %,59,-0.350419,-0.366783,49.538462,Percentage of ball possession by the team duri...,2
0,Pass Accuracy %,89,0.307003,0.305696,83.384615,Percentage of passes that successfully reached...,3
0,Distance Covered (Kms),97,0.260371,-0.323417,106.615385,Total distance covered by the team's players d...,4
0,Free Kicks,24,0.240641,0.112049,13.884615,Number of free kicks taken by the team,5
0,Corners,5,0.234285,1.307775,4.576923,Number of corner kicks taken by the team,6
0,Blocked,2,0.220169,-0.33351,3.538462,Number of shots that were blocked by the opponent,7


In [10]:
# Count the features the LLM placed at exactly their ground-truth position.
matches = 0
for position, name in response.items():
    try:
        expected_position = true_rank.loc[name]["rank"]
    except KeyError:
        # The LLM named a feature that is absent from the ground truth; skip it.
        continue
    if expected_position == int(position):
        matches += 1

In [11]:
# Share of the LLM's ranked features that sit at exactly the ground-truth
# position. An empty response would raise ZeroDivisionError, so report NaN
# in that case instead of crashing the notebook.
rank_metric = matches / len(response) if len(response) else float("nan")
print(f"rank metric: {rank_metric}, {matches}, {len(response)}")

rank metric: 0.2857142857142857, 2, 7


In [None]:

# Feature names are the *index* of `true_rank` (set via set_index above), so
# they are read from `true_rank.index`; `true_rank["Feature name"]` would
# raise KeyError on a top-to-bottom run.
known_features = set(true_rank.index)

# LLM ranking in predicted order, restricted to features present in the
# ground truth. dict.fromkeys drops repeated names (keeping the first
# occurrence) so both lists below always have the same length.
sorted_feature_names = list(
    dict.fromkeys(
        feature_name
        for _, feature_name in sorted(response.items(), key=lambda x: int(x[0]))
        if feature_name in known_features
    )
)

# The same features in ground-truth order (true_rank is already sorted by rank).
predicted_features = set(sorted_feature_names)
true_ = [feature_name for feature_name in true_rank.index if feature_name in predicted_features]

In [None]:
# Kendall's tau between the ground-truth ranks read in the LLM's order and
# the same ranks read in ground-truth order, over the shared features.
ranks_in_predicted_order = true_rank.loc[sorted_feature_names]["rank"].values
ranks_in_true_order = true_rank.loc[true_]["rank"].values
tau, p = kendalltau(ranks_in_predicted_order, ranks_in_true_order)
print(tau, p)

0.8666666666666666 0.016666666666666666
