In [None]:
import sys
sys.path.append('/home/redacted/dev/moral-summarization')

from moral_summarization.eval import evaluate_CoT_moral_words_predictions


# Evaluation configuration shared by the cells below.

# File listing the test-set articles (read line by line further down).
article_list = 'articles_in_test_set.txt'

# Directory handed to the CoT evaluation as the location of the results.
results_dir = '../results/final_prompts'

# Models to evaluate. Alternative model list (currently disabled):
#   ['DeepSeek-R1-Distill-Qwen-32B', 'c4ai-command-r-plus-4bit']
models = ['Meta-Llama-3-70B-Instruct']

## CoT results

In [None]:
# Run the CoT moral-word evaluation for the configured models, restricted to
# the articles named in `article_list`. The call yields two results that are
# bound to `results_df` and `length_df`.
# NOTE(review): the seed is passed as the string '345' — confirm the callee
# expects a string rather than an int.
results_df, length_df = evaluate_CoT_moral_words_predictions(
    results_dir,
    models,
    only_test_set=True,
    article_list=article_list,
    seed='345',
)

In [None]:
# Drop the final row of length_df before computing statistics on it.
# NOTE(review): presumably that row is an aggregate appended by the
# evaluation — confirm.
# Caution: this rebinds length_df, so re-running the cell removes one more
# row each time; re-run the evaluation cell first.
length_df = length_df.head(-1)

In [None]:
from statistics import median

# Median of each evaluated model's column in length_df.
# The column names are taken from the `models` configuration instead of a
# hard-coded model name, so this cell keeps working when `models` is changed.
# NOTE(review): assumes length_df has one column per entry in `models`.
{model: median(length_df[model]) for model in models}

## Classifier results in the style of the CoT evaluation

In [None]:
import pandas as pd
from ast import literal_eval

from moral_summarization.eval import f1_moral_predictions

# Article names of the test set: one entry per line of the list file.
with open(article_list) as article_file:
    articles_in_test_set = article_file.read().splitlines()

# These columns are parsed with literal_eval while the CSV is read, turning
# their stored text back into Python objects.
literal_eval_columns = ['predicted_words', 'labeled_words']
converters = dict.fromkeys(literal_eval_columns, literal_eval)
results_class = pd.read_csv(
    '../results/predictions_with_words.csv',
    converters=converters,
)

In [None]:
# Per-article classifier results: the flattened predicted and labeled word
# lists of each test-set article, plus the F1 computed from them.
#
# The rows are collected first and the frame is built once. This stores each
# word list as a single cell value (assigning an iterable to one cell through
# `.loc` is fragile, because pandas may try to spread it over the selection)
# and lets pandas infer the dtype of 'f1' instead of forcing object dtype.
class_records = []
for article in articles_in_test_set:
    article_results = results_class[results_class['article'] == article]

    # Every row holds a list of words; concatenate them across the article's rows.
    predicted_words = [
        word for words in article_results['predicted_words'] for word in words
    ]
    labeled_words = [
        word for words in article_results['labeled_words'] for word in words
    ]

    class_records.append({
        'predictions': predicted_words,
        'labels': labeled_words,
        'f1': f1_moral_predictions(labeled_words, predicted_words),
    })

class_results_df = pd.DataFrame(
    class_records,
    columns=['predictions', 'labels', 'f1'],
    index=articles_in_test_set,
)

In [None]:
# Mean of the per-article F1 values (displayed only; the dataframe is not modified).
class_results_df['f1'].mean()

## Classifier seqeval results

In [None]:
import ast
from moral_summarization.metrics import seqeval_metrics

def _flatten_tag_strings(tag_strings, separator):
    """Parse stringified sequences and concatenate their items into one list.

    For every string, newlines are removed and each occurrence of `separator`
    is replaced by ', ' so that the result can be parsed with
    ast.literal_eval. The items of all parsed sequences are returned in order
    as a single flat list.
    """
    flat_tags = []
    for tag_string in tag_strings:
        literal = tag_string.replace('\n', '').replace(separator, ', ')
        flat_tags.extend(ast.literal_eval(literal))
    return flat_tags


class_seqeval_results_df = pd.DataFrame(columns=['f1'], index=articles_in_test_set)

for article in articles_in_test_set:
    rows = results_class[results_class['article'] == article]

    # The two columns are cleaned with different separators: single spaces
    # for 'predictions', runs of four spaces for 'labels'.
    # NOTE(review): confirm this difference matches how the CSV was written.
    predicted_tags = _flatten_tag_strings(rows['predictions'].to_list(), ' ')
    gold_tags = _flatten_tag_strings(rows['labels'].to_list(), '    ')

    # One sequence per article: all of its rows concatenated.
    scores = seqeval_metrics([predicted_tags], [gold_tags])
    class_seqeval_results_df.loc[article, 'f1'] = scores['f1']

## Median number of words annotated and predicted by the classifier per article

In [None]:
import sys
sys.path.append('/home/redacted/dev/moral-summarization')

from moral_summarization.data_utils import *

from statistics import median

# Reload the classifier output; the two word columns are parsed with
# literal_eval while the CSV is read.
literal_eval_columns = ['predicted_words', 'labeled_words']
converters = {column: literal_eval for column in literal_eval_columns}
results_class = pd.read_csv('../results/predictions_with_words.csv', converters=converters)

# Test-set article names, one per line. Blank lines are dropped so they do
# not become a phantom article with a count of zero that skews the medians.
article_list = 'articles_in_test_set.txt'
with open(article_list, 'r') as f:
    articles = [line.strip() for line in f]
articles = [article for article in articles if article]

# Per-article totals of predicted and labeled words, summed over all rows
# of the article. Articles without any row keep a total of 0.
count_pred = dict.fromkeys(articles, 0)
count_label = dict.fromkeys(articles, 0)
for article, predicted_words, labeled_words in zip(
        results_class['article'],
        results_class['predicted_words'],
        results_class['labeled_words']):
    # Rows of articles outside the test-set list are ignored rather than
    # raising a KeyError.
    if article not in count_pred:
        continue
    count_pred[article] += len(predicted_words)
    count_label[article] += len(labeled_words)

print(median(count_pred.values()))
print(median(count_label.values()))