# Notebook Setup

In [None]:
# Mount Google Drive at '/content/drive' so files stored on Drive can be read
# through the local filesystem from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import ast

import os

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

# Load Prediction CSVs into DataFrames

In [None]:
# All three prediction files live in the same Drive folder; keeping the folder
# in a single constant means relocating the data is a one-line change.
DATA_DIR = '/content/drive/My Drive/266_project/mistral_7b_data'

# One CSV per experiment (names taken from the files themselves).
df = pd.read_csv(f'{DATA_DIR}/few_shot_entity_predictions_100.csv')
df2 = pd.read_csv(f'{DATA_DIR}/entity_predictions_100.csv')
df3 = pd.read_csv(f'{DATA_DIR}/finetuned_few_shot_entity_predictions_100.csv')

In [None]:
# Restrict every frame to the three columns used by the evaluation below.
eval_columns = ['Instruction', 'Prediction', 'True Value']
df, df2, df3 = (frame[eval_columns] for frame in (df, df2, df3))

# Model Evaluation

In [None]:
import pandas as pd
import ast

def safe_eval_dict(value):
    """Parse a cell into a dict, falling back to ``{}`` for anything unusable.

    Args:
        value: Raw cell content. Normally the string form of a dict literal
            (e.g. ``"{'PER': ['Alice']}"``); an already-parsed dict is also
            accepted and passed through.

    Returns:
        dict: ``value`` itself when it is already a dict, the parsed literal
        when the string evaluates to a dict, otherwise an empty dict.
    """
    # Already parsed (e.g. this cell was re-run): keep the data instead of
    # silently replacing it with an empty dict.
    if isinstance(value, dict):
        return value
    # Non-string input (such as NaN for an empty CSV field) has nothing to parse.
    if not isinstance(value, str):
        return {}
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Text that is not a valid Python literal is treated as "no entities".
        return {}
    # A valid literal that is not a dict (list, number, None, ...) would break
    # the `.keys()` / `.get()` calls made on the result, so normalise it.
    return parsed if isinstance(parsed, dict) else {}

# Turn the stringified dicts in both label columns into real dicts, frame by frame.
for frame in (df, df2, df3):
    for column in ('True Value', 'Prediction'):
        frame[column] = frame[column].apply(safe_eval_dict)

def _as_entity_set(value):
    """Normalise one label's entities to a set; a bare string counts as ONE entity."""
    if value is None:
        return set()
    if isinstance(value, str):
        # set('Paris') would give {'P', 'a', 'r', 'i', 's'} and inflate the counts.
        return {value}
    try:
        return set(value)
    except TypeError:
        # Not iterable (e.g. a number): treat the value itself as a single entity.
        return {value}


def calculate_metrics(row):
    """Count entity-level false positives, false negatives and true positives for one row.

    Args:
        row: Mapping-like object (a DataFrame row) whose 'True Value' and
            'Prediction' entries are dicts mapping a label to its entities.

    Returns:
        pd.Series: Totals over all labels seen in either dict, indexed by
        'False Positives', 'False Negatives', 'True Positives'.
    """
    true_by_label = row['True Value']
    pred_by_label = row['Prediction']

    total_fp = total_fn = total_tp = 0
    # Visit every label present on either side so that a label missing from one
    # dict still contributes its misses / spurious predictions.
    for key in set(true_by_label) | set(pred_by_label):
        true_entities = _as_entity_set(true_by_label.get(key))
        predicted_entities = _as_entity_set(pred_by_label.get(key))

        total_tp += len(true_entities & predicted_entities)
        total_fp += len(predicted_entities - true_entities)
        total_fn += len(true_entities - predicted_entities)

    return pd.Series([total_fp, total_fn, total_tp], index=['False Positives', 'False Negatives', 'True Positives'])



# Attach the per-row FP / FN / TP counts to each frame.
count_columns = ['False Positives', 'False Negatives', 'True Positives']
for frame in (df, df2, df3):
    frame[count_columns] = frame.apply(calculate_metrics, axis=1)

## Few-Shot Mistral-7b

In [None]:
# Per-row precision / recall / F1, followed by their unweighted means over rows.
# A zero denominator yields NaN; such rows are scored 0. Only the metric columns
# are filled, so missing values in any other column are left untouched.
df['Precision'] = (df['True Positives'] / (df['True Positives'] + df['False Positives'])).fillna(0)
df['Recall'] = (df['True Positives'] / (df['True Positives'] + df['False Negatives'])).fillna(0)
df['F1 Score'] = (2 * (df['Precision'] * df['Recall']) / (df['Precision'] + df['Recall'])).fillna(0)

average_precision = df['Precision'].mean()
average_recall = df['Recall'].mean()
average_f1_score = df['F1 Score'].mean()

average_precision, average_recall, average_f1_score

(0.168, 0.17826190476190476, 0.15410739260739262)

### Evaluation without Empty Predictions

In [None]:
# Keep only the rows whose parsed prediction is truthy (i.e. not empty).
df_filtered = df.loc[df['Prediction'].map(bool)]

In [None]:
# Renumber the rows from 0; with drop=False the previous row labels are kept as a column.
df_filtered = df_filtered.reset_index(drop=False)

In [None]:
# Per-row precision / recall / F1 on the filtered rows, followed by their
# unweighted means. A zero denominator yields NaN; such rows are scored 0.
# Only the metric columns are filled, so other columns are left untouched.
df_filtered['Precision'] = (df_filtered['True Positives'] / (df_filtered['True Positives'] + df_filtered['False Positives'])).fillna(0)
df_filtered['Recall'] = (df_filtered['True Positives'] / (df_filtered['True Positives'] + df_filtered['False Negatives'])).fillna(0)
df_filtered['F1 Score'] = (2 * (df_filtered['Precision'] * df_filtered['Recall']) / (df_filtered['Precision'] + df_filtered['Recall'])).fillna(0)

average_precision = df_filtered['Precision'].mean()
average_recall = df_filtered['Recall'].mean()
average_f1_score = df_filtered['F1 Score'].mean()

average_precision, average_recall, average_f1_score

(0.44210526315789467, 0.46911027568922303, 0.4055457700194542)

## Zero-Shot Fine-tuned Mistral-7b

In [None]:
# Per-row precision / recall / F1, followed by their unweighted means over rows.
# A zero denominator yields NaN; such rows are scored 0. Only the metric columns
# are filled, so missing values in any other column are left untouched.
df2['Precision'] = (df2['True Positives'] / (df2['True Positives'] + df2['False Positives'])).fillna(0)
df2['Recall'] = (df2['True Positives'] / (df2['True Positives'] + df2['False Negatives'])).fillna(0)
df2['F1 Score'] = (2 * (df2['Precision'] * df2['Recall']) / (df2['Precision'] + df2['Recall'])).fillna(0)

average_precision = df2['Precision'].mean()
average_recall = df2['Recall'].mean()
average_f1_score = df2['F1 Score'].mean()

average_precision, average_recall, average_f1_score

(0.4403333333333333, 0.4581666666666666, 0.441582251082251)

### Evaluation without Empty Predictions

In [None]:
# Keep only the rows whose parsed prediction is non-empty. `.copy()` makes the
# subset an independent frame, so assigning new columns to it does not trigger
# pandas' SettingWithCopyWarning.
df2_filtered = df2[df2['Prediction'].apply(bool)].copy()

In [None]:
# Per-row precision / recall / F1 on the filtered rows, followed by their
# unweighted means. A zero denominator yields NaN; such rows are scored 0.
# Only the metric columns are filled, so other columns are left untouched.
df2_filtered['Precision'] = (df2_filtered['True Positives'] / (df2_filtered['True Positives'] + df2_filtered['False Positives'])).fillna(0)
df2_filtered['Recall'] = (df2_filtered['True Positives'] / (df2_filtered['True Positives'] + df2_filtered['False Negatives'])).fillna(0)
df2_filtered['F1 Score'] = (2 * (df2_filtered['Precision'] * df2_filtered['Recall']) / (df2_filtered['Precision'] + df2_filtered['Recall'])).fillna(0)

average_precision = df2_filtered['Precision'].mean()
average_recall = df2_filtered['Recall'].mean()
average_f1_score = df2_filtered['F1 Score'].mean()

average_precision, average_recall, average_f1_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_filtered['Precision'] = df2_filtered['True Positives'] / (df2_filtered['True Positives'] + df2_filtered['False Positives'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_filtered['Recall'] = df2_filtered['True Positives'] / (df2_filtered['True Positives'] + df2_filtered['False Negatives'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

(0.6572139303482587, 0.6838308457711442, 0.659077986689927)

## Few-Shot Fine-tuned Mistral-7b

In [None]:
# Per-row precision / recall / F1, followed by their unweighted means over rows.
# A zero denominator yields NaN; such rows are scored 0. Only the metric columns
# are filled, so missing values in any other column are left untouched.
df3['Precision'] = (df3['True Positives'] / (df3['True Positives'] + df3['False Positives'])).fillna(0)
df3['Recall'] = (df3['True Positives'] / (df3['True Positives'] + df3['False Negatives'])).fillna(0)
df3['F1 Score'] = (2 * (df3['Precision'] * df3['Recall']) / (df3['Precision'] + df3['Recall'])).fillna(0)

average_precision = df3['Precision'].mean()
average_recall = df3['Recall'].mean()
average_f1_score = df3['F1 Score'].mean()

average_precision, average_recall, average_f1_score

(0.3615595238095238, 0.3677792207792209, 0.343032745032745)

### Evaluation without Empty Predictions

In [None]:
# Keep only the rows whose parsed prediction is non-empty. `.copy()` makes the
# subset an independent frame, so assigning new columns to it does not trigger
# pandas' SettingWithCopyWarning.
df3_filtered = df3[df3['Prediction'].apply(bool)].copy()

In [None]:
# Per-row precision / recall / F1 on the filtered rows, followed by their
# unweighted means. A zero denominator yields NaN; such rows are scored 0.
# Only the metric columns are filled, so other columns are left untouched.
df3_filtered['Precision'] = (df3_filtered['True Positives'] / (df3_filtered['True Positives'] + df3_filtered['False Positives'])).fillna(0)
df3_filtered['Recall'] = (df3_filtered['True Positives'] / (df3_filtered['True Positives'] + df3_filtered['False Negatives'])).fillna(0)
df3_filtered['F1 Score'] = (2 * (df3_filtered['Precision'] * df3_filtered['Recall']) / (df3_filtered['Precision'] + df3_filtered['Recall'])).fillna(0)

average_precision = df3_filtered['Precision'].mean()
average_recall = df3_filtered['Recall'].mean()
average_f1_score = df3_filtered['F1 Score'].mean()

average_precision, average_recall, average_f1_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3_filtered['Precision'] = df3_filtered['True Positives'] / (df3_filtered['True Positives'] + df3_filtered['False Positives'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3_filtered['Recall'] = df3_filtered['True Positives'] / (df3_filtered['True Positives'] + df3_filtered['False Negatives'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

(0.44636978248089365, 0.4540484207150874, 0.42349721608980867)