In [2]:
import pandas as pd
import string

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a cell value so two spellings of the same text compare equal.

    The value is lower-cased, stripped of the characters in
    ``string.punctuation`` (ASCII punctuation only), and has its whitespace
    normalised: leading/trailing whitespace is removed and internal runs are
    collapsed to a single space.

    Args:
        text: A scalar cell value. Missing values (as judged by ``pd.isna``)
            are returned unchanged. Non-string values are converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string, or the original missing value.
    """
    if pd.isna(text):
        return text
    # Cells may be parsed as numbers; compare them as text rather than
    # failing on the missing ``.lower()`` method.
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    # Deleting punctuation can leave doubled spaces ("a - b" -> "a  b");
    # split/join trims the ends and collapses those internal runs.
    return ' '.join(text.split())

# Input locations: the gold annotations (Excel) and the extractor output (CSV).
annotated_corpus_path = 'annotated_corpus.xlsx'
similes_output_path = 'similes_output.csv'

annotated_df = pd.read_excel(annotated_corpus_path)
similes_output_df = pd.read_csv(similes_output_path)

# Add a "<column>_clean" companion for every compared column. The two files
# name the shared-property column differently, so each gets its own list.
for column in ('Simile', 'Tenor', 'Vehicle', 'Shared_property'):
    annotated_df[f'{column}_clean'] = annotated_df[column].apply(clean_text)

for column in ('Simile', 'Tenor', 'Vehicle', 'Shared Property'):
    similes_output_df[f'{column}_clean'] = similes_output_df[column].apply(clean_text)

def calculate_precision_recall(annotated, output):
    """Compute set-based precision and recall of ``output`` against ``annotated``.

    Both inputs are reduced to sets of distinct values, so duplicates count
    once and row alignment between the two inputs is ignored. Missing values
    (None, NaN and anything else ``dropna`` treats as missing) are discarded
    first: a blank cell is not a label and must not be scored as a match or
    as an error.

    Args:
        annotated: Iterable of reference (gold) values.
        output: Iterable of predicted values.

    Returns:
        A ``(precision, recall)`` tuple. Precision is the share of distinct
        predicted values that appear in the reference; recall is the share of
        distinct reference values that were predicted. Each is 0.0 when its
        denominator is empty.
    """
    # dtype=object keeps the values as-is (no numeric coercion) while letting
    # pandas decide what counts as missing.
    annotated_set = set(pd.Series(list(annotated), dtype=object).dropna())
    output_set = set(pd.Series(list(output), dtype=object).dropna())

    true_positives = len(annotated_set & output_set)

    # TP + FP is the predicted set's size and TP + FN is the reference set's.
    precision = true_positives / len(output_set) if output_set else 0.0
    recall = true_positives / len(annotated_set) if annotated_set else 0.0

    return precision, recall

# Score each simile component: gold column first, extractor column second.
simile_precision, simile_recall = calculate_precision_recall(
    annotated_df['Simile_clean'],
    similes_output_df['Simile_clean'],
)
tenor_precision, tenor_recall = calculate_precision_recall(
    annotated_df['Tenor_clean'],
    similes_output_df['Tenor_clean'],
)
vehicle_precision, vehicle_recall = calculate_precision_recall(
    annotated_df['Vehicle_clean'],
    similes_output_df['Vehicle_clean'],
)
shared_property_precision, shared_property_recall = calculate_precision_recall(
    annotated_df['Shared_property_clean'],
    similes_output_df['Shared Property_clean'],
)

# Report one line per component, to four decimal places.
for label, precision, recall in (
    ('Simile', simile_precision, simile_recall),
    ('Tenor', tenor_precision, tenor_recall),
    ('Vehicle', vehicle_precision, vehicle_recall),
    ('Shared Property', shared_property_precision, shared_property_recall),
):
    print(f'{label} Precision: {precision:.4f}, Recall: {recall:.4f}')


Simile Precision: 0.8962, Recall: 0.9500
Tenor Precision: 0.7160, Recall: 0.7945
Vehicle Precision: 0.8155, Recall: 0.8571
Shared Property Precision: 0.0000, Recall: 0.0000
