In [1]:
!pip install pandas



In [2]:
!pip install openpyxl



In [2]:
import pandas as pd

# Paths to your files (adjust as needed)
PATH_ORIGINAL = "final_data.csv"
PATH_CORRECTED = "5768_bank_of_canada.xlsx"  # (Watch out for any typos like 5768 vs 5678)
PATH_OUTPUT   = "final_data_with_new_labels.csv"

# 1. Read the original CSV
df_original = pd.read_csv(PATH_ORIGINAL)

# 2. Read the Excel file that contains the corrected sentiment labels
df_corrections = pd.read_excel(PATH_CORRECTED, engine="openpyxl")

#--- STRING NORMALIZATION ----------------------------------------------------
# Strip surrounding whitespace so that keys which differ only by padding still
# match. Missing sentences are kept as missing: a plain `astype(str)` would
# turn them into the literal text "nan", which would then be written to the
# output and would also let blank rows on both sides match each other.
df_original['sentences'] = (
    df_original['sentences'].astype(str).str.strip()
    .where(df_original['sentences'].notna())
)
df_corrections['sentences'] = (
    df_corrections['sentences'].astype(str).str.strip()
    .where(df_corrections['sentences'].notna())
)
#---------------------------------------------------------------------------

# 3a. Build a one-row-per-sentence lookup of corrections.
#     Rows without a sentence cannot be matched to anything, so they are dropped.
corrections_lookup = df_corrections.loc[
    df_corrections['sentences'].notna(), ['sentences', 'New_Label']
]

#     A sentence that appears more than once in the corrections would make the
#     left join emit one output row per occurrence, silently duplicating rows
#     of the original dataset. Keep the first correction and report the rest.
duplicate_mask = corrections_lookup.duplicated(subset='sentences', keep='first')
if duplicate_mask.any():
    print(
        "WARNING:", int(duplicate_mask.sum()),
        "duplicate sentence(s) found in", PATH_CORRECTED,
        "- keeping the first correction for each:"
    )
    print(corrections_lookup.loc[duplicate_mask, ['sentences', 'New_Label']])
    corrections_lookup = corrections_lookup.loc[~duplicate_mask]

# 3b. Merge on the 'sentences' column.
#     "left" join ensures all rows from df_original remain, even if there's no match.
#     validate='many_to_one' makes pandas raise if the lookup keys are not unique.
df_merged = pd.merge(
    df_original,
    corrections_lookup,
    on='sentences',
    how='left',
    validate='many_to_one'
)

# The merge must only add the New_Label column, never add or remove rows.
assert len(df_merged) == len(df_original), "merge changed the number of rows"

# 4. The only sentiment classes a correction is allowed to introduce
valid_labels = {"dovish", "hawkish", "neutral", "irrelevant"}

# 5. Report corrections that are present but are not one of the allowed classes.
#    These rows are only reported here; nothing is modified in this step.
has_new_label = df_merged['New_Label'].notna()
is_allowed_label = df_merged['New_Label'].isin(valid_labels)
invalid_mask = has_new_label & ~is_allowed_label

if invalid_mask.any():
    offending_rows = df_merged.loc[invalid_mask, ['sentences', 'New_Label']]
    print("WARNING: The rows below have invalid New_Label values (not replaced):")
    print(offending_rows)

# 6. Overwrite 'sentiment_label' with 'New_Label' **if** the New_Label is valid.
#    If New_Label is NaN or invalid, keep the old sentiment_label.
def pick_label(row):
    """Choose the sentiment label to keep for a single merged row.

    The correction in ``row['New_Label']`` is returned when it is present and
    is a member of ``valid_labels``. In every other case (missing or
    unrecognised correction) the row's existing ``row['sentiment_label']`` is
    returned unchanged.
    """
    candidate = row['New_Label']
    usable = pd.notnull(candidate) and candidate in valid_labels
    return candidate if usable else row['sentiment_label']

# Replace 'sentiment_label' with 'New_Label' wherever the correction is one of
# the valid labels; missing or invalid corrections leave the old label as is.
# This is the same rule pick_label applies row by row, done column-wise:
#   * isin() is False for missing values, so no separate null check is needed;
#   * unlike DataFrame.apply(axis=1), it still yields a Series when the frame
#     has no rows, so an empty input no longer breaks the column assignment.
new_labels = df_merged['New_Label']
use_correction = new_labels.isin(valid_labels)

# 7. (Optional) Drop the New_Label column if you no longer need it
#    (done without inplace=True so that re-running the cell stays predictable)
df_merged = (
    df_merged
    .assign(sentiment_label=df_merged['sentiment_label'].mask(use_correction, new_labels))
    .drop(columns=['New_Label'])
)

# 8. Save to a new CSV
df_merged.to_csv(PATH_OUTPUT, index=False)

print("Merging complete. Final CSV with updated sentiment labels saved to:", PATH_OUTPUT)


Merging complete. Final CSV with updated sentiment labels saved to: final_data_with_new_labels.csv


In [2]:
import string
import pandas as pd

# Load the relabelled dataset written by the merge step.
df = pd.read_csv('final_data_with_new_labels.csv')

# Whitelist of characters: ASCII letters, digits, punctuation and whitespace
# (extend this set if more characters should be accepted).
allowed_chars = set(
    string.ascii_letters
    + string.digits
    + string.punctuation
    + string.whitespace
)

# Pull in the raw text of the same file so every character can be inspected.
with open('final_data_with_new_labels.csv', 'r', encoding='utf-8') as file:
    content = file.read()

# Distinct characters in the file that are not on the whitelist.
invalid_chars = set(content).difference(allowed_chars)

if not invalid_chars:
    print("The file is in clam scan safe format.")
else:
    print("WARNING: The file contains characters that are not in the clam scan safe format:")
    print(invalid_chars)

{'à', '”', '€', 'é', '‑', 'œ', '“', '’', '–', 'ñ'}
