In [9]:
!pip install pandas



In [7]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [3]:
import pandas as pd

# Paths to your files (adjust as needed)
PATH_ORIGINAL = "final_data.csv"
PATH_CORRECTED = "revised_phillipines_full.xlsx"
PATH_OUTPUT   = "final_data_with_all_new_labels.csv"

# 1. Read the original CSV
df_original = pd.read_csv(PATH_ORIGINAL)

# 2. Read the Excel file that contains the corrected columns
df_corrections = pd.read_excel(PATH_CORRECTED, engine="openpyxl")

#--- OPTIONAL STRING NORMALIZATION -------------------------------------------
# The join key is cast to str and stripped on both sides so that stray
# leading/trailing whitespace does not prevent a match.
# Note: astype(str) turns a missing sentence into the literal string "nan".
df_original['sentences'] = df_original['sentences'].astype(str).str.strip()
df_corrections['sentences'] = df_corrections['sentences'].astype(str).str.strip()
#---------------------------------------------------------------------------

# 2b. Keep exactly one correction per sentence.
#     A left join duplicates every original row once per matching row on the
#     right, so a sentence listed twice in the corrections file would silently
#     add rows to the output. Report such sentences and keep the first entry.
correction_cols = ['sentences', 'New_Label', 'New_time_label', 'New_certain_label']
duplicate_mask = df_corrections.duplicated(subset='sentences', keep=False)
if duplicate_mask.any():
    print("\nWARNING: The corrections file lists the sentences below more than once (first occurrence kept):")
    print(df_corrections.loc[duplicate_mask, correction_cols])
corrections_unique = df_corrections[correction_cols].drop_duplicates(subset='sentences', keep='first')

# 3. Merge on the 'sentences' column.
#    "left" join ensures all rows from df_original remain, even if there's no match.
#    validate='many_to_one' makes pandas raise if the right-hand key is not unique,
#    guarding the "no extra rows" invariant established above.
df_merged = pd.merge(
    df_original,
    corrections_unique,
    on='sentences',
    how='left',
    validate='many_to_one'
)

# 4. Define valid label sets
valid_sentiment = {"hawkish", "dovish", "neutral", "irrelevant"}
valid_time      = {"forward looking", "not forward looking"}
valid_certain   = {"certain", "uncertain"}

# 5. Check for any invalid new labels
#    A value is "invalid" when it is present (not NaN) but not in the allowed set.
invalid_senti_mask = df_merged['New_Label'].notna() & ~df_merged['New_Label'].isin(valid_sentiment)
invalid_time_mask  = df_merged['New_time_label'].notna() & ~df_merged['New_time_label'].isin(valid_time)
invalid_cert_mask  = df_merged['New_certain_label'].notna() & ~df_merged['New_certain_label'].isin(valid_certain)

# Report each column's offending rows; these values are left out of the overwrite step.
for checked_col, checked_mask in (
    ('New_Label', invalid_senti_mask),
    ('New_time_label', invalid_time_mask),
    ('New_certain_label', invalid_cert_mask),
):
    if checked_mask.any():
        print(f"\nWARNING: The rows below have invalid '{checked_col}' values (not replaced):")
        print(df_merged.loc[checked_mask, ['sentences', checked_col]])

# 6. Overwrite the original columns only when the new values are valid (and not NaN).

def pick_label(row, original_col, new_col, valid_set):
    """
    Choose which label to keep for a single row.

    Returns row[new_col] when that value is non-null and a member of
    valid_set; in every other case returns row[original_col].
    """
    candidate = row[new_col]
    # Guard clause: a missing or unrecognised correction keeps the original label.
    if pd.isnull(candidate) or candidate not in valid_set:
        return row[original_col]
    return candidate

# Overwrite each original label column with its correction wherever the
# correction is a valid value. This is the vectorised equivalent of calling
# pick_label on every row: one pass per column instead of a Python call per row,
# and it also works when df_merged has no rows.
label_updates = (
    ('sentiment_label', 'New_Label', valid_sentiment),
    ('time_label', 'New_time_label', valid_time),
    ('certain_label', 'New_certain_label', valid_certain),
)
for original_col, new_col, valid_set in label_updates:
    # isin() is False for NaN, so rows without a correction (or with an
    # unrecognised one) keep their original label.
    use_new = df_merged[new_col].isin(valid_set)
    df_merged[original_col] = df_merged[original_col].mask(use_new, df_merged[new_col])

# 7. Drop the "New_" columns if you no longer need them in final output
#    (reassignment rather than inplace=True, so re-running the cell is predictable)
df_merged = df_merged.drop(columns=['New_Label', 'New_time_label', 'New_certain_label'])

# 8. Save to a new CSV
df_merged.to_csv(PATH_OUTPUT, index=False)

print("Merging complete. Final CSV with updated labels saved to:", PATH_OUTPUT)


Merging complete. Final CSV with updated labels saved to: final_data_with_all_new_labels.csv


In [2]:
import string
import pandas as pd

# File whose characters are checked. Defined once so the pandas read and the
# raw-text read below cannot drift apart.
# NOTE(review): the merge cell writes "final_data_with_all_new_labels.csv"
# (PATH_OUTPUT); this cell checks a differently named file - confirm that is intended.
PATH_TO_CHECK = 'final_data_with_new_labels.csv'

# NOTE(review): df is not used in this cell; kept in case other cells rely on it.
df = pd.read_csv(PATH_TO_CHECK)

# Define a set of allowed characters (you can adjust this set as needed):
# ASCII letters, digits, punctuation and whitespace.
allowed_chars = set(string.ascii_letters + string.digits + string.punctuation + string.whitespace)

# Read the entire file content
with open(PATH_TO_CHECK, 'r', encoding='utf-8') as file:
    content = file.read()

# Check for any characters not in the allowed set
invalid_chars = set(content) - allowed_chars

if invalid_chars:
    print("WARNING: The file contains characters that are not in the clam scan safe format:")
    # Sorted so the report is identical on every run (set order is not stable
    # across kernel restarts).
    print(sorted(invalid_chars))
else:
    print("The file is in clam scan safe format.")

{'à', '”', '€', 'é', '‑', 'œ', '“', '’', '–', 'ñ'}
