In [8]:
import os
import re
import pandas as pd

# === Input and output locations ===
# Both inputs sit in the same project directory, so that directory is spelled
# out once and the two input paths are derived from it.
_cookie_dir = "/Users/cynthianyongesa/Desktop/Desktop - Cynthia's Macbook Pro/DATA/4_PA_LAB_PY/1_SPEECH_COOKIE"

# Inputs: a folder of transcript files and a demographics CSV.
transcript_folder = _cookie_dir + "/cookie_transcripts_txt"
demographics_path = _cookie_dir + "/cookie_demographics.csv"

# Outputs: the raw and merged tables are given as bare (relative) file names,
# while the cleaned table is placed inside the transcript folder itself.
raw_output_path, merged_output_path = (
    "cookie_transcripts_all.csv",
    "cookie_transcripts_with_demographics.csv",
)
clean_output_path = os.path.join(transcript_folder, "cookie_transcripts_clean.csv")

# === Load transcripts into a DataFrame ===
def load_transcripts(folder_path):
    """Read every ``.txt`` file in *folder_path* into a transcript table.

    Args:
        folder_path: Directory holding one plain-text transcript per file.
            The file name with its trailing ``.txt`` removed is used as the
            participant ID.

    Returns:
        A DataFrame with the columns ``participant_id`` and ``transcript``,
        one row per ``.txt`` file, ordered by file name. Both columns are
        present even when the folder contains no ``.txt`` files.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a transcript is not valid UTF-8.
    """
    suffix = ".txt"
    rows = []
    # os.listdir returns names in arbitrary order; sort so the resulting
    # table (and any CSV written from it) is reproducible across runs.
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(suffix):
            continue
        # Strip only the trailing extension; str.replace would also delete
        # any ".txt" that happens to appear earlier in the name.
        pid = fname[: -len(suffix)]
        # Explicit encoding: the default depends on the platform locale.
        with open(os.path.join(folder_path, fname), "r", encoding="utf-8") as f:
            text = f.read()
        rows.append({"participant_id": pid, "transcript": text})
    # Naming the columns keeps the schema stable for an empty folder, so a
    # later merge on "participant_id" still finds its key column.
    return pd.DataFrame(rows, columns=["participant_id", "transcript"])

# Build the transcript table and keep an unmerged copy on disk.
df = load_transcripts(transcript_folder)
df.to_csv(raw_output_path, index=False)
print("✅ Transcripts loaded and saved!")
print(f"📄 Raw CSV: {raw_output_path}")

# === Merge with demographics data ===
# Participant IDs derived from file names are always strings. Read the CSV's
# ID column as text as well: otherwise all-numeric IDs are inferred as
# integers, which drops leading zeros and makes the merge fail because the
# key columns have different types.
demographics_df = pd.read_csv(demographics_path, dtype={"participant_id": str})
# Left join: every transcript row is kept whether or not it has a match.
merged_df = df.merge(demographics_df, on="participant_id", how="left")

# Report missing demographics
# A row is flagged when ANY of its columns is null, so this lists both
# participants with no demographics row and those with a partially filled one.
missing_data = merged_df[merged_df.isnull().any(axis=1)]
if not missing_data.empty:
    print("⚠️ Missing demographics for:")
    for pid in missing_data["participant_id"]:
        print(f" - {pid}")
else:
    print("✅ All participants matched with demographics data.")

merged_df.to_csv(merged_output_path, index=False)
print(f"📁 Merged CSV saved to: {merged_output_path}")

# === Text cleaning function ===
def clean_text(text):
    """Normalize a transcript string for downstream text analysis.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, each run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw transcript as a ``str``.

    Returns:
        The cleaned string; empty if nothing but punctuation and whitespace
        was present.
    """
    text = text.lower()                      # Convert to lowercase
    text = re.sub(r"[^\w\s]", "", text)      # Remove punctuation
    # Collapse whitespace only AFTER punctuation is gone: deleting a
    # space-delimited token such as " - " or " , " would otherwise leave
    # two adjacent spaces in the result.
    text = re.sub(r"\s+", " ", text)         # Normalize whitespace
    return text.strip()

# Add a cleaned copy of each transcript next to the original text, then
# write the whole table (original + cleaned columns) to the cleaned CSV.
merged_df["transcript_clean"] = merged_df["transcript"].map(clean_text)
merged_df.to_csv(clean_output_path, index=False)

# One call, two lines of output: the status line, then the file location.
print(
    "🧼 Cleaned transcripts saved.",
    f"📁 Cleaned CSV: {clean_output_path}",
    sep="\n",
)

✅ Transcripts loaded and saved!
📄 Raw CSV: cookie_transcripts_all.csv
