Clean picture-description features the same way as the other feature sets

In [1]:
import os, sys
import pandas as pd

# add project root
sys.path.append("/Users/gilanorup/Desktop/Studium/MSc/MA/code/masters_thesis_gn/src")
from config.constants import GIT_DIRECTORY
from data_preparation.data_cleaning_helpers import impute_mean_dataframe

# Directory that holds the feature tables, and the name of the subject key column.
BASE = os.path.join(GIT_DIRECTORY, "results", "features")
ID = "Subject_ID"

# Picture-description feature tables to clean: table name -> CSV path under BASE.
INPUTS = {
    name: os.path.join(BASE, f"{name}.csv")
    for name in (
        "picture_description",
        "picture_description_1min",
        "picture_description_2min",
    )
}

# Subjects to exclude (cookieTheft ∪ picnicScene). The IDs are kept as strings
# because Subject_ID is cast to str before the exclusion filter is applied.
EXCLUDE_CT = {138, 141, 469, 516, 1065}
EXCLUDE_PS = {390}
EXCLUDE = set(map(str, EXCLUDE_CT.union(EXCLUDE_PS)))

# Already-filtered cookieTheft / picnicScene tables; their shared columns
# define which features are kept for the picture-description tables.
cookie_final_path = os.path.join(BASE, "filtered", "cookieTheft_filtered2.csv")
picnic_final_path = os.path.join(BASE, "filtered", "picnicScene_filtered2.csv")

# Meta columns that are not features.
META_DROP = {"duration_used_sec"}

# Read both reference tables with Subject_ID as a string column.
cookie_final = pd.read_csv(cookie_final_path).astype({ID: str})
picnic_final = pd.read_csv(picnic_final_path).astype({ID: str})

# Remove meta columns; errors="ignore" makes absent columns a no-op.
cookie_final = cookie_final.drop(columns=sorted(META_DROP), errors="ignore")
picnic_final = picnic_final.drop(columns=sorted(META_DROP), errors="ignore")

# Feature columns present in BOTH reference tables (Subject_ID excluded),
# sorted so the resulting column order is deterministic.
feat_ct = set(cookie_final.columns).difference([ID])
feat_ps = set(picnic_final.columns).difference([ID])
FEATURE_INTERSECTION = sorted(feat_ct.intersection(feat_ps))

def load_pic(path):
    """Load one picture-description feature CSV and clean it.

    Steps, in order:
      1. Read the CSV at ``path`` and cast the ``ID`` column to ``str``.
      2. Remove rows whose subject ID is listed in ``EXCLUDE``.
      3. Drop any ``META_DROP`` columns that are present.
      4. Keep only ``ID`` plus those ``FEATURE_INTERSECTION`` columns that
         exist in this table (missing ones are skipped, not created).
      5. Pass the result to the project helper ``impute_mean_dataframe``
         with ``ID`` excluded. Judging by its name and the notebook output,
         it fills missing values with the column mean.

    Returns whatever ``impute_mean_dataframe`` returns for the cleaned table.
    """
    table = pd.read_csv(path)
    table[ID] = table[ID].astype(str)

    # Row filter: discard excluded subjects.
    is_excluded = table[ID].isin(EXCLUDE)
    table = table[~is_excluded]

    # Column filter 1: strip meta columns that happen to be in this input.
    meta_present = [col for col in META_DROP if col in table.columns]
    table = table.drop(columns=meta_present, errors="ignore")

    # Column filter 2: Subject_ID first, then the shared features in their
    # sorted order; anything else in the table is dropped.
    selected = [ID]
    selected.extend(col for col in FEATURE_INTERSECTION if col in table.columns)
    table = table[selected]

    # Mean-impute the feature columns (the ID column is left untouched).
    return impute_mean_dataframe(table, exclude_cols=[ID])

# Load and clean the three picture-description tables (df5 is the uncapped
# file -- presumably a 5-minute recording, TODO confirm; df1 / df2 are the
# 1- and 2-minute caps).
df5, df1, df2 = (
    load_pic(INPUTS[key])
    for key in (
        "picture_description",
        "picture_description_1min",
        "picture_description_2min",
    )
)

# Force the same column order on every table: Subject_ID, then the shared
# features. This raises KeyError if a table lacks one of those columns.
ordered_cols = [ID, *FEATURE_INTERSECTION]
df5, df1, df2 = df5[ordered_cols], df1[ordered_cols], df2[ordered_cols]

# Write the cleaned tables next to the other filtered feature sets.
outdir = os.path.join(BASE, "filtered")
os.makedirs(outdir, exist_ok=True)
for frame, filename in (
    (df5, "picture_description_filtered.csv"),
    (df1, "picture_description_1min_filtered.csv"),
    (df2, "picture_description_2min_filtered.csv"),
):
    frame.to_csv(os.path.join(outdir, filename), index=False)

print("done.")


Imputed missing values in 'article_pause_contentword' with mean=10.2962
Imputed missing values in 'article_pause_contentword' with mean=4.1145
Imputed missing values in 'article_pause_contentword' with mean=7.8715
done.
