In [8]:
import json
import pickle
import pandas as pd

from pathlib import Path

### UUID splits to raw month

In [2]:
uuid_path = Path("/projects/frame_align/data/text_uuid_splits/")

In [3]:
month_set_dict = {}
for file in uuid_path.iterdir():
    with open(file, "rb") as f:
        data = pickle.load(f)
        month = list(data.keys())[0]
        if month not in month_set_dict:
            month_set_dict[month] = []
        month_set_dict[month].extend([file.stem])

In [14]:
pickle.dump(month_set_dict, open("/projects/frame_align/data/text_month_set_dict.pkl", "wb"))

In [3]:
month_set_dict = pickle.load(open("/projects/frame_align/data/text_month_set_dict.pkl", "rb"))

In [None]:
# Combine the text annotations for each month
for month, sets in month_set_dict.items():
    print(f"{month}: {len(sets)}")
    month_annotations_text = []
    for set_no in sets:
        set_df = pd.read_json(f"/projects/frame_align/data/annotated/text/textframes_{set_no}.jsonl", lines=True)
        month_annotations_text.append(set_df)
    month_annotations_text = pd.concat(month_annotations_text)
    month_annotations_text.to_json(f"/projects/frame_align/data/annotated/text_combined/textframes_{month}.jsonl", orient="records", lines=True)


2023-08-01_2023-08-31: 8
2023-09-01_2023-09-30: 3
2023-05-01_2023-05-31: 8
2024-04-01_2024-04-30: 4
2023-07-01_2023-07-31: 8
2023-10-01_2023-10-31: 5
2024-03-01_2024-03-31: 4
2023-06-01_2023-06-30: 3
2024-01-01_2024-01-31: 4
2023-12-01_2023-12-31: 4
2024-02-01_2024-02-29: 2
2023-11-01_2023-11-30: 3


In [4]:
# Merge the text annotations with the vision annotations
for month in month_set_dict.keys():
    print(f"{month}")
    text_path = Path(f"/projects/frame_align/data/annotated/text_combined/textframes_{month}.jsonl")
    vision_path = Path(f"/projects/frame_align/data/annotated/vision/visionframes_{month}_pixtral_anno.jsonl")
    assert text_path.exists()
    assert vision_path.exists()
    month_annotations_text = pd.read_json(text_path, lines=True)
    try:
        month_annotations_vision = pd.read_json(vision_path, lines=True)
    except:
        month_annos = []
        with open(vision_path, "r") as f:
            for line in f:
                try:
                    json_data = json.loads(line)
                    month_annos.append(json_data)
                except:
                    pass
        month_annotations_vision = pd.DataFrame(month_annos)
    print("Text annotations: ", len(month_annotations_text), "Vision annotations: ", len(month_annotations_vision))
    month_annotations_text.rename(columns={"id": "uuid", "reason":"frames-list-justification"}, inplace=True)
    common_uuids = set(month_annotations_text["uuid"]).intersection(set(month_annotations_vision["uuid"]))
    print("Common uuids: ", len(common_uuids))
    month_annotations = month_annotations_text.merge(month_annotations_vision, on="uuid", how="outer")
    print("Merged df length: ", len(month_annotations))
    month_annotations.to_json(f"/projects/frame_align/data/annotated/merged/raw/merged_anno_{month}.jsonl", orient="records", lines=True)

2023-08-01_2023-08-31
Text annotations:  75534 Vision annotations:  75442
Common uuids:  75442
Merged df length:  75534
2023-09-01_2023-09-30
Text annotations:  25811 Vision annotations:  25811
Common uuids:  25811
Merged df length:  25811
2023-05-01_2023-05-31
Text annotations:  75296 Vision annotations:  75296
Common uuids:  75296
Merged df length:  75296
2024-04-01_2024-04-30
Text annotations:  36269 Vision annotations:  36258
Common uuids:  36258
Merged df length:  36269
2023-07-01_2023-07-31
Text annotations:  71805 Vision annotations:  71723
Common uuids:  71723
Merged df length:  71805
2023-10-01_2023-10-31
Text annotations:  41016 Vision annotations:  41016
Common uuids:  41016
Merged df length:  41016
2024-03-01_2024-03-31
Text annotations:  36498 Vision annotations:  18161
Common uuids:  16498
Merged df length:  38161
2023-06-01_2023-06-30
Text annotations:  25567 Vision annotations:  25567
Common uuids:  25567
Merged df length:  25567
2024-01-01_2024-01-31
Text annotations: 

In [None]:
# Total number of annotations
total_annotations = 0
for month in month_set_dict.keys():
    month_annotations = pd.read_json(f"/projects/frame_align/data/annotated/merged/raw/merged_anno_{month}.jsonl", lines=True)
    total_annotations += len(month_annotations)
print("Total succesful annotations: ", total_annotations)

Total annotations:  506392


In [13]:
# Failed annotations
failed_text = 0
failed_vision = 0
text_path = Path("/projects/frame_align/data/annotated/text/")
vision_path = Path("/projects/frame_align/data/annotated/vision/")
for file in text_path.iterdir():
    if file.suffix == '.tsv':
        failed_df = pd.read_csv(file, sep="\t")
        failed_text += len(failed_df)
for file in vision_path.iterdir():
    if file.suffix == '.tsv':
        failed_df = pd.read_csv(file, sep="\t")
        failed_vision += len(failed_df)
print("Failed text annotations: ", failed_text)
print("Failed vision annotations: ", failed_vision)
    

Failed text annotations:  12288
Failed vision annotations:  68411


### Processing

In [9]:
frame_short_dict = {'economic': 'economic',
 'capacity and resources': 'cap&res',
 'morality': 'morality',
 'fairness and equality': 'fairness',
 'legality, constitutionality and jurisprudence': 'legality',
 'policy prescription and evaluation': 'policy',
 'crime and punishment': 'crime',
 'security and defense': 'security',
 'health and safety': 'health',
 'quality of life': 'quality_life',
 'cultural identity': 'culture',
 'public opinion': 'public_op',
 'political': 'political',
 'external regulation and reputation': 'regulation',
 'other': 'other'}

In [10]:
orig_col_names = ['topic', 'topic_justification', 'entity-name', 'entity-gender',
       'sentiment', 'sentiment-reason', 'frames-list',
       'frames-list-justification', 'issue_frame', 'issue_frame_justification',
       'uuid', 'title', 'vision_frames_frames-list', 'vision_frames_reason',
       'entity_entity-name', 'entity_entity-gender', 'entity_sentiment',
       'entity_sentiment-reason', 'image_url']
clean_col_names = ['text-topic', 'text-topic-exp', 'text-entity-name', 'text-entity-gender','text-entity-sentiment', 'text-entity-sentiment-exp', 'text-generic-frame',
       'text-generic-frame-exp', 'text-issue-frame', 'text-issue-frame-exp',
       'uuid', 'title', 'img-generic-frame', 'img-frame-exp',
       'img-entity-name', 'img-entity-gender', 'img-entity-sentiment',
       'img-entity-sentiment-exp', 'image-url']

In [11]:
raw_path = Path('/projects/frame_align/data/annotated/merged/raw')

In [12]:
nan_count_text = 0
nan_count_vision = 0
for month_file in raw_path.iterdir():
    month_df = pd.read_json(month_file, orient='records', lines=True)
    month_df = month_df[orig_col_names]
    month_df.columns = clean_col_names
    nan_count_text += month_df['text-generic-frame'].isna().sum()
    nan_count_vision += month_df['img-generic-frame'].isna().sum()
print("Text nan count: ", nan_count_text)
print("Vision nan count: ", nan_count_vision)

Text nan count:  3702
Vision nan count:  20225


### Adding Topics and Political Leaning

In [13]:
topic_mapping = json.load(open("/projects/frame_align/data/annotated/topics/latest_topic_labels.json", "r"))

In [None]:
political_leaning = {"left" : ['alternet.org', 'editor.cnn.com', 'democracynow.org', 'dailybeast.com', 'huffpost.com', 'theintercept.com','jacobin.com', 'motherjones.com', 'newyorker.com', 'slate.com',   'msnbc.com', 'vox.com'],
'left_lean' : ['abcnews.com','apnews.com', 'theatlantic.com', 'bloomberg.com', 'cbsnews.com', 'insider.com', 'nbcnews.com', 'thenytimes.com', 'npr.com', 'politico.com', 'propublica.org', 'time.com', 'washingtonpost.com', 'yahoonews.com','usatoday.com', 'theguardian.com'],
"center" : ['axios.com', 'bbc.com', 'forbes.com', 'newsweek.com', 'reuters.com', 'realclearpolitics.com', 'thehill.com'],
"right_lean" : ['thedispatch.com', 'theepochtimes.com', 'foxbusiness.com', 'ijr.com', 'nypost.com', 'thepostmillennial.com', 'washingtonexaminer.com', 'washingtontimes.com'],
"right" : ['theamericanconservative.com', 'theamericanspectator.com', 'breitbart.com', 'dailycaller.com', 'dailywire.com', 'dailymail.com', 'foxnews.com', 'newsmax.com', 'oann.com', 'thefederalist.com']}
all_hosts = [values for k, v in political_leaning.items() for values in v]
host_mapping = {host: k for k, v in political_leaning.items() for host in v}

In [33]:
orig_columns_to_keep = ['id', 'authors', 'date_publish', 'description','language', 'maintext', 'source_domain','url']

In [57]:
for month_file in raw_path.iterdir():
    month_df = pd.read_json(month_file, orient='records', lines=True)
    month_name = month_file.stem[12:]
    print(f"{month_name}, OG length: ", len(month_df), end=",")
    # Clean column names
    month_df = month_df[orig_col_names]
    month_df.columns = clean_col_names
    # Drop NaN framing values
    month_df.dropna(subset=['text-generic-frame', 'img-generic-frame'], inplace=True)
    # Shorten frame names
    clean_frame_preds_text = month_df['text-generic-frame'].apply(lambda frame_list: [frame_short_dict[frame.lower()] for frame in frame_list if frame.lower() in frame_short_dict])
    clean_frame_preds_text_len = clean_frame_preds_text.apply(len)
    clean_frame_preds_img = month_df['img-generic-frame'].apply(lambda frame_list: [frame_short_dict[frame.lower()] for frame in frame_list if frame.lower() in frame_short_dict])
    clean_frame_preds_img_len = clean_frame_preds_img.apply(len)
    month_df[(clean_frame_preds_text_len < 0) | (clean_frame_preds_img_len < 0)].to_csv(f"/projects/frame_align/data/annotated/merged/bad_preds/{month_name}_badpreds.csv", index=False)
    # month_df['text-generic-frame'] = 
    month_df['img-generic-frame-len'] = clean_frame_preds_img_len
    month_df['text-generic-frame-len'] = clean_frame_preds_text_len
    month_df = month_df[(month_df['img-generic-frame-len'] > 0) & (month_df['text-generic-frame-len'] > 0)]
    print(f" Invalid removal: {len(month_df)}", end=",")
    # Add GPT topics
    month_df['gpt-topic'] = month_df['text-topic'].apply(lambda x: topic_mapping[x] if x in topic_mapping else None)
    month_df = month_df[month_df['gpt-topic'] != "Sports"]
    print(f" Post sports: {len(month_df)}", end=",")
    orig_df = pd.read_csv(f"/projects/frame_align/data/raw/text/{month_name}/datawithtopics_merged.csv")
    merged_df = month_df.merge(orig_df[orig_columns_to_keep], left_on='uuid', right_on='id', how='left')
    merged_df.drop(columns=['id'], inplace=True)
    # Only keeping english articles
    merged_df = merged_df[merged_df['language'] == 'en']
    print(f" Post non-english: {len(merged_df)}", end=",")
    # Remove bbc and dailymail
    merged_df = merged_df[~merged_df['source_domain'].isin(['www.bbc.com', 'www.dailymail.com'])]
    print(f" Post bbc filter: {len(merged_df)}")
    # Add political leaning
    political_leaning = []
    for row_no, row in merged_df.iterrows():
        for host in all_hosts:
            if host in row['source_domain']:
                political_leaning.append(host_mapping[host])
                break
        else:
            political_leaning.append(None)
    assert len(political_leaning) == len(merged_df)
    merged_df['political_leaning'] = political_leaning
    month_df.to_json(f"/projects/frame_align/data/annotated/merged/processed/json/{month_file.stem}.jsonl", orient="records", lines=True)
    month_df.to_csv(f"/projects/frame_align/data/annotated/merged/processed/csv/{month_file.stem}.csv", index=False)

2024-02-01_2024-02-29, OG length:  14274, Invalid removal: 9455, Post sports: 8482, Post non-english: 8482
 Post bbc filter: 8482
2023-07-01_2023-07-31, OG length:  71805, Invalid removal: 48896, Post sports: 44141, Post non-english: 43066
 Post bbc filter: 40190
2024-01-01_2024-01-31, OG length:  39317, Invalid removal: 26734, Post sports: 25029, Post non-english: 25029
 Post bbc filter: 25029
2023-05-01_2023-05-31, OG length:  75296, Invalid removal: 51831, Post sports: 47695, Post non-english: 47105
 Post bbc filter: 43999
2024-04-01_2024-04-30, OG length:  36269, Invalid removal: 24412, Post sports: 22696, Post non-english: 22696
 Post bbc filter: 22696
2023-11-01_2023-11-30, OG length:  28781, Invalid removal: 19528, Post sports: 18497, Post non-english: 18497
 Post bbc filter: 18497
2023-10-01_2023-10-31, OG length:  41016, Invalid removal: 28285, Post sports: 26095, Post non-english: 25853
 Post bbc filter: 25456
2024-03-01_2024-03-31, OG length:  38161, Invalid removal: 10724, 

In [58]:
# Total number of annotations
total_annotations = 0
combined_annotations = []
for month in month_set_dict.keys():
    month_annotations = pd.read_json(f"/projects/frame_align/data/annotated/merged/processed/json/merged_anno_{month}.jsonl", lines=True)
    combined_annotations.append(month_annotations)
    total_annotations += len(month_annotations)
print("Total processed annotations: ", total_annotations)
combined_annotations = pd.concat(combined_annotations)
combined_annotations.to_json("/projects/frame_align/data/annotated/merged/processed/combined_annotations.jsonl", orient="records", lines=True)
combined_annotations.to_csv("/projects/frame_align/data/annotated/merged/processed/combined_annotations.csv", index=False)

Total processed annotations:  302897
