In [107]:
import json
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

In [108]:
# Run src/data/combine_annotations.py first to generate the combined annotations file

In [109]:
frames = ["Economic","Capacity and resources","Morality","Fairness and equality","Legality, constitutionality and jurisprudence","Policy prescription and evaluation","Crime and punishment","Security and defense","Health and safety","Quality of life","Cultural identity","Public opinion","Political","External regulation and reputation","Other"]
frames = [f.lower() for f in frames]

In [110]:
analysis_path = Path('/projects/frame_align/data/annotated/analysis')
text_analysis_path = analysis_path / 'text'
vision_analysis_path = analysis_path / 'vision'

In [111]:
combined_df = pd.read_csv(analysis_path/'combined_annotations.csv')
combined_df.shape

uuids = combined_df['text_id'].unique()
len(uuids)


111405

In [112]:
combined_df['text_frame_name'] = combined_df['text_frame_name'].str.lower()
combined_df['vision_frame-name'] = combined_df['vision_frame-name'].str.lower()

text_valid = combined_df['text_frame_name'].isin(frames)
vision_valid = combined_df['vision_frame-name'].isin(frames)
intersection = text_valid & vision_valid
combined_df = combined_df[intersection]

In [113]:
annotated_articles = []
data_dir = Path('/projects/frame_align/data/raw/2023-2024/')
for month_dir in data_dir.iterdir():
    if not month_dir.is_dir():
        continue
    df = pd.read_csv(month_dir/"datawithtopiclabels.csv")
    df = df[df['id'].isin(uuids)]
    annotated_articles.append(df)
annotated_articles = pd.concat(annotated_articles)
annotated_articles['month'].value_counts()
annotated_articles.to_csv(analysis_path.parent.parent / "srishti-analysis"/'annotated_articles.csv', index=False)

In [114]:
# annotated_articles.to_csv(analysis_path/'_srishti_annotated_articles.csv', index=False)

In [115]:
# annotated_articles = pd.read_csv(analysis_path/'_srishti_annotated_articles.csv')

In [116]:
annotated_articles.head(2)

Unnamed: 0,topic,auto_topic_label,id,authors,date_download,date_modify,date_publish,description,filename,image_url,language,localpath,maintext,source_domain,title,title_page,title_rss,url,month,topic_label
11900,-1,police_new_man_woman_says_year_school_old_arre...,d41fe6f9-2313-4f94-a8e2-82915987fa37,['Mauricio Maldonado'],2023-11-15 05:20:58+00:00,,2023-11-14 23:25:18,Lauderhill police detectives are investigating...,https%3A%2F%2Fwww.cbsnews.com%2Fmiami%2Fnews%2...,https://assets2.cbsnewsstatic.com/hub/i/r/2023...,en,,MIAMI - Lauderhill police detectives are inves...,www.cbsnews.com,Police: Juvenile shot in the neck in Lauderhill,,,https://www.cbsnews.com/miami/news/police-juve...,2023-11-01_2023-11-30,Crime
11901,-1,police_new_man_woman_says_year_school_old_arre...,c9c538db-7ee1-4467-8351-6bd74b844a3b,"['Kcal-News Staff', 'The Kcal News Staff Is A ...",2023-11-29 17:11:42+00:00,,2023-11-29 08:57:05,The ordinance is meant to curb the rental of p...,https%3A%2F%2Fwww.cbsnews.com%2Flosangeles%2Fn...,https://assets2.cbsnewsstatic.com/hub/i/r/2022...,en,,The Los Angeles City Council approved a draft ...,www.cbsnews.com,LA City Council moves forward with policy requ...,,,https://www.cbsnews.com/losangeles/news/la-cit...,2023-11-01_2023-11-30,Crime


In [117]:
print(annotated_articles.columns)
print("*"*50)
print(combined_df.columns)

Index(['topic', 'auto_topic_label', 'id', 'authors', 'date_download',
       'date_modify', 'date_publish', 'description', 'filename', 'image_url',
       'language', 'localpath', 'maintext', 'source_domain', 'title',
       'title_page', 'title_rss', 'url', 'month', 'topic_label'],
      dtype='object')
**************************************************
Index(['text_topic', 'text_topic_justification', 'text_summary',
       'text_entity_name', 'text_justification_entity_sentiment',
       'text_entity_sentiment', 'text_frame_justification', 'text_frame_id',
       'text_frame_name', 'text_tone', 'text_justification_tone',
       'text_issue_frame', 'text_issue_frame_justification', 'text_id',
       'vision_caption', 'vision_main-actor', 'vision_sentiment',
       'vision_sentiment-justification', 'vision_facial-expression',
       'vision_facial-expression-justification', 'vision_perceivable-gender',
       'vision_perceivable-gender-justification', 'vision_symbolic-object',
       '

In [118]:
# find common columns , exclude 'id' column
common_col = list(set(annotated_articles.columns) & set(combined_df.columns))
common_col


[]

In [119]:
# Drop common columns from annotated_articles and merge with combined_df
annotated_articles_df = annotated_articles.drop(columns=common_col)
merged_combined_df = combined_df.merge(annotated_articles_df, left_on='text_id', right_on='id', how='left')
# make text_id as first column
merged_combined_df = merged_combined_df[['text_id'] + [col for col in merged_combined_df.columns if col != 'text_id']]
merged_combined_df.head(2)

Unnamed: 0,text_id,text_topic,text_topic_justification,text_summary,text_entity_name,text_justification_entity_sentiment,text_entity_sentiment,text_frame_justification,text_frame_id,text_frame_name,...,language,localpath,maintext,source_domain,title,title_page,title_rss,url,month,topic_label
0,fd2357e4-f19e-429d-b9d1-4261661c0c0f,Cosy Crime,The article discusses the expected trend of 'c...,The article discusses the expected rise of 'co...,Cosy crime genre,The article discusses the cosy crime genre as ...,positive,The article discusses the trend of 'cosy crime...,11,cultural identity,...,en,,A sleuth in search of clues to putting on a su...,www.theguardian.com,Cosy crime dramas are killer tickets at the Ed...,,,https://www.theguardian.com/stage/2023/aug/06/...,2023-08-01_2023-08-31,Crime
1,ca6c063d-16af-43f2-b785-7b74c92a58e2,Sports,The article discusses the transfer of a footba...,Sheffield United have completed the signing of...,Cameron Archer,Archer's transfer to Sheffield United on a sig...,positive,The article primarily focuses on the financial...,1,economic,...,en,,Last updated on .From the section Sheff Utd\nC...,www.bbc.com,Cameron Archer: Sheff Utd sign England Under-2...,,,https://www.bbc.com/sport/football/66595410,2023-08-01_2023-08-31,Crime


In [120]:
merged_combined_df.columns

Index(['text_id', 'text_topic', 'text_topic_justification', 'text_summary',
       'text_entity_name', 'text_justification_entity_sentiment',
       'text_entity_sentiment', 'text_frame_justification', 'text_frame_id',
       'text_frame_name', 'text_tone', 'text_justification_tone',
       'text_issue_frame', 'text_issue_frame_justification', 'vision_caption',
       'vision_main-actor', 'vision_sentiment',
       'vision_sentiment-justification', 'vision_facial-expression',
       'vision_facial-expression-justification', 'vision_perceivable-gender',
       'vision_perceivable-gender-justification', 'vision_symbolic-object',
       'vision_symbolic-meaning', 'vision_symbolic-meaning-explanation',
       'vision_frame-id', 'vision_frame-name', 'vision_frame-justification',
       'vision_image_url', 'vision_title', 'vision_uuid', 'topic',
       'auto_topic_label', 'id', 'authors', 'date_download', 'date_modify',
       'date_publish', 'description', 'filename', 'image_url', 'language

In [121]:
len(merged_combined_df['topic_label'].unique())

11

In [131]:
merged_combined_df['source_domain'].value_counts()

source_domain
www.cbsnews.com                    28233
www.bbc.com                        12603
apnews.com                          7424
www.theguardian.com                 5778
www.forbes.com                      5568
nypost.com                          5041
www.foxnews.com                     3209
www.breitbart.com                   2607
www.axios.com                       2564
www.huffpost.com                    2475
www.nbcnews.com                     2247
www.newsweek.com                    2087
www.reuters.com                     2065
dailycaller.com                     2013
www.insider.com                     1854
www.usatoday.com                    1361
www.newsmax.com                     1236
www.washingtontimes.com              851
www.foxbusiness.com                  568
time.com                             538
thehill.com                          488
www.msnbc.com                        468
www.politico.com                     425
www.oann.com                         397
sl

In [122]:
merged_combined_df.shape

(103142, 51)

Filter length texts

In [123]:
merged_combined_df['maintext'] = merged_combined_df['maintext'].astype(str)
merged_combined_df['text_length'] = merged_combined_df['maintext'].apply(lambda x: len(x))

lower_quantile = merged_combined_df['text_length'].quantile(0.05)
upper_quantile = merged_combined_df['text_length'].quantile(0.95)

merged_combined_df = merged_combined_df[(merged_combined_df['text_length'] >= lower_quantile) & (merged_combined_df['text_length'] <= upper_quantile)]
merged_combined_df.shape

(92915, 52)

In [132]:
# left_hosts = ['alternet.org', 'editor.cnn.com', 'democracynow.org', 'dailybeast.com', 'huffpost.com', 'theintercept.com','jacobin.com', 'motherjones.com', 'newyorker.com', 'slate.com',   'msnbc.com', 'vox.com']
# left_lean_hosts = ['abcnews.com','apnews.com', 'theatlantic.com', 'bloomberg.com', 'cbsnews.com', 'insider.com', 'nbcnews.com', 'thenytimes.com', 'npr.com', 'politico.com', 'propublica.org', 'time.com', 'washingtonpost.com', 'yahoonews.com','usatoday.com', 'theguardian.com']
# center_hosts = ['axios.com', 'bbc.com', 'forbes.com', 'newsweek.com', 'reuters.com', 'realclearpolitics.com', 'thehill.com']
# right_lean_hosts = ['thedispatch.com', 'theepochtimes.com', 'foxbusiness.com', 'ijr.com', 'nypost.com', 'thepostmillennial.com', 'washingtonexaminer.com', 'washingtontimes.com']
# right_hosts = ['theamericanconservative.com', 'theamericanspectator.com', 'breitbart.com', 'dailycaller.com', 'dailywire.com', 'dailymail.com', 'foxnews.com', 'newsmax.com', 'oann.com', 'thefederalist.com']


In [130]:
merged_combined_df.to_csv(analysis_path.parent.parent / "srishti-analysis"/'merged_combined_annotations.csv', index=False)
print(f"Saved merged_combined_annotations.csv to {analysis_path.parent.parent / 'srishti-analysis'}")

Saved merged_combined_annotations.csv to /projects/frame_align/data/srishti-analysis


In [124]:
# for each month, get unique frame. for each frame, gte 2000 articles

months = merged_combined_df['month'].unique()
for month in months:
    month_df = merged_combined_df[merged_combined_df['month'] == month]
    vision_frame_counts = month_df['text_frame_name'].value_counts()
    text_frame_counts = month_df['vision_frame-name'].value_counts()
    print(f"Month frame count: {month} - vision: {len(vision_frame_counts)}, text: {len(text_frame_counts)}")

Month frame count: 2023-08-01_2023-08-31 - vision: 15, text: 12
Month frame count: 2023-09-01_2023-09-30 - vision: 15, text: 12
Month frame count: 2023-10-01_2023-10-31 - vision: 15, text: 9
Month frame count: 2023-11-01_2023-11-30 - vision: 15, text: 11
Month frame count: 2023-07-01_2023-07-31 - vision: 15, text: 12
Month frame count: 2023-12-01_2023-12-31 - vision: 14, text: 10
Month frame count: 2023-05-01_2023-05-31 - vision: 15, text: 12
Month frame count: 2023-06-01_2023-06-30 - vision: 15, text: 12


In [125]:
# for each month, per text frame, get 2000 unique articles
sampled_df = pd.DataFrame()
for month in months:
    for frame in frames:
        month_df = merged_combined_df[(merged_combined_df['month'] == month) & (merged_combined_df['text_frame_name'] == frame)]
        if month_df.shape[0] >=18:
            month_df = month_df.sample(n=18, random_state=1)
        sampled_df = pd.concat([sampled_df, month_df])

In [126]:
# sampled_df.to_csv(analysis_path/'sampled_annotated_articles_srishti.csv', index=False)

In [127]:
sampled_df.to_csv(analysis_path.parent.parent / "srishti-analysis"/'sampled_annotated_articles.csv', index=False)

In [128]:
frame_counts_pivot = sampled_df.pivot_table(index='month', columns='text_frame_name', values='text_id', aggfunc='count', fill_value=0)
frame_counts_pivot



text_frame_name,capacity and resources,crime and punishment,cultural identity,economic,external regulation and reputation,fairness and equality,health and safety,"legality, constitutionality and jurisprudence",morality,other,policy prescription and evaluation,political,public opinion,quality of life,security and defense
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-05-01_2023-05-31,18,18,18,18,18,18,18,18,18,18,18,18,15,18,18
2023-06-01_2023-06-30,18,18,18,18,5,18,18,18,18,18,18,18,16,18,18
2023-07-01_2023-07-31,18,18,18,18,10,18,18,18,18,18,18,18,18,18,18
2023-08-01_2023-08-31,18,18,18,18,12,18,18,18,18,18,18,18,18,18,18
2023-09-01_2023-09-30,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18
2023-10-01_2023-10-31,18,18,18,18,2,18,18,18,18,18,18,18,11,18,18
2023-11-01_2023-11-30,18,18,18,18,2,18,18,18,18,18,18,18,12,18,18
2023-12-01_2023-12-31,4,18,18,18,0,17,18,18,10,6,18,18,1,18,18
