In [1]:
# Accumulate all headlines over a series of days, given the list of media organisations to include and the start and end dates
import datetime
import pandas as pd
from collections import defaultdict
from mediadata import get_headline_df, get_biden_trump_dataframes, get_accumulated_headlines



In [15]:
# Date window and media organisations to collect headlines for.
# NOTE(review): whether end_date is inclusive is decided inside get_accumulated_headlines — confirm.
start_date = datetime.date(2024, 3, 18)
end_date = datetime.date(2024, 3, 21)
media_orgs = ['FOX', 'CNN']

# Collect the headlines via the mediadata helper; later cells read the
# 'Headline' and 'MediaOrg' columns of the result.
df = get_accumulated_headlines(start_date, end_date, media_orgs)

In [3]:
from nltk.corpus import stopwords

def remove_stopwords(df: pd.DataFrame, column = 'Headline', stop_words = None) -> pd.DataFrame:
    """Return a copy of ``df`` with stopwords removed from the text in ``column``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the text. It is not modified; a copy is returned.
    column : str, default 'Headline'
        Name of the column whose text is filtered.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stopword list is used
        (requires the NLTK ``stopwords`` corpus to be downloaded).

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` in which every string in ``column`` has been split on
        whitespace, filtered, and re-joined with single spaces.

    Notes
    -----
    * Matching is case-insensitive, so sentence-initial stopwords such as
      "The" or "How" are removed as well (the NLTK list is all lower-case).
    * Multi-letter all-caps tokens (e.g. "US", "IT", "WHO") are kept, because
      in headlines they are usually acronyms rather than the stopwords
      "us", "it", "who".
    * Non-string cells (e.g. NaN) are passed through unchanged.
    """
    if stop_words is None:
        # NLTK file ids are lower-case; 'English' only resolves on
        # case-insensitive filesystems and fails elsewhere.
        stop_words = stopwords.words('english')
    stopword_set = {word.lower() for word in stop_words}

    def _strip_stopwords(text):
        # Leave missing / non-text cells alone instead of crashing on .split().
        if not isinstance(text, str):
            return text
        kept = []
        for token in text.split():
            is_acronym = len(token) > 1 and token.isupper()
            if is_acronym or token.lower() not in stopword_set:
                kept.append(token)
        return ' '.join(kept)

    df_no_stopwords = df.copy()
    df_no_stopwords[column] = df_no_stopwords[column].apply(_strip_stopwords)
    return df_no_stopwords

# Strip stopwords from the 'Headline' column; `df` itself is left untouched
# because remove_stopwords works on a copy.
df_no_stopwords = remove_stopwords(df)

In [27]:
# Split the cleaned headlines into per-candidate frames; the last two return
# values are not needed here and are discarded.
# NOTE(review): presumably rows are selected using the contains_biden /
# contains_trump flags visible in the output — confirm in mediadata.
biden, trump, _, _ = get_biden_trump_dataframes(df_no_stopwords)
biden.head()

Unnamed: 0,Headline,MediaOrg,contains_biden,contains_trump,contains_both
14,Biden's mysterious new shoes reportedly provid...,FOX,True,False,False
39,President Biden's mysterious new shoes reporte...,FOX,True,False,False
119,VP Harris urged leave Biden ticket amid ‘disap...,FOX,True,False,False
156,"Benjamin Netanyahu blasts Schumer, Biden wanin...",FOX,True,False,False
160,DHS responds migrant flown US Biden program al...,FOX,True,False,False


In [29]:
def build_documents(df: pd.DataFrame):
    """Build one text document per media organisation.

    All headlines belonging to the same value of the ``MediaOrg`` column are
    joined with single spaces into one string.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``MediaOrg`` column and a ``Headline`` column.

    Returns
    -------
    documents : list of str
        One concatenated document per organisation.
    all_media_orgs : list
        The organisation labels, aligned position-by-position with
        ``documents``.

    Notes
    -----
    Organisations are returned in order of first appearance in ``df`` so the
    result is reproducible across kernel restarts (iterating a ``set`` of
    strings is not). Rows with a missing organisation are skipped and missing
    headlines are ignored.
    """
    documents = []
    all_media_orgs = []
    # sort=False keeps first-appearance order; groupby drops NaN keys by default.
    grouped_headlines = df.groupby('MediaOrg', sort=False, observed=True)['Headline']
    for org, headlines in grouped_headlines:
        all_media_orgs.append(org)
        # dropna/astype guard str.join against NaN or non-string cells.
        documents.append(' '.join(headlines.dropna().astype(str)))
    return documents, all_media_orgs

def get_tf_idf_df(documents, labels):
    """Compute a TF-IDF table for a collection of documents.

    Parameters
    ----------
    documents : list of str
        The texts to vectorise, one per label.
    labels : list
        Names for the documents, in the same order as ``documents``; they
        become the column names of the result.

    Returns
    -------
    pd.DataFrame
        TF-IDF scores with one row per term and one column per label.
        Purely numeric terms (e.g. "2024") are dropped.
    """
    # Imported locally so the helper carries its own scikit-learn dependency.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectoriser = TfidfVectorizer()
    vectors = vectoriser.fit_transform(documents)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old installations.
    if hasattr(vectoriser, 'get_feature_names_out'):
        feature_names = list(vectoriser.get_feature_names_out())
    else:
        feature_names = list(vectoriser.get_feature_names())

    # toarray() densifies the sparse matrix directly (documents x terms).
    tf_idf_df = pd.DataFrame(vectors.toarray(), columns=feature_names)
    tf_idf_df.index = labels

    non_numeric_terms = [term for term in tf_idf_df.columns if not term.isnumeric()]
    # Transpose so terms are rows and each label is a column.
    return tf_idf_df[non_numeric_terms].T

# Build one document per media organisation from the Biden headlines and
# from the Trump headlines (labels are aligned with the documents)
biden_documents, biden_labels = build_documents(biden)
trump_documents, trump_labels = build_documents(trump)

# Compute the TF-IDF tables (terms as rows, one column per organisation).
# NOTE: `tf_idf_df` holds the Biden scores; `trump_tf_idf` holds the Trump scores.
tf_idf_df = get_tf_idf_df(biden_documents, biden_labels)
trump_tf_idf = get_tf_idf_df(trump_documents, trump_labels)

In [45]:
# Top 15 terms by TF-IDF score in the FOX column of the Biden table.
tf_idf_df.sort_values('FOX', ascending=False).head(15)[['FOX']]

Unnamed: 0,FOX
biden,0.801116
new,0.127122
report,0.108962
event,0.108962
rips,0.090802
provide,0.072641
stability,0.072641
obama,0.072641
baby,0.072641
admin,0.072641


In [46]:
# Top 15 terms by TF-IDF score in the CNN column of the Biden table.
tf_idf_df.sort_values('CNN', ascending=False).head(15)[['CNN']]

Unnamed: 0,CNN
biden,0.431309
reinvigorate,0.202063
oversight,0.202063
parnas,0.202063
invite,0.202063
west,0.202063
officials,0.202063
critical,0.202063
seeks,0.202063
committee,0.202063
