In [1]:
# accumumate all headlines over a series of days based off a list of media organisation you want to include and start dates
import datetime
import pandas as pd
from collections import defaultdict
from mediadata import get_headline_df, get_biden_trump_dataframes, get_accumulated_headlines



In [2]:
start_date = datetime.date(2024, 3, 18)
end_date = datetime.date(2024, 3, 21)
media_orgs = ['FOX', 'CNN']

df = get_accumulated_headlines(start_date, end_date, media_orgs)

In [3]:
from nltk.corpus import stopwords
import nltk
# nltk.download()

def remove_stopwords(df: pd.DataFrame, column = 'Headline') -> pd.DataFrame:
    """Removes the stopwords from the Headline column of a dataframe"""
    set_stopwords = set(stopwords.words('English'))
    df_no_stopwords = df.copy()
    df_no_stopwords[column] = df_no_stopwords[column].apply(lambda x: ' '.join([i for i in x.split() if i not in set_stopwords]))
    return df_no_stopwords

df_no_stopwords = remove_stopwords(df)

In [4]:
biden, trump, _, _ = get_biden_trump_dataframes(df_no_stopwords)
biden.head()

Unnamed: 0,Headline,MediaOrg
8,"Benjamin Netanyahu blasts Schumer, Biden wanin...",FOX
20,Kirby asked point-blank Biden believes Netanya...,FOX
26,VP Harris urged leave Biden ticket amid ‘disap...,FOX
92,"Swing state voter blames Biden inflation, 'not...",FOX
148,Netanyahu condemns Biden Schumer misplaced pri...,FOX


In [14]:
from typing import List
def build_documents(df: pd.DataFrame):
    """Builds documents based off dataframe containing different values in the MediaOrg column"""
    documents = []
    all_media_orgs = list(set(df['MediaOrg']))
    for org in all_media_orgs:
        sub_df = df[df['MediaOrg'] == org]
        sub_df_document = ' '.join(list(sub_df['Headline']))
        documents.append(sub_df_document)
    return documents, all_media_orgs

def build_documents_for_organisation(dfs: List[pd.DataFrame]):
    labels = ['Biden', 'Trump', 'Both', 'Neither']
    documents = []
    for df in dfs:
        sub_df_document = ' '.join(list(df['Headline']))
        documents.append(sub_df_document)
    return documents, labels
    

def get_tf_idf_df(documents, labels):
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectoriser = TfidfVectorizer()
    vectors = vectoriser.fit_transform(documents)
    feature_names = vectoriser.get_feature_names_out()
    dense = vectors.todense()
    denselist = dense.tolist()
    tf_idf_df = pd.DataFrame(denselist, columns=feature_names)
    tf_idf_df.index = labels
    return tf_idf_df[[i for i in tf_idf_df.columns if not i.isnumeric()]].T

# get the documents for what each media organisation is saying about biden and trump
biden_documents, biden_labels = build_documents(biden)
trump_documents, trump_labels = build_documents(trump)

# get the tf_idf measure for biden and trump 
tf_idf_df = get_tf_idf_df(biden_documents, biden_labels)
trump_tf_idf = get_tf_idf_df(trump_documents, trump_labels)

In [7]:
tf_idf_df.sort_values('FOX', ascending=False).head(15)[['FOX']]

Unnamed: 0,FOX
biden,0.801116
new,0.127122
report,0.108962
event,0.108962
rips,0.090802
provide,0.072641
stability,0.072641
obama,0.072641
baby,0.072641
admin,0.072641


In [8]:
tf_idf_df.sort_values('CNN', ascending=False).head(15)[['CNN']]

Unnamed: 0,CNN
biden,0.431309
reinvigorate,0.202063
oversight,0.202063
parnas,0.202063
invite,0.202063
west,0.202063
officials,0.202063
critical,0.202063
seeks,0.202063
committee,0.202063


## This section compares 

In [32]:
cnn_df = df_no_stopwords[df_no_stopwords['MediaOrg'] == 'CNN'].copy()
fox_df = df_no_stopwords[df_no_stopwords['MediaOrg'] == 'FOX'].copy()


def get_tf_idf_analysis_for_single_org(df):
    cnn_biden, cnn_trump, cnn_both, cnn_neither = get_biden_trump_dataframes(df)
    cnn_documents, cnn_labels = build_documents_for_organisation([cnn_biden, cnn_trump, cnn_both, cnn_neither])
    cnn_tf_idf_df = get_tf_idf_df(cnn_documents, cnn_labels)
    return cnn_tf_idf_df
    
    
cnn_tf_idf_df = get_tf_idf_analysis_for_single_org(cnn_df)
fox_tf_idf_df = get_tf_idf_analysis_for_single_org(fox_df)

In [33]:

cnn_tf_idf_df.sort_values('Trump', ascending=False).head(10)

Unnamed: 0,Biden,Trump,Both,Neither
trump,0.0,0.759268,0.315728,0.222518
bond,0.0,0.167473,0.0,0.063805
ex,0.0,0.167473,0.0,0.068714
could,0.0,0.150725,0.0,0.088346
case,0.0,0.117231,0.0,0.034357
aide,0.0,0.117231,0.0,0.034357
prison,0.0,0.117231,0.0,0.053989
says,0.0,0.100484,0.0,0.196324
peter,0.0,0.100484,0.0,0.049081
make,0.0,0.100484,0.0,0.058897


In [34]:
fox_tf_idf_df.sort_values('Trump', ascending=False).head(10)

Unnamed: 0,Biden,Trump,Both,Neither
trump,0.0,0.844828,0.523916,0.216502
senate,0.0,0.15498,0.0,0.055603
case,0.013678,0.100376,0.0,0.06002
bloodbath,0.0,0.083646,0.161205,0.021436
ban,0.0,0.082656,0.0,0.034421
gop,0.0,0.082656,0.0,0.055603
primary,0.0,0.082656,0.0,0.037068
republican,0.0,0.072324,0.0,0.052955
legal,0.0,0.072324,0.0,0.037068
abc,0.0,0.072324,0.0,0.018534


In [35]:
df[

Unnamed: 0,Headline,MediaOrg
0,Suspect captured following fatal shooting of N...,FOX
1,How to turn your Android phone into a Wi-Fi ho...,FOX
2,Rickie Fowler screams at fan after awkward tee...,FOX
3,US holds conference on military AI use with do...,FOX
4,5 things to know about MLK Jr. and the legacy ...,FOX
...,...,...
232,How much are wind turbines dragging down home ...,CNN
233,She thought she’d never see the stranger she m...,CNN
234,Remember when Katharine Hepburn wore pants and...,CNN
235,‘That is not Bob Dylan:’ Fans criticize first ...,CNN
