In [37]:
# accumumate all headlines over a series of days based off a list of media organisation you want to include and start dates
import datetime
import pandas as pd
from collections import defaultdict
from mediadata import get_headline_df, get_biden_trump_dataframes, get_accumulated_headlines
from tf_idf import remove_stopwords, build_documents, get_tf_idf_df


In [3]:
start_date = datetime.date(2024, 3, 18)
end_date = datetime.date(2024, 3, 21)
media_orgs = ['FOX', 'CNN', 'ABCUS', 'SKYUS', 'NYPOST', 'DAILY', 'MSN', 'NBC', 'SMH', 'ABCAU', 'NINE', 'NEWSCOM', ]

df = get_accumulated_headlines(start_date, end_date, media_orgs)

In [5]:
from nltk.corpus import stopwords
import nltk
# nltk.download()

def remove_stopwords(df: pd.DataFrame, column = 'Headline') -> pd.DataFrame:
    """Removes the stopwords from the Headline column of a dataframe"""
    set_stopwords = set(stopwords.words('English'))
    df_no_stopwords = df.copy()
    df_no_stopwords[column] = df_no_stopwords[column].apply(lambda x: ' '.join([i for i in x.split() if i not in set_stopwords]))
    return df_no_stopwords

df_no_stopwords = remove_stopwords(df)

In [15]:
biden, trump, _, _ = get_biden_trump_dataframes(df)

In [17]:
from typing import List
def build_documents(df: pd.DataFrame):
    """Builds documents based off dataframe containing different values in the MediaOrg column"""
    documents = []
    all_media_orgs = list(set(df['MediaOrg']))
    for org in all_media_orgs:
        sub_df = df[df['MediaOrg'] == org]
        sub_df_document = ' '.join(list(sub_df['Headline']))
        documents.append(sub_df_document)
    return documents, all_media_orgs

def build_documents_for_organisation(dfs: List[pd.DataFrame]):
    labels = ['Biden', 'Trump', 'Both', 'Neither']
    documents = []
    for df in dfs:
        sub_df_document = ' '.join(list(df['Headline']))
        documents.append(sub_df_document)
    return documents, labels
    

def get_tf_idf_df(documents, labels):
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf_vectorizer=TfidfVectorizer(use_idf=True)

    # just send in all your docs here
    fitted_vectorizer=tfidf_vectorizer.fit(documents)
    vectors=fitted_vectorizer.transform(documents)
    
    feature_names = fitted_vectorizer.get_feature_names_out()
    dense = vectors.todense()
    denselist = dense.tolist()
    tf_idf_df = pd.DataFrame(denselist, columns=feature_names)
    tf_idf_df.index = labels
    return tf_idf_df[[i for i in tf_idf_df.columns if not i.isnumeric()]].T

# get the documents for what each media organisation is saying about biden and trump
biden_documents, biden_labels = build_documents(biden)
trump_documents, trump_labels = build_documents(trump)

# get the tf_idf measure for biden and trump 
tf_idf_df = get_tf_idf_df(biden_documents, biden_labels)
# trump_tf_idf = get_tf_idf_df(trump_documents, trump_labels)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
documents = biden_documents
tfidf_vectorizer=TfidfVectorizer(use_idf=True)

# just send in all your docs here
fitted_vectorizer=tfidf_vectorizer.fit(documents)
vectors=fitted_vectorizer.transform(documents)


# Get idf scores
idf_scores = fitted_vectorizer.idf_
feature_names = fitted_vectorizer.get_feature_names_out()
# Create a dictionary mapping words to their idf scores
word_to_idf = dict(zip(feature_names, idf_scores))

# Choose a document to inspect, for example the first document
first_vector = vectors[0]

# Create a dictionary mapping words to their tf scores for the chosen document
df = pd.DataFrame(first_vector.T.todense(), index=feature_names, columns=["tf"])
word_to_tf = df.to_dict()["tf"]


In [None]:
word_to_idf


In [21]:
tf_idf_df.sort_values('FOX', ascending=False)

Unnamed: 0,SKYUS,FOX,DAILY,NINE,CNN,NYPOST,NBC,MSN,ABCUS,SMH
biden,0.134998,0.505591,0.251192,0.194979,0.230276,0.386350,0.391481,0.375753,0.282163,0.168814
to,0.000000,0.301044,0.000000,0.000000,0.151804,0.156734,0.184339,0.099082,0.186010,0.111287
after,0.000000,0.206249,0.000000,0.000000,0.000000,0.136666,0.000000,0.000000,0.000000,0.000000
the,0.147012,0.153926,0.000000,0.000000,0.000000,0.129456,0.243611,0.163676,0.000000,0.245115
in,0.044497,0.139771,0.000000,0.000000,0.101203,0.215509,0.221207,0.247706,0.232512,0.000000
...,...,...,...,...,...,...,...,...,...,...
it,0.000000,0.000000,0.000000,0.000000,0.000000,0.102499,0.000000,0.129594,0.000000,0.000000
islanders,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.064295,0.043198,0.000000,0.000000
islamophobic,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.050816,0.000000,0.000000
invite,0.000000,0.000000,0.000000,0.000000,0.176490,0.034166,0.000000,0.000000,0.000000,0.000000


In [79]:
tf_idf_df = get_tf_idf_df(biden_documents, biden_labels)
tf_idf_df.sort_values('CNN', ascending=False)[['CNN']]

Unnamed: 0,CNN
biden,0.431309
reinvigorate,0.202063
oversight,0.202063
parnas,0.202063
invite,0.202063
...,...
expose,0.000000
excuse,0.000000
ex,0.000000
event,0.000000


In [65]:
word = 'Trump'
# fox count
print(trump_documents[0].count(word))
tf_0 = trump_documents[0].count(word)/len(trump_documents[0].split())
# cnn count
print(trump_documents[1].count(word))
tf_1 = trump_documents[1].count(word)/len(trump_documents[1].split())

import math
print(f"Fox tf-idf  {tf_0 * math.log((1 + 2)/(1 + 101/(101 + 56)))}" )

print(f"CNN tf-idf {tf_1 * math.log((1 + 2)/(1 + 56/(101 + 56)))}")

101
56
Fox tf-idf  0.06140580749350993
CNN tf-idf 0.08730784279998348


In [41]:
trump_tf_idf.sort_values('FOX', ascending=False).head(15)[['FOX']]

Unnamed: 0,FOX
trump,0.836558
senate,0.124241
case,0.099393
bloodbath,0.082828
tiktok,0.081488
watch,0.069847
race,0.069847
accuses,0.069847
primary,0.066262
ban,0.066262


In [43]:
trump_tf_idf.sort_values('CNN', ascending=False).head(25)[['CNN']]

Unnamed: 0,CNN
trump,0.777
bond,0.13875
ex,0.13875
could,0.124875
properties,0.097504
historic,0.097504
prosecution,0.097504
aide,0.097125
case,0.097125
prison,0.097125


## This section compares 

In [32]:
cnn_df = df_no_stopwords[df_no_stopwords['MediaOrg'] == 'CNN'].copy()
fox_df = df_no_stopwords[df_no_stopwords['MediaOrg'] == 'FOX'].copy()


def get_tf_idf_analysis_for_single_org(df):
    cnn_biden, cnn_trump, cnn_both, cnn_neither = get_biden_trump_dataframes(df)
    cnn_documents, cnn_labels = build_documents_for_organisation([cnn_biden, cnn_trump, cnn_both, cnn_neither])
    cnn_tf_idf_df = get_tf_idf_df(cnn_documents, cnn_labels)
    return cnn_tf_idf_df
    
    
cnn_tf_idf_df = get_tf_idf_analysis_for_single_org(cnn_df)
fox_tf_idf_df = get_tf_idf_analysis_for_single_org(fox_df)

In [36]:

cnn_tf_idf_df.sort_values('Trump', ascending=False).head(20)

Unnamed: 0,Biden,Trump,Both,Neither
trump,0.0,0.759268,0.315728,0.222518
bond,0.0,0.167473,0.0,0.063805
ex,0.0,0.167473,0.0,0.068714
could,0.0,0.150725,0.0,0.088346
case,0.0,0.117231,0.0,0.034357
aide,0.0,0.117231,0.0,0.034357
prison,0.0,0.117231,0.0,0.053989
says,0.0,0.100484,0.0,0.196324
peter,0.0,0.100484,0.0,0.049081
make,0.0,0.100484,0.0,0.058897


In [34]:
fox_tf_idf_df.sort_values('Trump', ascending=False).head(10)

Unnamed: 0,Biden,Trump,Both,Neither
trump,0.0,0.844828,0.523916,0.216502
senate,0.0,0.15498,0.0,0.055603
case,0.013678,0.100376,0.0,0.06002
bloodbath,0.0,0.083646,0.161205,0.021436
ban,0.0,0.082656,0.0,0.034421
gop,0.0,0.082656,0.0,0.055603
primary,0.0,0.082656,0.0,0.037068
republican,0.0,0.072324,0.0,0.052955
legal,0.0,0.072324,0.0,0.037068
abc,0.0,0.072324,0.0,0.018534


In [35]:
df[

Unnamed: 0,Headline,MediaOrg
0,Suspect captured following fatal shooting of N...,FOX
1,How to turn your Android phone into a Wi-Fi ho...,FOX
2,Rickie Fowler screams at fan after awkward tee...,FOX
3,US holds conference on military AI use with do...,FOX
4,5 things to know about MLK Jr. and the legacy ...,FOX
...,...,...
232,How much are wind turbines dragging down home ...,CNN
233,She thought she’d never see the stranger she m...,CNN
234,Remember when Katharine Hepburn wore pants and...,CNN
235,‘That is not Bob Dylan:’ Fans criticize first ...,CNN
