In [1]:
import datetime
from urllib.parse import urlsplit

import advertools as adv
import pandas as pd
import plotly
import plotly.express as px

import warnings
# Notebook-wide settings: silence every warning and let pandas print all
# columns of wide frames instead of eliding the middle ones.
warnings.filterwarnings(action='ignore')
pd.set_option('display.max_columns', None)

In [2]:
# Timestamp of this run in UTC. Computed first because the year also drives
# the NYT URL filter and the stop-word list below.
# `datetime.utcnow()` is deprecated since Python 3.12, so a timezone-aware
# "now" is used instead; the formatted string is the same.
now_raw = datetime.datetime.now(datetime.timezone.utc)
now = now_raw.strftime('%d %b, %Y')
current_year = str(now_raw.year)

# (news sitemap URL, filter function) pairs. Each filter receives the sitemap
# DataFrame and returns the rows to keep; `lambda x: x` keeps everything.
sitemap_filterfunc = [
    ('https://www.ft.com/sitemaps/news.xml',  lambda x: x),
    # Keep only URLs containing the current year as a path segment.
    # Previously hard-coded to '/2022/', which drops every row in later years.
    # NOTE(review): presumably meant to keep current-year article URLs only — confirm.
    ('https://www.nytimes.com/sitemaps/new/news.xml.gz',
     lambda df: df[df['loc'].str.contains(f'/{current_year}/')]),
    ('https://www.bbc.com/sitemaps/https-index-com-news.xml',
     lambda df: df[df['publication_name'].eq('BBC News')]),
    ('https://www.economist.com/googlenews.xml', lambda x: x),
    ('https://www.bloomberg.com/feeds/bbiz/sitemap_news.xml', lambda x: x),
    ('https://news.sky.com/sitemap/sitemap-news.xml', lambda x: x),
    ('https://www.washingtonpost.com/arcio/news-sitemap/', lambda x: x),
    ('https://www.foxnews.com/sitemap.xml?type=news', lambda x: x),
    ('https://www.reuters.com/arc/outboundfeeds/news-sitemap-index/?outputType=xml', lambda x: x),
    ('https://www.cnbc.com/sitemap_news.xml', lambda x: x),
    # Disabled source, kept for reference:
    # ('https://news.yahoo.com/sitemaps/news-sitemap_googlenewsindex_US_en-US.xml.gz', lambda x: x),
]

# One row per configured sitemap, with the host name as a join key.
sitemap_filterfunc_df = pd.DataFrame(sitemap_filterfunc,
                                     columns=['sitemap', 'filter_func'])
sitemap_filterfunc_df['netloc'] = [urlsplit(u).netloc for u in sitemap_filterfunc_df['sitemap']]

# Words and symbols excluded from the headline word counts.
stopwords = ['to', 'of', 'the', 'in', 'for', 'and', 'on', 'a', 'as', 'with',
             'from', 'over', 'is', 'at', '—', '-', 'be', '2022', '–', 'it', 'by',
             'we', 'why', 'but', 'my', 'how', 'not', 'an', 'are', 'no', 'go',
             'your', 'up', 'his', 'its', 'this', 'says', 'can', 'if', 'you',
             'will']
# The list already excludes '2022'; exclude the year of the current run too.
if current_year not in stopwords:
    stopwords.append(current_year)

In [3]:
# Download every configured news sitemap into its own DataFrame; the frames
# are combined in a later cell.
sitemapdfs = [
    adv.sitemap_to_df(sitemap_url)
    for sitemap_url in sitemap_filterfunc_df['sitemap']
]

2022-03-16 00:44:19,218 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.ft.com/sitemaps/news.xml
2022-03-16 00:44:20,557 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nytimes.com/sitemaps/new/news-4.xml.gz
2022-03-16 00:44:20,634 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nytimes.com/sitemaps/new/news-3.xml.gz
2022-03-16 00:44:20,676 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nytimes.com/sitemaps/new/news-2.xml.gz
2022-03-16 00:44:20,713 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.nytimes.com/sitemaps/new/news-1.xml.gz
2022-03-16 00:44:20,999 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.bbc.com/sitemaps/https-sitemap-com-news-3.xml
2022-03-16 00:44:21,008 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.bbc.com/sitemaps/https-sitemap-com-news-5.xml
2022-03-16 00:44:21,137 | INFO | sitemaps.py:536 | sitemap_to_df | Getting https://www.bbc.com/sitemaps/https-sitemap-

In [4]:
# Stack all downloaded sitemaps into one frame with a fresh 0..n-1 index,
# then derive each row's host name from the sitemap URL it came from.
sitemapdfs_concat = pd.concat(sitemapdfs, ignore_index=True)
sitemapdfs_concat['netloc'] = sitemapdfs_concat['sitemap'].map(
    lambda sitemap_url: urlsplit(sitemap_url).netloc
)

In [5]:
# Attach each configured filter function to the article rows by host name.
# A left join keeps rows whose host has no configured filter (their
# `filter_func` is then missing).
all_sitemapdfs = sitemapdfs_concat.merge(sitemap_filterfunc_df,
                                         on='netloc', how='left')

In [6]:
def news_sitemap_wordcount(news_sitemap, phrase_len=1, showtop=20,
                           filter_func=lambda df: df):
    """Count the most frequent words/phrases in a news sitemap's headlines.

    Parameters
    ----------
    news_sitemap : DataFrame
        Sitemap rows; must contain a ``news_title`` column after filtering.
    phrase_len : int
        Passed to ``adv.word_frequency`` as ``phrase_len`` (1 and 2 are used
        in this notebook for 1-grams and 2-grams).
    showtop : int
        Number of leading rows of the frequency table to return.
    filter_func : callable
        Applied to ``news_sitemap`` before counting; the default keeps
        every row.

    Returns
    -------
    DataFrame
        The first ``showtop`` rows of the ``adv.word_frequency`` result, with
        its ``abs_freq`` column renamed to ``count``.
    """
    news_sitemap = filter_func(news_sitemap)
    # `stopwords` is the notebook-level list of words to exclude.
    title = adv.word_frequency(news_sitemap['news_title'],
                               rm_words=stopwords,
                               phrase_len=phrase_len)
    title = title.rename(columns={'abs_freq': 'count'})
    # The unused per-call timestamp locals (`now_raw`, `now`) were removed;
    # they were never read inside this function.
    return title[:showtop]

def plot_word_counts(df, name, num_articles):
    """Build a horizontal bar chart of word counts for one publication.

    Parameters
    ----------
    df : DataFrame
        Must have ``word`` and ``count`` columns.
    name : str
        Publication name, shown in bold in the chart title.
    num_articles : int
        Article count shown in parentheses in the chart title.

    Returns
    -------
    plotly Figure
        A 450x600 figure with one bar trace.
    """
    heading = f'<b>{name}</b> - ({num_articles} articles)'
    # Rows are plotted in reverse order — presumably so the first row of `df`
    # ends up at the top of the horizontal chart.
    reversed_rows = df[::-1]
    chart = px.bar(reversed_rows, x='count', y='word', orientation='h',
                   height=600, width=450, hover_name='word',
                   title=heading, template='none')
    # Wider left margin for the word labels; no zero line on the x axis.
    chart.update_layout(margin={'l': 150}, xaxis={'zeroline': False})
    return chart

In [7]:
# Publications to chart; rows whose `publication_name` is not listed here
# are skipped.
included_publications = [
    'Financial Times',
    'The New York Times',
    'BBC News',
    'The Economist',
    'Bloomberg',
    'Sky News',
    'Washington Post',
    'Fox News',
    'Reuters',
    'CNBC',
    # 'Yahoo News',
]

# Two figures are appended per publication: the 1-gram chart, then the 2-gram
# chart. Publications appear in the order they first occur in the data.
figures = []
barcolors = plotly.colors.qualitative.T10 + ['#BAB0AC']
count = -1  # index of the current publication, used to pick its bar color
for pub in all_sitemapdfs['publication_name'].unique():
    if pub not in included_publications:
        continue
    count += 1

    # Both depend only on the publication, so compute them once here and not
    # once per n-gram size.
    temp_df = all_sitemapdfs[all_sitemapdfs['publication_name'].eq(pub)]
    filter_func = temp_df['filter_func'].iloc[0]
    if not callable(filter_func):
        # No filter was matched in the left merge (the value is missing, e.g.
        # NaN), so keep every row.
        filter_func = lambda x: x

    for ngram in [1, 2]:
        wordcount_df = news_sitemap_wordcount(
            temp_df,
            phrase_len=ngram,
            filter_func=filter_func)
        # NOTE(review): `num_articles` counts rows before `filter_func` is
        # applied, so it can exceed the number of headlines actually counted
        # (e.g. for a publication with a URL filter).
        fig = plot_word_counts(wordcount_df,
                               num_articles=len(temp_df),
                               name=pub)
        # Wrap around so a longer publication list cannot overrun the palette.
        fig.data[0].marker.color = barcolors[count % len(barcolors)]
        fig.layout.title.font.color = '#416649'
        figures.append(fig)


In [8]:
from plotly.subplots import make_subplots

# Lay the per-publication charts out on a 4-column grid, filled row by row.
# The row count is derived from the number of figures, so a missing
# publication no longer raises IndexError; 20 figures still give 5 rows.
n_cols = 4
n_rows = max(1, -(-len(figures) // n_cols))  # ceiling division, at least one row
fig = make_subplots(rows=n_rows, cols=n_cols, vertical_spacing=0.04, horizontal_spacing=0.08,
                    subplot_titles=[f.layout.title.text for f in figures])

for position, word_fig in enumerate(figures):
    grid_row, grid_col = divmod(position, n_cols)
    # Subplot rows and columns are 1-based.
    fig.add_trace(trace=word_fig.data[0], row=grid_row + 1, col=grid_col + 1)

In [10]:
# Total number of rows belonging to the charted publications (shown in the
# main title).
num_articles = len(all_sitemapdfs[all_sitemapdfs['publication_name'].isin(included_publications)])
fig.layout.height = 2600
fig.layout.template = 'gridon'
fig.layout.margin.l = 100
fig.layout.margin.t = 200
# The bold tag is now closed with </b>; it was previously a second opening
# <b>, which left the tag unterminated.
fig.layout.title = f'<b>Most used words in article headlines 1- & 2-grams</b><br>{now} ({num_articles:,} articles)<br><br>'
# NOTE(review): absolute, machine-specific output path — consider a
# configurable relative path so the notebook runs elsewhere.
fig.write_html('/Users/me/Desktop/temp/newsfig.html', config={'displayModeBar': False})