# Daily Fail

Import required Python libraries

In [156]:
import pandas as pd
import requests
import spacy

from collections import Counter
from bs4 import BeautifulSoup

# Plain-text headline store, one headline per line: the download cell appends
# to it and the analysis cells read it back.
DATA_FILE = "headlines.txt"


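Use `requests` to download the latest headlines page and `BeautifulSoup` to parse it, then drop any headlines that already appear among the 250 most recent entries of the existing dataset and append the remaining new ones to the file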

In [157]:

# Fetch the headlines index page. requests has no default timeout, so one is
# set explicitly to stop the cell hanging forever on a stalled connection.
response = requests.get(
    "https://www.dailymail.co.uk/news/headlines/index.html", timeout=30
)
if response.ok:
    soup = BeautifulSoup(response.text, "html.parser")
    sidebar = soup.select("ul.link-bogr2 li span.pufftext strong")
    scraped_headlines = [headline.get_text(strip=True) for headline in sidebar]

    # Only the most recent 250 stored headlines are used for the duplicate
    # check. On a first run the file does not exist yet; treat that as "nothing
    # stored" so the append below can create it.
    try:
        with open(DATA_FILE, "r") as file:
            existing_headlines = [line.strip() for line in file.readlines()[-250:]]
    except FileNotFoundError:
        existing_headlines = []

    # A set gives constant-time membership tests and also lets us drop
    # headlines repeated within this scrape, while keeping page order.
    seen = set(existing_headlines)
    new_headlines = []
    for text in scraped_headlines:
        if text and text not in seen:  # skip empty strings and duplicates
            seen.add(text)
            new_headlines.append(text)

    with open(DATA_FILE, "a") as file:
        file.writelines(s + "\n" for s in new_headlines)

else:
    print("Error: ", response.status_code)

Load the spaCy English model and read all existing headlines back in from the text file

In [158]:
# Path of the headline store, then the spaCy pipeline used for entity recognition.
filename = DATA_FILE
nlp = spacy.load("en_core_web_lg")

# Pull every stored line into memory; trailing newlines are left in place here.
with open(filename, mode="r") as file:
    existing_headlines = list(file)

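Iterate through the headlines, run named entity recognition (NER) on each one, and count how often every (entity text, entity label) pair occurs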

In [159]:
# Tally every (entity text, entity label) pair spaCy finds across all
# headlines; each line is stripped of surrounding whitespace before parsing.
entity_counts = Counter(
    (ent.text, ent.label_)
    for headline in existing_headlines
    for ent in nlp(headline.strip()).ents
)


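1. Create a Pandas DataFrame from the Counter object
2. Split the "Entity" column into two separate columns "Text" and "Label"
3. Drop the "Entity" column
4. Keep only the entities labelled PERSON, ORG, GPE or WORK_OF_ART
5. Sort by "Count" (descending), breaking ties alphabetically by "Text"
6. Keep the top 20 rows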


In [160]:
# Show every row of a displayed frame (notebook-wide pandas display option).
pd.set_option('display.max_rows', None)

# Entity labels to keep in the report, and how many of the top rows to show.
LABELS_OF_INTEREST = ['PERSON', 'ORG', 'GPE', 'WORK_OF_ART']
TOP_N = 20

# One row per (text, label) pair with its occurrence count.
df = pd.DataFrame(entity_counts.items(), columns=['Entity', 'Count'])

# Split the (text, label) tuples into two columns. The target column names are
# given explicitly so the split still yields two (empty) columns when no
# entities were found; otherwise the assignment fails with
# "Columns must be same length as key".
df[['Text', 'Label']] = pd.DataFrame(
    df['Entity'].tolist(), index=df.index, columns=['Text', 'Label']
)

# Drop the tuple column, keep the labels of interest, then rank by count
# (descending) with ties broken alphabetically by entity text.
df = (
    df.drop('Entity', axis=1)
    .loc[lambda frame: frame['Label'].isin(LABELS_OF_INTEREST)]
    .sort_values(by=['Count', 'Text'], ascending=[False, True])
    .head(TOP_N)
)

df

Unnamed: 0,Count,Text,Label
20,11,Oscars,ORG
40,11,Vanity Fair,ORG
71,7,Academy Awards,ORG
14,7,Hollywood,GPE
124,6,BBC,ORG
50,6,Oscar,PERSON
42,5,Beverly Hills,GPE
34,5,Florence Pugh,PERSON
132,5,Joey Essex,PERSON
7,4,Best Picture,WORK_OF_ART
