In [25]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Boilerplate markers: `scraper` stops collecting paragraphs as soon as one of
# these substrings appears in a paragraph's text.
BBC_end_phrases = [
    "Follow BBC",
    "Follow Essex news",
    "@bbc",
    "Instagram, external",
    "@BBCAfrica",
    "Sign up for our morning newsletter",
    "Listen to highlights from",
]
Metro_end_phrases = [
    "@metro.co.uk",
    "Arrow\nMORE",
    "Metro.co.uk",
    "Follow Metro Sport",
]

def scraper(row=None, url=None, src=None, get_images=False, timeout=30):
    """Download a news article and return its body text as a list of strings.

    The article location can be given either as a ``row`` (anything that
    supports ``row["link"]`` and ``row["source"]``, e.g. a DataFrame row passed
    by ``DataFrame.apply(..., axis=1)``) or as explicit ``url`` / ``src``
    arguments. When ``row`` is given it takes precedence.

    Args:
        row: Optional mapping with ``"link"`` and ``"source"`` entries.
        url: Article URL, used when ``row`` is None.
        src: Source name; one of ``"BBC"``, ``"Daily mail"`` or ``"Metro"``.
        get_images: Currently unused; kept for interface compatibility.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of text fragments (one per paragraph or line) on success, or
        None when the source is unsupported or anything goes wrong while
        downloading or parsing. Failures are reported with ``print`` rather
        than raised, so a bulk scrape keeps going past bad links.
    """
    if row is not None:
        url = row["link"]
        src = row["source"]

    try:
        # Reject unsupported outlets before spending a network round-trip.
        if src not in ("BBC", "Daily mail", "Metro"):
            print("{} can't be scraped at the moment, sorry".format(src))
            return None

        # A timeout keeps one stalled server from hanging the whole run;
        # raise_for_status keeps HTTP error pages out of the corpus.
        raw = requests.get(url, timeout=timeout)
        raw.raise_for_status()
        soup = BeautifulSoup(raw.content, "lxml")

        if src == "BBC":
            article = []
            for tag in soup.find("article").find_all("p"):
                text = tag.get_text()
                # Stop at the first promo block or end-of-article phrase.
                if "PromoHeadline" in str(tag) or any(
                    phrase in text for phrase in BBC_end_phrases
                ):
                    break
                article.append(text)

        elif src == "Daily mail":
            # NOTE: this catches junk text for some articles.
            fulltext = soup.find("div", {"itemprop": "articleBody"}).get_text(
                separator="\n", strip=True
            )
            article = fulltext.split("\n")

        else:  # src == "Metro"
            article = []
            content = soup.find("div", {"class": "article__content"})
            for tag in content.find_all("p"):
                text = tag.get_text()
                if any(phrase in text for phrase in Metro_end_phrases):
                    break
                # Paragraphs whose markup opens a link in a new tab are
                # skipped (presumably promo/related-link blurbs).
                if 'target="_blank"' not in str(tag):
                    article.append(text)

        return article

    except Exception as error:
        # Best-effort scraping: report the failing URL and carry on.
        print(url, error)
        return None


In [None]:
import os

# Load the annotated link table; only the outlet code and the article URL
# are needed for scraping.
df = pd.read_csv("data/migrant-dehumanization.csv")[["source", "link"]]
# Translate numeric outlet codes into the names `scraper` dispatches on.
df["source"] = df["source"].replace({1: "Metro", 2: "Financial Times", 3: "Daily mail", 4: "BBC"})
dfs = []

# Run configuration.
to_scrape = ["BBC", "Daily mail", "Metro"]
drop_duplicates = True
scrape = True
export = True

if drop_duplicates:
    df = df.drop_duplicates()

if scrape:
    # One result per outlet, appended in the same order as `to_scrape`.
    for site in to_scrape:
        data = df[df["source"] == site].apply(scraper, axis=1)
        dfs.append(data)

if export:
    # Create the output directory up front so a missing folder cannot throw
    # away a completed scrape at the very last step.
    os.makedirs("data/corpus", exist_ok=True)
    # Pair each outlet with its results directly, without rebinding `df`
    # (the link table) inside the loop.
    for site, scraped in zip(to_scrape, dfs):
        # Rows where scraping failed hold None and are dropped.
        scraped.dropna().to_csv("data/corpus/{}_corpus.txt".format(site))