In [1]:
import requests
from bs4 import BeautifulSoup
import re

We want to reproduce the methodology used for the original collection of the documents, so we filter the headlines, keeping those that mention immigrants, Muslims and Roma. We use the same set of neutral(?) keywords originally used by Poletto et al. (2017):

ethnic group  | religion    | Roma
--------------|-------------|------
immigrat*     | terrorismo  | rom
immigrazione  | terrorist*  | nomad*
migrant*      | islam       |
stranier*     | mussulman*  |
profug*       | corano      |

We also add the frequently used variant "musulman*".

In [2]:
# Keyword filter, meant to be applied to lower-cased headlines.
# A trailing "." stands for the inflectional ending, e.g. "migrant." matches
# "migranti" and "migrante"; "stranier." matches "stranieri" and "straniero".
# "rom" is delimited with \b so that it matches only as a whole word, including
# at the very start or end of a headline, and never inside words such as "roma".
keywords = re.compile(
    r"immigrat.|immigrazione|migrant.|stranier.|profug.|terroris(mo|t.)"
    r"|islam|mus(s)?ulman.|corano|\brom\b|nomad."
)

Headlines in the test set come from the online editions of:
- La Stampa
- La Repubblica
- Il Giornale
- Liberoquotidiano

## La Stampa

Extract all titles containing the keywords from the cronaca, politica and esteri sections of "La Stampa".

In [3]:
sections = ["cronaca/", "politica/", "esteri/"]
base_url = "https://www.lastampa.it/"
titles_lastampa = []

# Walk the paginated listing of each section and keep the headlines matching `keywords`.
for section in sections:
    # Placeholder upper bound; replaced by the real page count read from page 1.
    max_pages = 10000
    page = 0
    print("processing", section)
    while page < max_pages:
        page += 1
        page_url = base_url + section + f"{page}/"
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(page_url, timeout=30)
        if response.status_code != 200:
            print(page_url, response.status_code)
            if page == 1:
                # Without the first page the real page count is unknown; stop here
                # rather than probing up to 10000 URLs for this section.
                break
            continue
        # Name the parser explicitly so the parse tree does not depend on which
        # optional parsers happen to be installed.
        source = BeautifulSoup(response.text, "html.parser")
        if page == 1:
            counter = source.find(class_="pagination__counter")
            if counter is None:
                # No counter to read the page count from: process this page only.
                print(page_url, "pagination counter not found, stopping after this page")
                max_pages = page
            else:
                # The last whitespace-separated token of the counter is the page count.
                max_pages = int(counter.text.split()[-1])
        for t in source.find_all(class_="entry__title"):
            # Skip title elements that do not wrap a link instead of failing on them.
            if t.a is None:
                continue
            text = t.a.get_text(strip=True)
            if keywords.search(text.lower()):
                titles_lastampa.append(text)

processing cronaca/
processing politica/
processing esteri/


In [4]:
# Show every La Stampa headline that matched the keyword filter.
print(titles_lastampa)

['Migranti, meno arrivi nel 2024. La denuncia delle Ong: “Europa sempre meno accogliente”', 'Migranti, meno arrivi nel 2024. La denuncia delle Ong: “Europa sempre meno accogliente”', '“Migranti maltrattati e sedati con psicofarmaci”. Il rapporto choc del Consiglio d’Europa sui Cpr italiani', 'Rimini, smantellato traffico di migranti: 12 misure cautelari, coinvolti un dipendente Inps e un sindacalista', 'Il Csm accoglie le dimissioni della giudice Apostolico, aveva partecipato a manifestazioni in favore dei migranti', 'Spari dalla nave libica, decine di migranti si gettano in mare: il video dei soccorsi di Medici senza frontiere', 'Geo Barents attaccata dalla guardia libica: migranti in mare e famiglie divise', 'Valditara: "Il patriarcato è ideologia, l\'aumento delle violenze sessuali è legato all\'immigrazione irregolare"', 'Valditara: “Il patriarcato non esiste, abusi legati all’immigrazione irregolare”. Elena Cecchettin: “Giulia uccisa da un ragazzo bianco”', 'Paolo Pillitteri: “Le 

In [5]:
# Number of La Stampa headlines that matched the keyword filter.
print(len(titles_lastampa), "headlines from La Stampa")

103 headlines from La Stampa


## La Repubblica

In [6]:
sections = ["cronaca/", "politica/", "esteri/"]
base_url = "https://www.repubblica.it/"
titles_repubblica = []

# Walk the paginated listing of each section and keep the headlines matching `keywords`.
for section in sections:
    # Placeholder upper bound; replaced by the real page count read from page 1.
    max_pages = 10000
    page = 0
    print("processing", section)
    while page < max_pages:
        page += 1
        page_url = base_url + section + f"{page}/"
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(page_url, timeout=30)
        if response.status_code != 200:
            print(page_url, response.status_code)
            if page == 1:
                # Without the first page the real page count is unknown; stop here
                # rather than probing up to 10000 URLs for this section.
                break
            continue
        # Name the parser explicitly so the parse tree does not depend on which
        # optional parsers happen to be installed.
        source = BeautifulSoup(response.text, "html.parser")
        if page == 1:
            counter = source.find(class_="pagination__counter")
            if counter is None:
                # No counter to read the page count from: process this page only.
                print(page_url, "pagination counter not found, stopping after this page")
                max_pages = page
            else:
                # The last whitespace-separated token of the counter is the page count.
                max_pages = int(counter.text.split()[-1])
        for art in source.find_all("article", class_="type-articolo"):
            # Skip articles without an <h2><a> headline link instead of failing on them.
            link = art.h2.a if art.h2 is not None else None
            if link is None:
                continue
            text = link.get_text(strip=True)
            if keywords.search(text.lower()):
                titles_repubblica.append(text)

processing cronaca/
processing politica/
processing esteri/


In [7]:
# Number of La Repubblica headlines that matched the keyword filter.
len(titles_repubblica)

47

## Il Giornale

In [8]:
sections = ["interni.html", "cronache.html", "esteri.html"]
base_url = "https://www.ilgiornale.it/sezioni/"
titles_ilgiornale = []

# Walk the paginated listing of each section and keep the headlines matching `keywords`.
for section in sections:
    print("processing", section)
    page = 1
    # Hard cap: at most pages 1..399 are requested per section.
    while page < 400:
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(base_url + section, params={'page': page}, timeout=30)
        # Stop paginating this section as soon as a page is not served successfully
        if response.status_code != 200:
            break
        # Name the parser explicitly so the parse tree does not depend on which
        # optional parsers happen to be installed.
        source = BeautifulSoup(response.text, "html.parser")
        for t in source.find_all(class_="card__title"):
            text = t.get_text(strip=True)
            if keywords.search(text.lower()):
                titles_ilgiornale.append(text)
        page += 1

processing interni.html
processing cronache.html
processing esteri.html


In [9]:
# Number of Il Giornale headlines that matched the keyword filter.
len(titles_ilgiornale)

439

## Liberoquotidiano

In [10]:
# Every section ends with "/" because the page URL is built by plain concatenation:
# without it "terra-promessa" would produce ".../terra-promessapage/1/".
sections = ["politica/", "giustizia/", "italia/", "europa/", "esteri/", "piulibero/", "terra-promessa/"]
base_url = "https://www.liberoquotidiano.it/"
titles_libero = []

# Walk the paginated listing of each section and keep the headlines matching `keywords`.
for section in sections:
    page = 1
    print("processing", section)
    while True:
        page_url = base_url + section + f"page/{page}/"
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(page_url, timeout=30)
        # Stop paginating this section as soon as a page is not served successfully
        if response.status_code != 200:
            break
        # Name the parser explicitly so the parse tree does not depend on which
        # optional parsers happen to be installed.
        source = BeautifulSoup(response.text, "html.parser")
        articles = source.main.find_all('article') if source.main is not None else []
        # Treat a page without articles as the end of the section; otherwise a site
        # that answers 200 for every page number would keep this loop running forever.
        if not articles:
            break

        for art in articles:
            # Skip articles without a <header><h2> headline instead of failing on them.
            heading = art.header.h2 if art.header is not None else None
            if heading is None:
                continue
            text = heading.get_text(strip=True)
            if keywords.search(text.lower()):
                titles_libero.append(text)
        page += 1

processing politica/
processing giustizia/
processing italia/
processing europa/
processing esteri/
processing piulibero/
processing terra-promessa


In [11]:
# Number of Liberoquotidiano headlines that matched the keyword filter.
len(titles_libero)

68

In [12]:
from pathlib import Path

# Merge the freshly scraped headlines with those already stored on disk and
# rewrite the file with one unique headline per line.
data_dir = Path('../data')
headlines_path = data_dir / 'headlines.txt'

scraped_documents = titles_lastampa + titles_repubblica + titles_ilgiornale + titles_libero

# On the very first run there is no previous file to merge with.
old_documents = []
if headlines_path.exists():
    with open(headlines_path, 'r', encoding='utf-8') as infile:
        old_documents = [line.strip() for line in infile]

# Deduplicate, drop empty entries (e.g. blank lines in the old file) and sort so
# that the file content is the same from run to run for the same set of headlines.
documents = sorted({document for document in old_documents + scraped_documents if document})

with open(headlines_path, 'w', encoding='utf-8') as outfile:
    for document in documents:
        outfile.write(document + '\n')