# Scraping articles from Open website

In [2]:
#I import the necessary packages to webscrape
import requests
from lxml.html import fromstring
import time
import pandas as pd

#I create a function to extract the information from the website
def fetch_article_html(url):
    response = requests.get(url)
    return response.text

#I create a function to extract the text of the article and to put together all of the paragraphs
def extract_article_text(html):
    tree = fromstring(html)
    article_text = tree.xpath("//div[3]/p/text() | //div[2]/p/a/text() | //div[2]/p/strong/text()")
    return ' '.join(article_text)

#I define the url of the website and loop over the 68 pages present in the section I am interested in
MAXPAGES = 68
baseurl = "https://www.open.online/temi/elezioni-politiche-2022/page/"
allurls = [f"{baseurl}{i}/" for i in range(1, MAXPAGES + 1)]

#I create a list in which to store the information
articles_data = []

#I loop over every url and check if the page exist. Then I extract the link and title of each article from the website and 
#then for every link (so for every aerticle) I retrieve the test of that article. I start from the third page because the 
#articles in the first page were written after November 2022, while in all of the other news outlets they were not written 
#after October 2022
for url in allurls[2:]:
    try:
        #I fetch the HTML source from the website
        response = requests.get(url)
        if response.status_code == 404:
            print("Page not found:", url)
            continue
        htmlsource = response.text

        tree = fromstring(htmlsource)

        #I find the links and titles
        article_links = tree.xpath("//article/div/h2/a/@href")
        article_title = tree.xpath("//article/div/h2/a/text()")

        #I extract the article texts
        for link, title in zip(article_links, article_title):
            try:
                #I fetch the HTML source of each article page
                article_html = fetch_article_html(link)

                #I extract the full article text
                article_text = extract_article_text(article_html)

                #I append the links, titles and text to the list created above
                articles_data.append({
                    'Link': link,
                    'Title': title,
                    'Text': article_text
                })

                #I delay of 1 second before the next request
                time.sleep(1)  

            except requests.exceptions.RequestException as e:
                print("Connection error:", e)
                continue

    except requests.exceptions.RequestException as e:
        print("Connection error:", e)
        continue

# I create a dataframe from the articles data dictionary
df = pd.DataFrame(articles_data)

# Irint the DataFrame
print(df)

                                                  Link  \
0    https://www.open.online/2022/09/29/elezioni-po...   
1    https://www.open.online/2022/09/29/elezioni-po...   
2    https://www.open.online/2022/09/29/commissione...   
3    https://www.open.online/2022/09/29/elezioni-po...   
4    https://www.open.online/2022/09/29/governo-mel...   
..                                                 ...   
787  https://www.open.online/2022/07/17/centrodestr...   
788  https://www.open.online/2022/07/17/crisi-di-go...   
789  https://www.open.online/2022/07/16/crisi-gover...   
790  https://www.open.online/2022/07/16/sondaggi-el...   
791  https://www.open.online/2022/07/15/crisi-di-go...   

                                                 Title  \
0    Usa, la Casa Bianca chiarisce parole di Biden ...   
1    Fiano, Prestigiacomo, Paragone, Sensi: chi las...   
2    Lo stop dell’Europa a Giorgia Meloni sul reddi...   
3    Stefano Bonaccini: «Io prossimo segretario Pd?...   
4    Viceprem

In [23]:
#I transform the dataframe into a csv file, I add a column and I save it 
articles_Open = pd.DataFrame.from_dict(articles_data)
articles_Open['News outlet'] = pd.Series('Open', index=articles_Open.index)
articles_Open.to_csv('articles_Open.csv', index=False)
#I check that everything went smoothly
articles_Open.head()

Unnamed: 0,Link,Title,Text,News outlet
0,https://www.open.online/2022/09/29/elezioni-po...,"Usa, la Casa Bianca chiarisce parole di Biden ...",Dopo le dichiarazioni poco chiare di sulle e...,Open
1,https://www.open.online/2022/09/29/elezioni-po...,"Fiano, Prestigiacomo, Paragone, Sensi: chi las...",C’è chi lo sapeva e chi è stato colto alla spr...,Open
2,https://www.open.online/2022/09/29/commissione...,Lo stop dell’Europa a Giorgia Meloni sul reddi...,Il non va abolito. Bisogna applicarlo meglio...,Open
3,https://www.open.online/2022/09/29/elezioni-po...,Stefano Bonaccini: «Io prossimo segretario Pd?...,« ». Ad affermarlo è il presidente dell’Emilia...,Open
4,https://www.open.online/2022/09/29/governo-mel...,"Vicepremier, agricoltura, interni, infrastrutt...",L’ tra Giorgia Meloni e Matteo Salvini è serv...,Open


I create a condition with a list of rows and then use it to extract those rows from the dataset and put into another dataset, 
which will serve for the labeled dataset. Then, I drop those columns from the original dataset, as this is the unlabeled dataset.

In [24]:
condition = list(range(1, 30)) + list(range(380, 454)) + list(range(760, 791))
articlesOpen_labeled = articles_Open.loc[condition].copy()
articlesOpen_labeled.head()
articlesOpen_labeled.to_csv('articlesOpen_labeled.csv', index=False)
articlesOpen_unlabeled = articles_Open.drop(list(range(1, 30)) + list(range(380, 454)) + list(range(760, 791)))
articlesOpen_unlabeled.to_csv('articlesOpen_unlabeled.csv', index=False)
#I restore the indexes of both dataframes
articlesOpen_labeled.reset_index(drop=True, inplace=True)
articlesOpen_unlabeled.reset_index(drop=True, inplace=True)

In [25]:
#I check that everything went smoothly
articlesOpen_unlabeled

Unnamed: 0,Link,Title,Text,News outlet
0,https://www.open.online/2022/09/29/elezioni-po...,"Usa, la Casa Bianca chiarisce parole di Biden ...",Dopo le dichiarazioni poco chiare di sulle e...,Open
1,https://www.open.online/2022/09/27/elezioni-po...,Guido Crosetto: «Con Meloni premier sarà il go...,«Se qualcuno pensa di fare il nuovo esecutivo ...,Open
2,https://www.open.online/2022/09/27/governo-mel...,Il totoministri del nuovo governo Meloni (ment...,Dopo la vittoria alle elezioni la si prepara...,Open
3,https://www.open.online/2022/09/26/elezioni-po...,"Usa, l’agenzia di rating S&P Global sulle elez...",«Il nuovo governo italiano si trova davanti a ...,Open
4,https://www.open.online/2022/09/26/elezioni-po...,"Elezioni politiche, i seggi stimati per Camera...",Il le elezioni e ora la politica si avvia ...,Open
...,...,...,...,...
653,https://www.open.online/2022/07/24/di-battista...,"«Nemici dei lavoratori», «Siete come Meloni»: ...",«Il 25 settembre la sfida sarà tra me e ». Le...,Open
654,https://www.open.online/2022/07/24/elodie-vs-p...,"Elodie, attacco social a Fratelli d’Italia: «H...",«A me sinceramente ». Così l’artista pop su...,Open
655,https://www.open.online/2022/07/24/m5s-conte-l...,Ora che è finita col Pd Conte pensa alle liste...,Le parole pronunciate dai due più diretti inte...,Open
656,https://www.open.online/2022/07/24/letta-rottu...,Letta contraccambia l’addio di Conte: «Con il ...,La con 5 stelle «in queste elezioni è ». Co...,Open


In [26]:
#I check that everything went smoothly
articlesOpen_labeled

Unnamed: 0,Link,Title,Text,News outlet
0,https://www.open.online/2022/09/29/elezioni-po...,"Fiano, Prestigiacomo, Paragone, Sensi: chi las...",C’è chi lo sapeva e chi è stato colto alla spr...,Open
1,https://www.open.online/2022/09/29/commissione...,Lo stop dell’Europa a Giorgia Meloni sul reddi...,Il non va abolito. Bisogna applicarlo meglio...,Open
2,https://www.open.online/2022/09/29/elezioni-po...,Stefano Bonaccini: «Io prossimo segretario Pd?...,« ». Ad affermarlo è il presidente dell’Emilia...,Open
3,https://www.open.online/2022/09/29/governo-mel...,"Vicepremier, agricoltura, interni, infrastrutt...",L’ tra Giorgia Meloni e Matteo Salvini è serv...,Open
4,https://www.open.online/2022/09/29/governo-mel...,Così il governo Meloni cambierà reddito di cit...,Il di mette nel mirino reddito di cittadin...,Open
...,...,...,...,...
129,https://www.open.online/2022/07/17/crisi-di-go...,"Lega, oggi altro vertice per decidere la linea...",Il segretario della Lega farà il punto con i...,Open
130,https://www.open.online/2022/07/17/centrodestr...,"Centrodestra diviso sul voto, Fi e Lega frenan...",Il scricchiola sotto il peso della crisi di ...,Open
131,https://www.open.online/2022/07/17/crisi-di-go...,Dopo l’irrigidimento di Conte la crisi è quasi...,I tessitori sono rimasti all’opera fino alle 2...,Open
132,https://www.open.online/2022/07/16/crisi-gover...,"Crisi di governo, Renzi: «O Draghi fa il bis a...",", a ruota libera all’assemblea di Italia Viva ...",Open
