In [1]:
# this code does not save the links in the correct order
"""
Alicja's and Cristian's code. The aim is to scrape data from a Mobility News website and store it in a Data Frame. 
After this, the next step would be to find an API to analyze the text data stored.

This code is faster. Retrieves 200 links in about 1 minute. It removes the link at the end of each article. 
Give it some time to begin printing.

It has no limit on the number of links. Adds data to the DataFrame after every 10 links.
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import re

#function to strip the trailing "Source: <link>" reference from article text
def clean_article_text(article_text):
    """Return *article_text* with every ``Source: https://...`` reference removed.

    Each occurrence of the literal ``Source: https://`` followed by a run of
    non-whitespace characters is deleted. Whitespace surrounding the removed
    reference is left untouched, and text without such a reference is
    returned unchanged.
    """
    #the link ends at the first whitespace character after "https://"
    source_reference = re.compile(r'Source: https://\S+')
    return source_reference.sub('', article_text)

base_url = "https://www.sharedmobility.news/category/mob/"
page_number = 1

#seconds to wait for a single HTTP response, so one stalled connection
#cannot hang the whole crawl
REQUEST_TIMEOUT = 30
#number of newly processed links between two partial CSV checkpoints
CHECKPOINT_EVERY = 10

#make lists for the data; they are index-aligned (row i of every list
#belongs to the same article) and are only appended to from the main thread
article_texts = []
article_urls = []
article_titles = []


#function to fetch and process articles
def process_article(link):
    """Download one article and return ``(title, cleaned_text, link)``.

    Returns ``None`` instead of a record when the request fails, the server
    does not answer with status 200, or the page lacks the expected
    ``h1.entry-title`` / ``div.entry-content`` elements.

    The function does not touch the shared result lists: it runs in worker
    threads, and letting each thread append to three separate lists would
    store rows in completion order and could misalign the columns. The
    caller collects the returned records in listing order instead.
    """
    try:
        article_response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"Failed to retrieve article {link}: {error}")
        return None

    if article_response.status_code != 200:
        print(f"Skipping article {link}. Status code: {article_response.status_code}")
        return None

    article_soup = BeautifulSoup(article_response.text, 'html.parser')

    # get title and text
    title_element = article_soup.find('h1', class_='entry-title')
    content_element = article_soup.find('div', class_='entry-content')
    if title_element is None or content_element is None:
        #not an article page in the expected layout
        print(f"Skipping article {link}. Title or content not found.")
        return None

    #clean article
    cleaned_article_text = clean_article_text(content_element.get_text())

    return title_element.get_text(), cleaned_article_text, link


#function to write everything collected so far to a CSV file
def _save_articles(csv_path):
    """Build a DataFrame from the collected lists, save it, and return it.

    The ID column is derived from the number of stored articles rather than
    from the number of links seen, so skipped articles cannot produce
    columns of unequal length.
    """
    frame = pd.DataFrame({
        "ID": range(1, len(article_urls) + 1),
        "Text": article_texts,
        "URL": article_urls,
        "Article Title": article_titles,
    })
    frame.to_csv(csv_path, index=False)
    return frame


#counter for processed links
link_counter = 0
#value of link_counter when the last partial CSV was written
links_at_last_checkpoint = 0

while True:  #collect articles (NO LIMIT)
    #construct the page URL
    url = f"{base_url}page/{page_number}/" if page_number > 1 else base_url

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"Failed to retrieve the page: {error}")
        break

    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        break

    #parse HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    #find all the links within the 'a' element within the main section;
    #a page without that section is treated like a page without links
    main_section = soup.find('main', class_='site-main rbc-content')
    link_elements = main_section.find_all('a', class_='p-url') if main_section is not None else []
    links = [element.get('href') for element in link_elements if element.get('href')]

    #if no links are found
    if not links:
        break

    #process articles: multithreading
    with concurrent.futures.ThreadPoolExecutor() as executor:
        #map() yields results in the order the links were submitted, so the
        #rows keep the order of the listing page even though the downloads
        #finish in arbitrary order
        for record in executor.map(process_article, links):
            if record is None:
                continue
            fetched_title, fetched_text, fetched_link = record
            article_titles.append(fetched_title)
            article_texts.append(fetched_text)
            article_urls.append(fetched_link)

    #update counter
    link_counter += len(links)

    #once at least CHECKPOINT_EVERY new links were processed, update DataFrame
    if link_counter - links_at_last_checkpoint >= CHECKPOINT_EVERY:
        df = _save_articles("shared_mobility_data_partial.csv")
        links_at_last_checkpoint = link_counter
        print(f"Data saved to shared_mobility_data_partial.csv (Links processed: {link_counter})")

    page_number += 1  #go to next page


#save as CSV file; rebuilt here so the final file also contains the articles
#collected after the last checkpoint (and exists even if none was written)
df = _save_articles("shared_mobility_data.csv")
print(f"Data saved to shared_mobility_data.csv (Links processed: {link_counter})")


Data saved to shared_mobility_data_partial.csv (Links processed: 10)
Data saved to shared_mobility_data_partial.csv (Links processed: 20)
Data saved to shared_mobility_data_partial.csv (Links processed: 30)
Data saved to shared_mobility_data_partial.csv (Links processed: 40)
Data saved to shared_mobility_data_partial.csv (Links processed: 50)
Data saved to shared_mobility_data_partial.csv (Links processed: 60)
Data saved to shared_mobility_data_partial.csv (Links processed: 70)
Data saved to shared_mobility_data_partial.csv (Links processed: 80)
Data saved to shared_mobility_data_partial.csv (Links processed: 90)
Data saved to shared_mobility_data_partial.csv (Links processed: 100)
Data saved to shared_mobility_data_partial.csv (Links processed: 110)
Data saved to shared_mobility_data_partial.csv (Links processed: 120)
Data saved to shared_mobility_data_partial.csv (Links processed: 130)
Data saved to shared_mobility_data_partial.csv (Links processed: 140)
Data saved to shared_mobility