In [30]:
"""
Newsapi.org
--> IF THEY MATCH OR OVERLAP, DROP DUPLICATE
        This code could also be used in general and for the news API. 
        It does not delete the link or source at the end of each mobility news article. 

Alicja's and Cristian's code. The aim is to scrape articles from a mobility news website and store them in a DataFrame.
The next step would be to find an API for analyzing the stored text data.

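This version is faster: it retrieves about 200 links in roughly 1 minute.
Note: this cell does not strip the source link at the end of each article (see the cell that defines clean_article_text for that).
Give it some time to begin printing.

It has no link limit. The DataFrame is rebuilt and saved to the CSV file after about every 10 links.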
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures

#crawl starting point: first listing page index and the category URL it belongs to
page_number = 1
base_url = "https://www.sharedmobility.news/category/mob/ride/"

#parallel result lists filled by process_article (one entry per stored article)
article_titles, article_urls, article_texts = [], [], []

#function to fetch and process articles
import threading  # needed only for the lock below; kept next to its single use

#one lock shared by every worker thread so the three result lists stay row-aligned
_article_lock = threading.Lock()

def process_article(link, timeout=30):
    """Download one article and append its title, text and URL to the result lists.

    The function is submitted to a ThreadPoolExecutor, so several calls run at
    the same time. The three module-level lists (article_titles, article_texts,
    article_urls) are therefore extended together under a lock; without it the
    appends of two threads can interleave and put a title next to another
    article's text.

    Parameters
    ----------
    link : str
        URL of the article page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    bool
        True when the article was stored, False when it was skipped
        (request error, non-200 status, or missing title/content element).
    """
    try:
        article_response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        #a single unreachable article must not take the whole crawl down
        print(f"Skipping {link}: request failed ({exc})")
        return False

    if article_response.status_code != 200:
        print(f"Skipping {link}: status code {article_response.status_code}")
        return False

    article_soup = BeautifulSoup(article_response.text, 'html.parser')

    # get title and text; find() returns None when the element is absent
    title_element = article_soup.find('h1', class_='entry-title')
    content_element = article_soup.find('div', class_='entry-content')
    if title_element is None or content_element is None:
        print(f"Skipping {link}: title or content element not found")
        return False

    article_title = title_element.get_text()
    article_text = content_element.get_text()

    #add to lists atomically so every index describes one and the same article
    with _article_lock:
        article_titles.append(article_title)
        article_texts.append(article_text)
        article_urls.append(link)
    return True

#counter for processed links
link_counter = 0
#value of link_counter when the CSV was last written
last_saved_count = 0


def _save_articles(processed_count):
    """Build the DataFrame from the result lists, write it to CSV and return it.

    The ID column is sized from the lists themselves rather than from the link
    counter: an article that could not be stored leaves the lists shorter than
    the number of links seen, and pandas raises ValueError when the columns of
    a DataFrame differ in length.
    """
    row_count = min(len(article_texts), len(article_urls), len(article_titles))
    frame = pd.DataFrame({
        "ID": range(1, row_count + 1),
        "Text": article_texts[:row_count],
        "URL": article_urls[:row_count],
        "Article Title": article_titles[:row_count]
    })
    frame.to_csv("shared_mobility_data.csv", index=False)
    print(f"Data saved to shared_mobility_data.csv (Links processed: {processed_count})")
    return frame


while True:  #collect articles (NO LIMIT)
    #construct the page URL
    url = f"{base_url}page/{page_number}/" if page_number > 1 else base_url

    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print("Failed to retrieve the page:", exc)
        break

    #a non-200 answer (e.g. a 404 past the last listing page) ends the crawl
    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        break

    #parse HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    #find all the links within the 'a' element within the main section
    main_section = soup.find('main', class_='site-main rbc-content')
    link_elements = main_section.find_all('a', class_='p-url') if main_section is not None else []
    #ignore anchors that carry no href, they cannot be requested
    links = [element.get('href') for element in link_elements if element.get('href')]

    #if no links are found, stop before doing any further work
    if not links:
        break

    #process articles: multithreading (the with-block waits for all workers)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_article, link) for link in links]

    #surface worker errors instead of losing them inside the futures
    for link, future in zip(links, futures):
        error = future.exception()
        if error is not None:
            print(f"Failed to process {link}: {error!r}")

    #update counter
    link_counter += len(links)

    #checkpoint once at least 10 new links were handled since the last save
    if link_counter - last_saved_count >= 10:
        df = _save_articles(link_counter)
        last_saved_count = link_counter

    page_number += 1  #go to next page

#create final DataFrame with all data and save as CSV file
df = _save_articles(link_counter)


Links processed: 10
Data saved to shared_mobility_data.csv (Links processed: 10)
Links processed: 20
Data saved to shared_mobility_data.csv (Links processed: 20)
Links processed: 30
Data saved to shared_mobility_data.csv (Links processed: 30)
Links processed: 40
Data saved to shared_mobility_data.csv (Links processed: 40)
Links processed: 50
Data saved to shared_mobility_data.csv (Links processed: 50)
Links processed: 60
Data saved to shared_mobility_data.csv (Links processed: 60)
Links processed: 70
Data saved to shared_mobility_data.csv (Links processed: 70)
Links processed: 80
Data saved to shared_mobility_data.csv (Links processed: 80)
Links processed: 90
Data saved to shared_mobility_data.csv (Links processed: 90)
Links processed: 100
Data saved to shared_mobility_data.csv (Links processed: 100)
Links processed: 110
Data saved to shared_mobility_data.csv (Links processed: 110)
Links processed: 120
Data saved to shared_mobility_data.csv (Links processed: 120)
Links processed: 130
D

In [22]:
"""
Removes the 'Source: https://...' link at the bottom of each article. However, it cannot remove both links.


Cannot be used for News API. 
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import re  # Import the 're' module

# Define a function to clean the article text
def clean_article_text(article_text):
    """Return article_text with every 'Source: <url>' reference removed.

    Both http:// and https:// URLs are matched, and the whitespace between
    'Source:' and the URL is optional. The URL is taken to end at the next
    whitespace character. Text around the match, including the whitespace
    that preceded 'Source:', is left untouched.

    Parameters
    ----------
    article_text : str
        Raw text extracted from the article body.

    Returns
    -------
    str
        The text without the matched source references.
    """
    # Pattern for the source label followed by its link
    pattern = r'Source:\s*https?://\S+'

    # Use re.sub to remove the matched pattern from the text
    return re.sub(pattern, '', article_text)

# Crawl starting point: first listing page index and the category URL it belongs to
page_number = 1
base_url = "https://www.sharedmobility.news/category/mob/ride/"

# Upper bound on the number of links to collect
link_limit = 200

# Parallel result lists filled by process_article (one entry per stored article)
article_titles, article_urls, article_texts = [], [], []

# Define a function to fetch and process an article
import threading  # needed only for the lock below; kept next to its single use

# One lock shared by every worker thread so the three result lists stay row-aligned
_article_lock = threading.Lock()

def process_article(link, timeout=30):
    """Download one article, clean its text and append it to the result lists.

    The function is submitted to a ThreadPoolExecutor, so several calls run at
    the same time. The three module-level lists (article_titles, article_texts,
    article_urls) are therefore extended together under a lock; without it the
    appends of two threads can interleave and put a title next to another
    article's text.

    Parameters
    ----------
    link : str
        URL of the article page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    bool
        True when the article was stored, False when it was skipped
        (request error, non-200 status, or missing title/content element).
    """
    try:
        article_response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable article must not take the whole crawl down
        print(f"Skipping {link}: request failed ({exc})")
        return False

    if article_response.status_code != 200:
        print(f"Skipping {link}: status code {article_response.status_code}")
        return False

    article_soup = BeautifulSoup(article_response.text, 'html.parser')

    # Extract the article title and text; find() returns None when the element is absent
    title_element = article_soup.find('h1', class_='entry-title')
    content_element = article_soup.find('div', class_='entry-content')
    if title_element is None or content_element is None:
        print(f"Skipping {link}: title or content element not found")
        return False

    article_title = title_element.get_text()

    # Clean the article text using the clean_article_text function
    cleaned_article_text = clean_article_text(content_element.get_text())

    # Add data to lists atomically so every index describes one and the same article
    with _article_lock:
        article_titles.append(article_title)
        article_texts.append(cleaned_article_text)
        article_urls.append(link)
    return True

# Initialize a counter for processed links
link_counter = 0

# While loop to fetch articles until the link limit is reached
while link_counter < link_limit:
    # Construct the URL for the current page
    url = f"{base_url}page/{page_number}/" if page_number > 1 else base_url

    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print("Failed to retrieve the page:", exc)
        break

    # A non-200 answer (e.g. a 404 past the last listing page) ends the crawl
    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        break

    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Finding all the links within the 'a' element within the main section
    main_section = soup.find('main', class_='site-main rbc-content')
    link_elements = main_section.find_all('a', class_='p-url') if main_section is not None else []
    # Ignore anchors that carry no href, they cannot be requested
    links = [element.get('href') for element in link_elements if element.get('href')]

    # A page without links would never advance the counter, so stop here
    if not links:
        break

    # Never submit more links than the limit still allows
    links = links[:link_limit - link_counter]

    # Process the articles using multithreading (the with-block waits for all workers)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_article, link) for link in links]

    # Surface worker errors instead of losing them inside the futures
    for link, future in zip(links, futures):
        error = future.exception()
        if error is not None:
            print(f"Failed to process {link}: {error!r}")

    # Update the link counter
    link_counter += len(links)

    page_number += 1  # Move to the next page

# Create a DataFrame from the extracted data.
# The ID column is sized from the lists themselves: skipped articles leave them
# shorter than link_counter, and pandas raises ValueError on unequal column lengths.
row_count = min(len(article_texts), len(article_urls), len(article_titles))
data = {
    "ID": range(1, row_count + 1),
    "Text": article_texts[:row_count],
    "URL": article_urls[:row_count],
    "Article Title": article_titles[:row_count]
}

df = pd.DataFrame(data)

# Save the data to a CSV file
df.to_csv("shared_mobility_data.csv", index=False)
print("Data saved to shared_mobility_data.csv")

Failed to retrieve the page. Status code: 404
Data saved to shared_mobility_data.csv
