In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

### Fetching the main fake-news topics from the Wikipedia article on COVID-19 misinformation

In [None]:
def scrape_wikipedia_sub_subheadings(article_title):
    """Scrape the heading hierarchy of an English Wikipedia article.

    Walks the page in document order and records one row for every ``h3``
    (subheading) and ``h4`` (sub-subheading), together with the internal
    article links (``/wiki/...`` hrefs without a ``:``) that appear after that
    heading and before the next heading.

    Parameters
    ----------
    article_title : str
        Article title; spaces are converted to underscores to build the URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Sub-subheading'``, ``'Subheading'``, ``'Main Heading'`` and
        ``'Links'`` (a list of hrefs per row). For an ``h3`` row the
        ``'Sub-subheading'`` column holds the subheading text itself. The frame
        is empty (same columns) when the page does not answer with HTTP 200.

    Raises
    ------
    requests.RequestException
        If the request fails at the network level or exceeds the timeout.

    Notes
    -----
    Links that appear directly under an ``h2`` (before its first ``h3``/``h4``)
    are not recorded, because there is no row to attach them to.
    """
    columns = ['Sub-subheading', 'Subheading', 'Main Heading', 'Links']

    # Replace spaces with underscores for Wikipedia URL format
    full_url = 'https://en.wikipedia.org/wiki/' + article_title.replace(' ', '_')

    # A timeout keeps a stalled connection from hanging the notebook; the
    # User-Agent matches the one used for the pageviews API in this notebook.
    response = requests.get(
        full_url,
        headers={"User-Agent": "WikiWackyNews"},
        timeout=30,
    )

    if response.status_code != 200:
        # Keep the schema stable so callers can rely on the column names.
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Rows are collected as plain dicts and turned into a DataFrame once at the
    # end, instead of growing the frame one `.loc` assignment at a time.
    records = []
    current_main_heading = None
    current_sub_heading = None
    # The 'Links' list of the most recent h3/h4 row. Appending to it directly
    # avoids matching rows by heading text, which would also hit every other
    # row that happens to share the same heading name.
    current_links = None

    for tag in soup.find_all(['h2', 'h3', 'h4', 'a']):
        if tag.name == 'a':
            link = tag.get('href')
            # Only internal article links: '/wiki/...' without a namespace ':'
            if (current_links is not None and link
                    and link.startswith('/wiki/') and ':' not in link):
                current_links.append(link)
            continue

        # Remove the '[edit]' part from the heading text
        text = tag.get_text().replace('[edit]', '').strip()

        if tag.name == 'h2':
            current_main_heading = text
            # A new main section starts: forget the previous subheading so its
            # row does not keep collecting links from this section.
            current_sub_heading = None
            current_links = None
            continue

        if tag.name == 'h3':
            current_sub_heading = text

        # h3 and h4 both open a new row; links that follow belong to it.
        current_links = []
        records.append({
            'Sub-subheading': text,
            'Subheading': current_sub_heading,
            'Main Heading': current_main_heading,
            'Links': current_links,
        })

    return pd.DataFrame(records, columns=columns)

# Example usage: scrape the heading hierarchy (and the article links collected
# under each heading) of the "COVID-19 misinformation" article.
# The result is an empty DataFrame if the page did not answer with HTTP 200.
article_title = "COVID-19 misinformation"
headings_df = scrape_wikipedia_sub_subheadings(article_title)
headings_df.head()  # Display the first few rows of the dataframe


### Checking whether the topics existed in 2020, for use with the mobility data

In [None]:
# csv.reader is lazy: it only pulls lines from the file while it is iterated.
# The rows are therefore materialised into a list inside the `with` block, so
# that `topics` is still usable after the file has been closed.
# newline='' is the mode the csv module documents for files passed to csv.reader.
with open('data/wiki_pageviews_covid-master/data/topics_linked.csv', 'r', newline='') as file:
    topics = list(csv.reader(file))
    

### Plotting the pageview counts of groups of articles

In [None]:
def fetch_pageview_count(language, articles, start="20180101", end="20230101",
                         granularity="monthly", access="all-access",
                         agent="user", timeout=30):
    """Fetch per-article pageview series from the Wikimedia REST API.

    Each article is requested independently. An article whose request fails,
    returns a non-200 status or yields no items is reported with ``print`` and
    skipped, so one bad title does not discard the data already collected.

    Parameters
    ----------
    language : str
        Wikipedia language code; the project queried is ``"<language>.wikipedia"``.
    articles : iterable of str
        Article titles. Spaces are sent as underscores.
    start, end : str, optional
        Date range inserted into the request URL (defaults: ``"20180101"`` to
        ``"20230101"``).
    granularity : str, optional
        Granularity segment of the request URL (default ``"monthly"``).
    access : str, optional
        Access segment of the request URL (default ``"all-access"``).
    agent : str, optional
        Agent segment of the request URL (default ``"user"``).
    timeout : float, optional
        Seconds to wait for each HTTP request (default 30).

    Returns
    -------
    dict
        Maps each successfully fetched article title (as passed in) to a
        DataFrame built from the response's ``"items"`` list, with the
        ``'timestamp'`` column parsed to datetimes.
    """
    api_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}"

    headers = {
        "User-Agent": "WikiWackyNews"
    }

    pageviews_data = {}

    for article in articles:
        url = api_url.format(
            project=f"{language}.wikipedia",   # Language-specific Wikipedia project
            access=access,
            agent=agent,
            # Same convention as the article URLs built elsewhere in this
            # notebook: spaces in titles become underscores.
            article=article.replace(' ', '_'),
            granularity=granularity,
            start=start,
            end=end,
        )

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Network failure or timeout: report it and move on so the frames
            # fetched so far are still returned.
            print(f"Error fetching data for {article}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Error fetching data for {article}. Status Code: {response.status_code}")
            continue

        item_list = response.json().get("items", [])
        if not item_list:
            print(f"No data available for {article}")
            continue

        df = pd.DataFrame(item_list)
        # Timestamps are parsed as year, month, day, hour (YYYYMMDDHH).
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H')
        pageviews_data[article] = df

    return pageviews_data

In [None]:
# Example usage: fetch the pageview series of two English Wikipedia articles.
# NOTE(review): "Covid-19" may be a redirect title rather than the canonical
# article name; presumably the API counts views per exact title — confirm this
# is the intended series.
language = "en"
articles_list = ["Covid-19", "Hydroxychloroquine"]
result = fetch_pageview_count(language, articles_list)

# result is a dictionary whose keys are the article names and whose values are
# DataFrames with the pageviews data; articles that could not be fetched are
# absent from it.
print(result)