In [15]:
# EUvsDISINFO Scraper
# Import necessary libraries
import pandas as pd
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async        # Import stealth_async for stealth mode in order to avoid detection

nest_asyncio.apply()  # Apply nest_asyncio to allow nested event loops

async def _inner_text_or_na(scope, selector):
    """Return the inner text of the first element matching `selector`, or "N/A" if none matches."""
    element = await scope.query_selector(selector)
    return await element.inner_text() if element else "N/A"


async def _split_inner_text(scope, selector):
    """Return the first match's inner text split on ", ", or [] if nothing matches."""
    element = await scope.query_selector(selector)
    if element is None:
        return []
    return (await element.inner_text()).split(", ")


async def _extract_report(article_page, article_url):
    """Read title, summary, response, details and tags from a loaded report page into a dict."""
    title = (await _inner_text_or_na(article_page, 'h1.b-report__title')).replace("DISINFO: ", "")
    summary = (await _inner_text_or_na(article_page, 'div.b-report__summary')).replace("SUMMARY\n\n", "")
    response = (await _inner_text_or_na(article_page, 'div.b-report__response')).replace("RESPONSE\n\n", "")

    outlet = []
    languages = []
    countries = []
    date = "N/A"

    # The details list has a variable number of entries; which entry holds which
    # field is decided purely from the entry count.
    detail_item = 'ul.b-report__details-list li:nth-child({}) {}'
    details = await article_page.query_selector_all('ul.b-report__details-list li')
    detail_count = len(details)

    if detail_count == 2:
        # Layout: date, countries
        date = await _inner_text_or_na(article_page, detail_item.format(1, 'span'))
        countries = await _split_inner_text(article_page, detail_item.format(2, 'span'))
    elif detail_count >= 3:
        # Layout: outlets, date, [languages], countries
        for outlet_link in await article_page.query_selector_all(detail_item.format(1, 'a')):
            outlet_text = await outlet_link.inner_text()
            # Links whose text contains "archived" are not kept as outlets.
            if outlet_text and "archived" not in outlet_text.strip():
                outlet.append(
                    outlet_text.strip()
                    .replace("\\n(opens in a new tab)", "")
                    .replace("\n(opens in a new tab)", "")
                )

        date = await _inner_text_or_na(article_page, detail_item.format(2, 'span'))

        if detail_count == 4:
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so repeated runs produce the same list.
            languages = list(dict.fromkeys(
                await _split_inner_text(article_page, detail_item.format(3, 'span'))
            ))
            countries = await _split_inner_text(article_page, detail_item.format(4, 'span'))
        elif detail_count == 3:
            countries = await _split_inner_text(article_page, detail_item.format(3, 'span'))

    tags = []
    for tag_span in await article_page.query_selector_all('div.b-report__keywords a span'):
        tag_text = await tag_span.inner_text()
        if tag_text:
            tags.append(tag_text.strip())

    return {
        'Title': title,
        'URL': article_url,
        'Summary': summary,
        'Response': response,
        'Outlet': outlet,
        'Date': date,
        'Languages': languages,
        'Countries': countries,
        'Tags': tags
    }


async def _scrape_article(browser, href, placeholder_timeout_ms):
    """Open one article in its own tab and return its record, or None if the page is skipped.

    The tab is always closed, including when navigation or extraction raises.
    """
    # Imported here so the helper is self-contained within this cell.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    article_url = f"https://euvsdisinfo.eu{href}"
    article_page = await browser.new_page()
    try:
        await stealth_async(article_page)  # Apply stealth mode to avoid detection
        await article_page.goto(article_url, timeout=120000, wait_until='domcontentloaded')
        await asyncio.sleep(1)  # Small delay for ethical scraping

        try:
            # If this heading shows up, the page is skipped.
            # NOTE(review): presumably 'h1.b-page__title' marks a generic/non-report page - confirm.
            await article_page.wait_for_selector('h1.b-page__title', timeout=placeholder_timeout_ms)
        except PlaywrightTimeoutError:
            # Heading did not appear within the timeout: treat the page as a report.
            return await _extract_report(article_page, article_url)
        return None
    finally:
        await article_page.close()


async def scrape_eu_vs_disinfo():
    """Scrape every listing page of the filtered EUvsDISINFO case database.

    Loads listing page 1, reads the page count from the last pagination link,
    then walks all listing pages and opens each article card in its own tab.

    Returns:
        list[dict]: one record per scraped article with the keys 'Title', 'URL',
        'Summary', 'Response', 'Outlet', 'Date', 'Languages', 'Countries', 'Tags'.
        If a listing-level error occurs, the records collected so far are returned.

    Errors are printed rather than raised. A failure on a single article is
    logged and that article is skipped, so it does not end the whole crawl.
    """
    listing_url = (
        "https://euvsdisinfo.eu/disinformation-cases/page/{}/"
        "?disinfo_countries%5B0%5D=country_77544&_=1750026467335"
    )
    pw = None
    browser = None
    results = []  # Records collected so far; returned even after an error
    try:
        pw = await async_playwright().start()
        # Non-headless because the website detects headless browsers
        browser = await pw.chromium.launch(headless=False)
        page = await browser.new_page()

        # Applied once to the listing page object.
        # NOTE(review): assumes playwright_stealth registers init scripts that
        # Playwright re-runs on each navigation - confirm for the installed version.
        await stealth_async(page)

        await page.goto(listing_url.format(1), timeout=120000, wait_until='domcontentloaded')
        await asyncio.sleep(1)  # Small delay for ethical scraping

        last_page_link = await page.query_selector('div.b-pagination a:last-child')
        if last_page_link is None:
            raise RuntimeError("pagination links not found on the first listing page")
        num_pages = await last_page_link.inner_text()
        print(f"Number of pages: {num_pages}")
        num_pages = int(num_pages)

        for page_num in range(1, num_pages + 1):
            if page_num > 1:  # Page 1 is already loaded
                print(f"Navigating to page {page_num}...")
                await page.goto(listing_url.format(page_num), timeout=120000, wait_until='domcontentloaded')
                print(f"Page {page_num} loaded.")
                await asyncio.sleep(1)  # Small delay for ethical scraping

            article_cards = await page.query_selector_all('a.b-archive__database-item')
            for card in article_cards:
                href = await card.get_attribute('href')
                if not href:
                    continue  # Card without a link: nothing to open
                try:
                    record = await _scrape_article(browser, href, 100)
                except Exception as article_error:
                    # Keep the crawl going; one broken article must not discard the rest.
                    print(f"Skipping article {href} on page {page_num}: {article_error}")
                    continue
                if record is not None:
                    results.append(record)
    except Exception as e:
        print(f"An error occurred during EUvsDISINFO search: {e}")
    finally:
        # Single place where browser and Playwright are shut down
        if browser:
            await browser.close()
            print("Browser closed.")
        if pw:
            await pw.stop()
            print("Playwright stopped.")

    return results

# Run the full crawl, then export the records (if any) to a single CSV.
df = asyncio.run(scrape_eu_vs_disinfo())
if not df:
    # The scraper returned an empty list
    print("\nNo results scraped from EUvsDISINFO.")
else:
    # Turn the list of record dicts into a table before reporting and saving
    df = pd.DataFrame(df)
    print(f"Scraped {len(df)} results from EUvsDISINFO.")
    df.to_csv('euvsdisinfo.csv', index=False)


Number of pages: 514
Navigating to page 455...
Page 455 loaded.
Browser closed.
Playwright stopped.
Scraped 18 results from EUvsDISINFO.


# Re-scrape selected listing pages and export each page to its own CSV

In [16]:
# EUvsDISINFO Scraper
# Import necessary libraries
import pandas as pd
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async        # Import stealth_async for stealth mode in order to avoid detection

nest_asyncio.apply()  # Apply nest_asyncio to allow nested event loops

async def _inner_text_or_na(scope, selector):
    """Return the inner text of the first element matching `selector`, or "N/A" if none matches."""
    element = await scope.query_selector(selector)
    return await element.inner_text() if element else "N/A"


async def _split_inner_text(scope, selector):
    """Return the first match's inner text split on ", ", or [] if nothing matches."""
    element = await scope.query_selector(selector)
    if element is None:
        return []
    return (await element.inner_text()).split(", ")


async def _extract_report(article_page, article_url):
    """Read title, summary, response, details and tags from a loaded report page into a dict."""
    title = (await _inner_text_or_na(article_page, 'h1.b-report__title')).replace("DISINFO: ", "")
    summary = (await _inner_text_or_na(article_page, 'div.b-report__summary')).replace("SUMMARY\n\n", "")
    response = (await _inner_text_or_na(article_page, 'div.b-report__response')).replace("RESPONSE\n\n", "")

    outlet = []
    languages = []
    countries = []
    date = "N/A"

    # The details list has a variable number of entries; which entry holds which
    # field is decided purely from the entry count.
    detail_item = 'ul.b-report__details-list li:nth-child({}) {}'
    details = await article_page.query_selector_all('ul.b-report__details-list li')
    detail_count = len(details)

    if detail_count == 2:
        # Layout: date, countries
        date = await _inner_text_or_na(article_page, detail_item.format(1, 'span'))
        countries = await _split_inner_text(article_page, detail_item.format(2, 'span'))
    elif detail_count >= 3:
        # Layout: outlets, date, [languages], countries
        for outlet_link in await article_page.query_selector_all(detail_item.format(1, 'a')):
            outlet_text = await outlet_link.inner_text()
            # Links whose text contains "archived" are not kept as outlets.
            if outlet_text and "archived" not in outlet_text.strip():
                outlet.append(
                    outlet_text.strip()
                    .replace("\\n(opens in a new tab)", "")
                    .replace("\n(opens in a new tab)", "")
                )

        date = await _inner_text_or_na(article_page, detail_item.format(2, 'span'))

        if detail_count == 4:
            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so repeated runs produce the same list.
            languages = list(dict.fromkeys(
                await _split_inner_text(article_page, detail_item.format(3, 'span'))
            ))
            countries = await _split_inner_text(article_page, detail_item.format(4, 'span'))
        elif detail_count == 3:
            countries = await _split_inner_text(article_page, detail_item.format(3, 'span'))

    tags = []
    for tag_span in await article_page.query_selector_all('div.b-report__keywords a span'):
        tag_text = await tag_span.inner_text()
        if tag_text:
            tags.append(tag_text.strip())

    return {
        'Title': title,
        'URL': article_url,
        'Summary': summary,
        'Response': response,
        'Outlet': outlet,
        'Date': date,
        'Languages': languages,
        'Countries': countries,
        'Tags': tags
    }


async def _scrape_article(browser, href, placeholder_timeout_ms):
    """Open one article in its own tab and return its record, or None if the page is skipped.

    The tab is always closed, including when navigation or extraction raises.
    """
    # Imported here so the helper is self-contained within this cell.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    article_url = f"https://euvsdisinfo.eu{href}"
    article_page = await browser.new_page()
    try:
        await stealth_async(article_page)  # Apply stealth mode to avoid detection
        await article_page.goto(article_url, timeout=120000, wait_until='domcontentloaded')
        await asyncio.sleep(1)  # Small delay for ethical scraping

        try:
            # If this heading shows up, the page is skipped.
            # NOTE(review): presumably 'h1.b-page__title' marks a generic/non-report page - confirm.
            await article_page.wait_for_selector('h1.b-page__title', timeout=placeholder_timeout_ms)
        except PlaywrightTimeoutError:
            # Heading did not appear within the timeout: treat the page as a report.
            return await _extract_report(article_page, article_url)
        return None
    finally:
        await article_page.close()


async def scrape_eu_vs_disinfo(page_num=1):
    """Scrape a single listing page of the filtered EUvsDISINFO case database.

    Launches a fresh Chromium instance, loads listing page `page_num`, and opens
    each article card on it in its own tab.

    Args:
        page_num: 1-based number of the listing page to scrape (default 1).

    Returns:
        list[dict]: one record per scraped article with the keys 'Title', 'URL',
        'Summary', 'Response', 'Outlet', 'Date', 'Languages', 'Countries', 'Tags'.
        If a listing-level error occurs, the records collected so far are returned.

    Errors are printed rather than raised. A failure on a single article is
    logged and that article is skipped, so the rest of the page is still scraped.
    """
    pw = None
    browser = None
    results = []  # Records collected so far; returned even after an error
    try:
        pw = await async_playwright().start()
        # Non-headless because the website detects headless browsers
        browser = await pw.chromium.launch(headless=False)
        page = await browser.new_page()

        await stealth_async(page)  # Apply stealth mode to avoid detection
        print(f"Navigating to page {page_num}...")
        url = f"https://euvsdisinfo.eu/disinformation-cases/page/{page_num}/?disinfo_countries%5B0%5D=country_77544&_=1750022717762"
        await page.goto(url, timeout=120000, wait_until='domcontentloaded')
        print(f"Page {page_num} loaded.")
        await asyncio.sleep(1)  # Small delay for ethical scraping

        article_cards = await page.query_selector_all('a.b-archive__database-item')
        for card in article_cards:
            href = await card.get_attribute('href')
            if not href:
                continue  # Card without a link: nothing to open
            try:
                record = await _scrape_article(browser, href, 50)
            except Exception as article_error:
                # Keep going; one broken article must not discard the rest of the page.
                print(f"Skipping article {href} on page {page_num}: {article_error}")
                continue
            if record is not None:
                results.append(record)
    except Exception as e:
        print(f"An error occurred during EUvsDISINFO search: {e}")
    finally:
        # Single place where browser and Playwright are shut down
        if browser:
            await browser.close()
            print("Browser closed.")
        if pw:
            await pw.stop()
            print("Playwright stopped.")

    return results

# Listing pages that had to be scraped again; each one is exported to its own CSV.
pages = [5, 6, 7, 10, 11, 17, 24, 25, 27, 57, 60, 65, 70, 78, 97, 211, 488]

for i in pages:
    # Each call launches its own browser and scrapes exactly one listing page
    df = asyncio.run(scrape_eu_vs_disinfo(i))
    if not df:
        # Nothing came back for this page, so no file is written
        print(f"\nNo results scraped from EUvsDISINFO for page {i}.")
        continue
    df = pd.DataFrame(df)
    df.to_csv(f'euvsdisinfo_results_page_{i}.csv', index=False)


Navigating to page 5...
Page 5 loaded.
Browser closed.
Playwright stopped.
Navigating to page 6...
Page 6 loaded.
Browser closed.
Playwright stopped.
Navigating to page 7...
Page 7 loaded.
Browser closed.
Playwright stopped.
Navigating to page 10...
Page 10 loaded.
Browser closed.
Playwright stopped.
Navigating to page 11...
Page 11 loaded.
Browser closed.
Playwright stopped.
Navigating to page 17...
Page 17 loaded.
Browser closed.
Playwright stopped.
Navigating to page 24...
Page 24 loaded.
Browser closed.
Playwright stopped.
Navigating to page 25...
Page 25 loaded.
Browser closed.
Playwright stopped.
Navigating to page 27...
Page 27 loaded.
Browser closed.
Playwright stopped.
Navigating to page 57...
Page 57 loaded.
Browser closed.
Playwright stopped.
Navigating to page 60...
Page 60 loaded.
Browser closed.
Playwright stopped.
Navigating to page 65...
Page 65 loaded.
Browser closed.
Playwright stopped.
Navigating to page 70...
Page 70 loaded.
Browser closed.
Playwright stopped.
Navig

In [8]:
import pandas as pd

# Combine the per-page CSV exports into one file.
NUM_PAGES = 514  # Number of listing pages to combine (files are numbered 1..NUM_PAGES)

page_frames = []    # One DataFrame per page file, concatenated once at the end
missing_pages = []  # Page numbers whose CSV file does not exist

for page_num in range(1, NUM_PAGES + 1):
    try:
        page_frames.append(pd.read_csv(f'euvsdisinfo_results_page_{page_num}.csv'))
    except FileNotFoundError:
        missing_pages.append(page_num)

# Fail before writing anything, and name every missing page at once instead of
# stopping at the first one, so all gaps can be re-scraped in a single pass.
if missing_pages:
    raise FileNotFoundError(f"Missing per-page CSV files for pages: {missing_pages}")

# A single concat avoids re-copying the accumulated rows on every iteration.
df = pd.concat(page_frames, ignore_index=True)

df.to_csv('euvsdisinfo_results.csv', index=False)  # Save the combined DataFrame to a CSV file