In [12]:
# EUvsDISINFO Scraper
# Import necessary libraries
import pandas as pd
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async        # Import stealth_async for stealth mode in order to avoid detection

# Patch asyncio so that asyncio.run() can be called while an event loop is
# already running (presumably the notebook kernel's own loop -- confirm).
nest_asyncio.apply()  # Apply nest_asyncio to allow nested event loops

_EUVSDISINFO_BASE_URL = "https://euvsdisinfo.eu"
_EUVSDISINFO_LISTING_URL = (
    "https://euvsdisinfo.eu/disinformation-cases/page/{page_num}/"
    "?disinfo_countries%5B0%5D=country_77544&_=1750012768539"
)
_NAVIGATION_TIMEOUT_MS = 120000
_REQUEST_DELAY_S = 0.1  # Small delay for ethical scraping
_DETAILS_ITEM = 'ul.b-report__details-list li:nth-child({index}) {tag}'


async def _inner_text_or(page, selector, default="N/A"):
    """Return the inner text of the first match for *selector*, or *default* if nothing matches."""
    element = await page.query_selector(selector)
    return await element.inner_text() if element else default


def _split_comma_list(text):
    """Split a ', '-separated string into a list; empty text yields an empty list."""
    return text.split(", ") if text else []


async def _read_page_count(listing_page):
    """Return the number shown by the last pagination link, or 1 if it is missing or not a number."""
    last_link = await listing_page.query_selector('div.b-pagination a:last-child')
    if last_link is None:
        return 1
    label = (await last_link.inner_text()).strip()
    try:
        return int(label)
    except ValueError:
        print(f"Could not read the page count from pagination label {label!r}; assuming 1.")
        return 1


async def _scrape_article(browser, article_url):
    """Open *article_url* in its own tab and return the extracted fields as a dict.

    The tab is always closed, even when navigation or extraction raises.
    """
    article_page = await browser.new_page()
    try:
        await asyncio.sleep(_REQUEST_DELAY_S)
        await stealth_async(article_page)  # Apply stealth mode to avoid detection
        await article_page.goto(
            article_url, timeout=_NAVIGATION_TIMEOUT_MS, wait_until='domcontentloaded'
        )

        # Each text block carries a fixed label prefix that is stripped here.
        title = (await _inner_text_or(article_page, 'h1.b-report__title')).replace("DISINFO: ", "")
        summary = (await _inner_text_or(article_page, 'div.b-report__summary')).replace("SUMMARY\n\n", "")
        response = (await _inner_text_or(article_page, 'div.b-report__response')).replace("RESPONSE\n\n", "")

        outlets = []
        outlet_links = await article_page.query_selector_all(_DETAILS_ITEM.format(index=1, tag='a'))
        for outlet_link in outlet_links:
            outlet_text = (await outlet_link.inner_text() or "").strip()
            # NOTE(review): this check is case-sensitive, so a link labelled
            # "Archived" is kept -- confirm whether that is intended.
            if outlet_text and "archived" not in outlet_text:
                outlets.append(
                    outlet_text
                    .replace("\\n(opens in a new tab)", "")
                    .replace("\n(opens in a new tab)", "")
                )

        date = await _inner_text_or(article_page, _DETAILS_ITEM.format(index=2, tag='span'))

        languages_text = await _inner_text_or(
            article_page, _DETAILS_ITEM.format(index=3, tag='span'), default=""
        )
        # dict.fromkeys removes duplicates while keeping the order shown on the page.
        languages = list(dict.fromkeys(_split_comma_list(languages_text)))

        countries_text = await _inner_text_or(
            article_page, _DETAILS_ITEM.format(index=4, tag='span'), default=""
        )
        countries = _split_comma_list(countries_text)

        tags = []
        for tag_item in await article_page.query_selector_all('div.b-report__keywords a span'):
            tag_text = await tag_item.inner_text()
            if tag_text:
                tags.append(tag_text.strip())

        return {
            'Title': title,
            'URL': article_url,
            'Summary': summary,
            'Response': response,
            'Outlet': outlets,
            'Date': date,
            'Languages': languages,
            'Countries': countries,
            'Tags': tags,
        }
    finally:
        await article_page.close()  # Never leave the article tab open


async def scrape_eu_vs_disinfo(max_pages=1):
    """Scrape EUvsDISINFO disinformation cases from the filtered listing pages.

    Listing page 1 is always loaded so the pagination can be read. Every
    article card on each visited listing page is then opened in a separate
    tab and its fields are extracted.

    Args:
        max_pages: Maximum number of listing pages to scrape. The default of 1
            scrapes only the first page. Pass ``None`` to scrape every page
            reported by the site's pagination.

    Returns:
        A list of dicts with the keys 'Title', 'URL', 'Summary', 'Response',
        'Outlet', 'Date', 'Languages', 'Countries' and 'Tags'. If an error
        occurs, it is printed and the rows collected so far are returned
        (possibly an empty list).
    """
    pw = None
    browser = None
    results = []  # Rows collected so far; returned even if an error interrupts the run
    try:
        pw = await async_playwright().start()  # Start Playwright
        # Non-headless mode because the website detects headless browsers
        browser = await pw.chromium.launch(headless=False)
        page = await browser.new_page()

        await asyncio.sleep(_REQUEST_DELAY_S)
        # Applied once for this page. NOTE(review): stealth_async presumably
        # registers init scripts that persist across navigations -- confirm.
        await stealth_async(page)
        print("Navigating to page 1...")
        await page.goto(
            _EUVSDISINFO_LISTING_URL.format(page_num=1),
            timeout=_NAVIGATION_TIMEOUT_MS,
            wait_until='domcontentloaded',
        )
        print("Page 1 loaded.")

        num_pages = await _read_page_count(page)
        print(f"Number of pages: {num_pages}")
        last_page = num_pages if max_pages is None else min(max_pages, num_pages)

        for page_num in range(1, last_page + 1):  # Iterating through listing pages
            if page_num > 1:  # Page 1 is already loaded
                await asyncio.sleep(_REQUEST_DELAY_S)
                print(f"Navigating to page {page_num}...")
                await page.goto(
                    _EUVSDISINFO_LISTING_URL.format(page_num=page_num),
                    timeout=_NAVIGATION_TIMEOUT_MS,
                    wait_until='domcontentloaded',
                )
                print(f"Page {page_num} loaded.")

            article_cards = await page.query_selector_all('a.b-archive__database-item')
            for card in article_cards:
                href = await card.get_attribute('href')
                if not href:
                    continue  # A card without a link cannot be followed
                results.append(
                    await _scrape_article(browser, f"{_EUVSDISINFO_BASE_URL}{href}")
                )
    except Exception as e:
        # Top-level boundary: report the failure and fall through so that the
        # rows gathered before the error are still returned.
        print(f"An error occurred during EUvsDISINFO search: {e}")

    finally:
        if browser:  # Ensure the browser is closed
            await browser.close()
            print("Browser closed.")
        if pw:
            await pw.stop()  # Stop Playwright
            print("Playwright stopped.")

    return results  # Return the results list containing the scraped data

# Run the scraper to completion; the result is a list of row dicts.
df = asyncio.run(scrape_eu_vs_disinfo())
if not df:
    # Nothing was scraped: df stays bound to the empty list.
    print("\nNo results scraped from EUvsDISINFO.")
else:
    # Rebind df to a DataFrame, show it, and write it to a CSV file.
    df = pd.DataFrame(df)
    print("\n--- EUvsDISINFO Results ---")
    display(df)
    df.to_csv('euvsdisinfo_results.csv', index=False)


Navigating to page 1...
Page 1 loaded.
Number of pages: 514
Browser closed.
Playwright stopped.

--- EUvsDISINFO Results ---


Unnamed: 0,Title,URL,Summary,Response,Outlet,Date,Languages,Countries,Tags
0,Russia proves the great strength of its economy,https://euvsdisinfo.eu/report/russia-proves-th...,Russia proved the great strength of its econom...,The headline is deliberately misleading. This ...,[noticiaslatam.lat],"June 10, 2025",[Spanish],"[Russia, Iran, China]","[Russian superiority, Sanctions, economy]"
1,Russia hit only military infrastructure in Ukr...,https://euvsdisinfo.eu/report/russia-hit-only-...,The Russian military inflicted a mass strike o...,The claim is demonstrably false. While some of...,"[sputnikglobe.com, noticiaslatam.lat]","June 09, 2025","[English, Spanish]","[Russia, Ukraine]","[War crimes, Full-scale Invasion of Ukraine, R..."
2,The US Deep State feeds the Ukraine war,https://euvsdisinfo.eu/report/the-us-deep-stat...,The US Deep State feeds the Ukraine war. Blood...,Recurring pro-Kremlin disinformation narrative...,[RT English],"June 09, 2025",[English],"[US, Ukraine, Russia]","[Destabilising Russia, Russophobia, Full-scale..."
3,Ukrainian carries out state terrorism,https://euvsdisinfo.eu/report/ukrainian-carrie...,Western media are concealing from their audien...,Recurring pro-Kremlin disinformation narrative...,[Archived],"June 07, 2025",[German],"[Ukraine, Russia]","[Freedom of speech, Mainstream media, Media, A..."
4,Poles and Lithuanians are fleeing the Suwałki ...,https://euvsdisinfo.eu/report/poles-and-lithua...,Residents of Poland and Lithuania are fleeing ...,Recurring pro-Kremlin disinformation suggestin...,[poland.news-pravda.com],"June 07, 2025",[Polish],"[Poland, Russia, Lithuania, Belarus]","[Kaliningrad, Military, security threat, Anti-..."
5,Beastly Anglo-Saxons will have no choice but t...,https://euvsdisinfo.eu/report/beastly-anglo-sa...,The person at the top of the power pyramid of ...,This claim is a recurring pro-Kremlin disinfor...,"[palestine.shafaqna.com, saudi.shafaqna.com, a...","June 06, 2025",[Arabic],"[UK, US, Russia, Ukraine, China]","[Anglo-Saxon, NATO, Nuclear issues, War agains..."
6,The new Polish President is the candidate of t...,https://euvsdisinfo.eu/report/the-new-polish-p...,The new Polish President is the candidate of t...,Poland is repeatedly presented by pro-Kremlin ...,[poland.news-pravda.com],"June 06, 2025",[Polish],"[Poland, Russia, Ukraine]","[Elections, Anti-Russian, Russophobia, Stepan ..."
7,Foreign mercenaries arrive in Ukraine only to ...,https://euvsdisinfo.eu/report/foreign-mercenar...,The glorification of foreign mercenaries in Uk...,A common pro-Kremlin disinformation narrative ...,"[bgr.news-front.su, bgr.news-front.su]","June 05, 2025",[Bulgarian],"[Ukraine, Russia]","[NATO, Foreign mercenaries, War against Ukrain..."
8,Kyiv’s terror attack against airfields won’t p...,https://euvsdisinfo.eu/report/kyivs-terror-att...,"According to the New York Times, Kyiv’s terror...",This is a disingenuous distortion of the origi...,[esrt.press],"June 03, 2025",[Spanish],"[Ukraine, Russia, US]","[Russian superiority, Russian Ministry of Defe..."
9,Hitting military airfields in Russia is a Ukra...,https://euvsdisinfo.eu/report/hitting-military...,The terrorist attack on Russian airfields orga...,This disinformation story aims to promote a re...,[vesti.ru],"June 03, 2025",[Russian],"[Ukraine, Russia, US]","[War against Ukraine, Terrorism, Full-scale In..."


In [66]:
# Print the 'Summary' value of the row labelled 13 in full.
# NOTE(review): assumes df is the scraped DataFrame and has a row 13 -- confirm.
print(df['Summary'][13])

On March 30, 2022, the Russian army left Bucha, 30 km from Kiev. The following day, Ukrainian forces, including fighters from the Nationalist Azov Battalion, entered the town. It was not until the fifth day after the Russian troops had withdrawn that dozens of Western journalists entered the town. Images were quickly broadcast on TV, showing what would later be called the Bucha massacre. [...] we can see that they coordinated their work with Western supervisors, because everyone says “Oh, there are corpses everywhere, it's a massacre" [...].

One understands who was responsible for these crimes, it was the Ukrainian death squads. All Western media were brandishing satellite photos showing several bodies, saying “Look, corpses lying on the road.” But who knows where they came from? No proof, no investigation. Who's to blame? Russia, of course!

Joseph Goebbels once said, “A lie repeated a thousand times becomes the truth”. It's his most famous aphorism. In fact, this principle still wor