# Import libraries

In [1]:
import os
import pandas as pd
import requests
import string

from dotenv import load_dotenv
from io import BytesIO
from lxml import html
from PIL import Image
from tqdm import tqdm

# Define Dataframe Structure

In [2]:
columns = ['meme_id', 'image_path', 'month', 'year', 'history', 'event', 'meme', 'creator', 'tags', 'related']
memes_df = pd.DataFrame(columns=columns)

# Define Scrapping Functionality

In [3]:
def get_meme_id(url):
    return url.split('/')[-2]


def save_image(name, tree, headers):
    image_url = tree.xpath('//div[contains(@class, "mm-img-item")]//img/@src')[0]
    image_format = image_url.split("/")[-1].split(".")[-1]
    image_path = os.path.join('images', name + '.' + image_format)

    response = requests.get(image_url, headers=headers)
    image = Image.open(BytesIO(response.content))
    image.save(image_path)
    
    return image_path


def get_date(url):
    date = url.split('/')[-2].split('-')
    
    month, year = None, None
    if len(date) == 2:
        month, year = date
    elif len(date) == 1: # some of urls contain only month
        month = date[0]
    
    return month, year


def get_info(tree, field):
    if tree.xpath(f'//div[contains(@class, "mm-info-title") and contains(text(), "{field}")]/following-sibling::a[contains(@class, "btn")]'):
        return
    info = ' '.join(tree.xpath(f'//div[contains(@class, "mm-info-title") and contains(text(), "{field}")]/..//text()'))
    
    # info also contains field name, so we remove it
    if len(info.split(field)) > 1:
        return info.split(field)[1].strip()
    else:
        return

    
def get_related_memes(tree):
    return [get_meme_id(meme) for meme in tree.xpath('//div[contains(@class, "memes-list")]//a/@href')]

# Set Up Scrapping Configuration

In [4]:
website_url = "https://warmemes.com.ua/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}
response = requests.get(website_url, headers=headers)

tree = html.fromstring(response.content)

# Data Scrapping

In [5]:
months_urls = tree.xpath('//div[contains(@class, "home-main")]//ul[contains(@class, "monthes-list")]//a/@href')

for months_url in tqdm(months_urls):
    response = requests.get(months_url, headers=headers)
    tree = html.fromstring(response.content)
    meme_urls = tree.xpath('//div[contains(@class, "meme")]/a/@href')
    
    month, year = get_date(months_url)
    
    for url in meme_urls:
        response = requests.get(url, headers=headers)
        tree = html.fromstring(response.content)

        row = {}
        
        row['month'] = month
        row['year'] = year

        meme_id = get_meme_id(url)
        row['meme_id'] = meme_id

        image_path = save_image(meme_id, tree, headers)
        row['image_path'] = image_path

        related = get_related_memes(tree)
        row['related'] = related
        
        event = get_info(tree, 'Event:')
        row['event'] = event
        
        meme = get_info(tree, 'Meme:')
        row['meme'] = meme
        
        history = get_info(tree, "Meme’s history:")
        row['history'] = history
        
        creator = get_info(tree, 'Creator:')
        row['creator'] = creator
        
        tags = get_info(tree, 'Tags:')
        row['tags'] = tags

        row = pd.DataFrame([row])
        if memes_df.empty:
            memes_df = row
        else:
            memes_df = pd.concat([memes_df, row], ignore_index=True)

100%|███████████████████████████████████████████| 34/34 [04:03<00:00,  7.17s/it]


In [6]:
memes_df

Unnamed: 0,month,year,meme_id,image_path,related,event,meme,history,creator,tags
0,february,2022,war,images/war.jpg,"[february, war-and-peace, 100-days-of-war]",Putin declared war on Ukraine,War,,,"#2022 , #war"
1,february,2022,war-and-peace,images/war-and-peace.png,"[february, 100-days-of-war, war]",Russian authorities forbade to call Russian in...,War and peace,"Probably, the famous Leo Tolstoy's novel ""War ...",,"#war , #special military operation"
2,february,2022,scale-of-ukrainian-heroism,images/scale-of-ukrainian-heroism.jpg,"[every-morning, sleep-well-little-tank, day-16...","After several days, it became evident that Ukr...",Scale of Ukrainian heroism,This meme is comparing the size of two countri...,,"#russia , #resilience , #Ukraine"
3,february,2022,ghost-of-kyiv,images/ghost-of-kyiv.png,"[kyiv-is-angry, meteor-in-kyiv, attacks-on-kyi...",Russia was unable to establish air superiority...,Ghost of Kyiv,Ghost of Kyiv is a nickname given to a mythica...,,"#Kyiv , #aircraft , #airplanes , #Ghost of ..."
4,february,2022,sunflower-seeds,images/sunflower-seeds.webp,[],A Ukrainian woman told Russian soldier: ‘Put s...,Sunflower seeds,A viral video appeared showing a Ukrainian wom...,,"#sunflower , #seeds , #occupiers"
...,...,...,...,...,...,...,...,...,...,...
219,october,2024,nafo-2,images/nafo-2.jpg,"[fundraising-for-nuclear, dirty-bomb, nato-or-...",,NAFO,,,"#nuclear , #NAFO"
220,october,2024,nato-or-nuclear,images/nato-or-nuclear.jpg,"[finland-nato, proton, nafo-2, nato, dirty-bom...",Zelensky told NATO members that Ukraine must e...,NATO or nuclear,,,"#nuclear , #NATO"
221,october,2024,last-day,images/last-day.jpg,[],Ukraine adopted the law to cancel Daylight Sav...,Last day,"The text says ""The daylight savings switch wil...",,#daylight saving
222,november,2024,700000-2,images/700000-2.jpg,[],,700000,,,"#700000 , #dead russian"


# Data Cleaning

Let's search for duplicates within meme id

In [7]:
duplicates = memes_df.meme_id.duplicated(keep=False)
memes_df[duplicates]

Unnamed: 0,month,year,meme_id,image_path,related,event,meme,history,creator,tags
120,may,2023,prigozhin_and_budanov,images/prigozhin_and_budanov.jpeg,"[shark, popcorn, smiling, forrest-gump, breaki...",Yevgeny Prigozhin with his Wagner mercenaries ...,Prigozhin and Budanov,A meme shows an imaginary dialogue between Pri...,,"#Budanov , #Prigozhin , #Wagner"
135,june,2023,prigozhin_and_budanov,images/prigozhin_and_budanov.jpeg,"[ride-not-ammunition, breaking, forrest-gump, ...",Yevgeny Prigozhin with his Wagner mercenaries ...,Prigozhin and Budanov,A meme shows an imaginary dialogue between Pri...,,"#Budanov , #Prigozhin , #Wagner"


As we can see it is the same meme, present both in May and June. Let's leave only one.

In [8]:
memes_df = memes_df.drop_duplicates(subset='meme_id')

There are many missing fields, let's set it like an empty string.

In [9]:
memes_df = memes_df.fillna('')

# Data Preparation

For further comfortable investigation let's map tags to a list of words.

In [10]:
memes_df.tags = memes_df.tags.apply(lambda row: list(map(str.strip, row.replace('#', '').split(','))) if row else [])
memes_df

Unnamed: 0,month,year,meme_id,image_path,related,event,meme,history,creator,tags
0,february,2022,war,images/war.jpg,"[february, war-and-peace, 100-days-of-war]",Putin declared war on Ukraine,War,,,"[2022, war]"
1,february,2022,war-and-peace,images/war-and-peace.png,"[february, 100-days-of-war, war]",Russian authorities forbade to call Russian in...,War and peace,"Probably, the famous Leo Tolstoy's novel ""War ...",,"[war, special military operation]"
2,february,2022,scale-of-ukrainian-heroism,images/scale-of-ukrainian-heroism.jpg,"[every-morning, sleep-well-little-tank, day-16...","After several days, it became evident that Ukr...",Scale of Ukrainian heroism,This meme is comparing the size of two countri...,,"[russia, resilience, Ukraine]"
3,february,2022,ghost-of-kyiv,images/ghost-of-kyiv.png,"[kyiv-is-angry, meteor-in-kyiv, attacks-on-kyi...",Russia was unable to establish air superiority...,Ghost of Kyiv,Ghost of Kyiv is a nickname given to a mythica...,,"[Kyiv, aircraft, airplanes, Ghost of Kyiv]"
4,february,2022,sunflower-seeds,images/sunflower-seeds.webp,[],A Ukrainian woman told Russian soldier: ‘Put s...,Sunflower seeds,A viral video appeared showing a Ukrainian wom...,,"[sunflower, seeds, occupiers]"
...,...,...,...,...,...,...,...,...,...,...
219,october,2024,nafo-2,images/nafo-2.jpg,"[fundraising-for-nuclear, dirty-bomb, nato-or-...",,NAFO,,,"[nuclear, NAFO]"
220,october,2024,nato-or-nuclear,images/nato-or-nuclear.jpg,"[finland-nato, proton, nafo-2, nato, dirty-bom...",Zelensky told NATO members that Ukraine must e...,NATO or nuclear,,,"[nuclear, NATO]"
221,october,2024,last-day,images/last-day.jpg,[],Ukraine adopted the law to cancel Daylight Sav...,Last day,"The text says ""The daylight savings switch wil...",,[daylight saving]
222,november,2024,700000-2,images/700000-2.jpg,[],,700000,,,"[700000, dead russian]"


Let's save fields with lists as a string with `;` separator.

In [11]:
memes_df['related'] = memes_df.related.apply(lambda row: ';'.join(row))
memes_df['tags'] = memes_df.tags.apply(lambda row: ';'.join(row))
memes_df

Unnamed: 0,month,year,meme_id,image_path,related,event,meme,history,creator,tags
0,february,2022,war,images/war.jpg,february;war-and-peace;100-days-of-war,Putin declared war on Ukraine,War,,,2022;war
1,february,2022,war-and-peace,images/war-and-peace.png,february;100-days-of-war;war,Russian authorities forbade to call Russian in...,War and peace,"Probably, the famous Leo Tolstoy's novel ""War ...",,war;special military operation
2,february,2022,scale-of-ukrainian-heroism,images/scale-of-ukrainian-heroism.jpg,every-morning;sleep-well-little-tank;day-160;r...,"After several days, it became evident that Ukr...",Scale of Ukrainian heroism,This meme is comparing the size of two countri...,,russia;resilience;Ukraine
3,february,2022,ghost-of-kyiv,images/ghost-of-kyiv.png,kyiv-is-angry;meteor-in-kyiv;attacks-on-kyiv;a...,Russia was unable to establish air superiority...,Ghost of Kyiv,Ghost of Kyiv is a nickname given to a mythica...,,Kyiv;aircraft;airplanes;Ghost of Kyiv
4,february,2022,sunflower-seeds,images/sunflower-seeds.webp,,A Ukrainian woman told Russian soldier: ‘Put s...,Sunflower seeds,A viral video appeared showing a Ukrainian wom...,,sunflower;seeds;occupiers
...,...,...,...,...,...,...,...,...,...,...
219,october,2024,nafo-2,images/nafo-2.jpg,fundraising-for-nuclear;dirty-bomb;nato-or-nuc...,,NAFO,,,nuclear;NAFO
220,october,2024,nato-or-nuclear,images/nato-or-nuclear.jpg,finland-nato;proton;nafo-2;nato;dirty-bomb;fun...,Zelensky told NATO members that Ukraine must e...,NATO or nuclear,,,nuclear;NATO
221,october,2024,last-day,images/last-day.jpg,,Ukraine adopted the law to cancel Daylight Sav...,Last day,"The text says ""The daylight savings switch wil...",,daylight saving
222,november,2024,700000-2,images/700000-2.jpg,,,700000,,,700000;dead russian


# Data Translation

We are interested in Ukrainian representation, so let's translated field with text content to Ukrainian using Google Translation API.

`GOOGLE_API_KEY` is saved inside `.env`.

In [12]:
load_dotenv()

google_api_key = os.getenv("GOOGLE_API_KEY")
url = f"https://translation.googleapis.com/language/translate/v2"

def translate_text(text, api_key, target_lang="uk"):
    params = {
        "q": text,
        "target": target_lang,
        "key": api_key
    }

    response = requests.post(url, data=params)
    return response.json()["data"]["translations"][0]["translatedText"]

Target fields are `event`, `meme`, `history` and `tags`.

In [13]:
memes_df['event_ua'] = memes_df.event.apply(lambda row: translate_text(row, google_api_key) if row else '')
memes_df['meme_ua'] = memes_df.meme.apply(lambda row: translate_text(row, google_api_key) if row else '')
memes_df['history_ua'] = memes_df.history.apply(lambda row: translate_text(row, google_api_key) if row else '')
memes_df['tags_ua'] = memes_df.tags.apply(lambda row: ';'.join([translate_text(tag, google_api_key) for tag in row.split(';')]) if row else [])

In [19]:
memes_df

Unnamed: 0,month,year,meme_id,image_path,related,event,meme,history,creator,tags,event_ua,meme_ua,history_ua,tags_ua
0,february,2022,war,images/war.jpg,february;war-and-peace;100-days-of-war,Putin declared war on Ukraine,War,,,2022;war,Путін оголосив війну Україні,Війна,,2022 рік;війни
1,february,2022,war-and-peace,images/war-and-peace.png,february;100-days-of-war;war,Russian authorities forbade to call Russian in...,War and peace,"Probably, the famous Leo Tolstoy's novel ""War ...",,war;special military operation,Російська влада заборонила називати російське ...,Війна і мир,"Мабуть, відомий роман Льва Толстого «Війна і м...",війни;військова спецоперація
2,february,2022,scale-of-ukrainian-heroism,images/scale-of-ukrainian-heroism.jpg,every-morning;sleep-well-little-tank;day-160;r...,"After several days, it became evident that Ukr...",Scale of Ukrainian heroism,This meme is comparing the size of two countri...,,russia;resilience;Ukraine,"Через кілька днів стало зрозуміло, що Україна ...",Масштаби українського героїзму,Цей мем порівнює розміри двох країн: України т...,росія;стійкість;Україна
3,february,2022,ghost-of-kyiv,images/ghost-of-kyiv.png,kyiv-is-angry;meteor-in-kyiv;attacks-on-kyiv;a...,Russia was unable to establish air superiority...,Ghost of Kyiv,Ghost of Kyiv is a nickname given to a mythica...,,Kyiv;aircraft;airplanes;Ghost of Kyiv,Росія не змогла встановити перевагу в повітрі ...,Привид Києва,"Привид Києва — це прізвисько, яке дали міфічно...",Київ;літак;літаки;Привид Києва
4,february,2022,sunflower-seeds,images/sunflower-seeds.webp,,A Ukrainian woman told Russian soldier: ‘Put s...,Sunflower seeds,A viral video appeared showing a Ukrainian wom...,,sunflower;seeds;occupiers,Українка сказала російському солдату: «Поклади...,Насіння соняшнику,"З&#39;явилося вірусне відео, на якому українка...",соняшник;насіння;окупанти
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,october,2024,nafo-2,images/nafo-2.jpg,fundraising-for-nuclear;dirty-bomb;nato-or-nuc...,,NAFO,,,nuclear;NAFO,,НАФО,,ядерний;НАФО
220,october,2024,nato-or-nuclear,images/nato-or-nuclear.jpg,finland-nato;proton;nafo-2;nato;dirty-bomb;fun...,Zelensky told NATO members that Ukraine must e...,NATO or nuclear,,,nuclear;NATO,"Зеленський заявив членам НАТО, що Україна має ...",НАТО чи ядерна,,ядерний;НАТО
221,october,2024,last-day,images/last-day.jpg,,Ukraine adopted the law to cancel Daylight Sav...,Last day,"The text says ""The daylight savings switch wil...",,daylight saving,В Україні ухвалили закон про скасування перехо...,Останній день,У тексті сказано: «Перехід на літній та зимови...,літній час
222,november,2024,700000-2,images/700000-2.jpg,,,700000,,,700000;dead russian,,700000,,700000;мертвий росіянин


In [20]:
memes_df.to_csv("warmemes.csv", index=False)