# Scraping equipment losses with sources

The original source of this aggregated data is from this OSINT project: **Attack On Europe: Documenting Russian Equipment Losses During The Russian Invasion Of Ukraine**. Thw website is:https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html

**I have preferred to scrape this data and rearrange it in a manner that fits my needs.**

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import urllib.request
import re
import numpy as np
import pandas as pd

In [2]:
page = urllib.request.urlopen("https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html")
soup = bs(page)

In [3]:
h3s = soup.find_all('h3')

for h3 in h3s[3::]:
    print(h3.text)

Tanks (2146, of which destroyed: 1363, damaged: 120, abandoned: 115, captured: 548)
Armoured Fighting Vehicles (925, of which destroyed: 598, damaged: 25, abandoned: 35, captured: 267)
Infantry Fighting Vehicles (2547, of which destroyed: 1712, damaged: 97, abandoned: 134, captured: 604)
Armoured Personnel Carriers (331, of which destroyed: 219, damaged: 9, abandoned: 14, captured: 89)
Mine-Resistant Ambush Protected (MRAP) Vehicles (46, of which destroyed: 32, damaged: 4, abandoned: 1, captured: 9)
Infantry Mobility Vehicles (197, of which destroyed: 138, damaged: 6, abandoned: 2, captured: 51)
Command Posts And Communications Stations (244, of which destroyed: 157, damaged: 1, abandoned: 3, captured: 83)
Engineering Vehicles And Equipment (322, of which destroyed: 151, damaged: 8, abandoned: 38, captured: 125)
Self-Propelled Anti-Tank Missile Systems (40, of which destroyed: 16, damaged: 1, abandoned: 4, captured: 19)
Artillery Support Vehicles And Equipment (103, of which destroyed:

**I have to access and scrape each category separately.**

In [4]:
ul = soup.find_all("ul") # all categories

## Tanks

In [5]:
tanks_ = ul[2]

for i in tanks_:
    tank_model = tanks_.find_all('li')
    
model = []
photo_links = []
state = []

for i in range(len(tank_model)):
    links = tank_model[i].find_all('a')
    
    for link in links:
        model_num = tank_model[i].text[1::].split(":")[0]
        model_num = model_num.replace('\xa0', ' ')
        href = link.get('href')
        text = link.get_text(strip=True)[3:]
        
        if href:
            photo_links.append(href)
            state.append(text)
            model.append(model_num) 
            
tanks = pd.DataFrame({'model' : model, 'state': state, 'photo_links': photo_links})

## Automating the process for the remaining 23 categories

In [6]:
category_data = {}

# names of objects (and dataframes)
bs4_objects = {
    "armoured": 3,
    "IFV": 4,
    "APC": 5,
    "MRAP": 6,
    "IMV": 7,
    "cmd_comms": 8,
    "engineering": 9,
    "ATGM": 10,
    "arty_support": 11,
    "arty_towed": 12,
    "arty_SP": 13,
    "MLRS": 14,
    "aa_guns": 15,
    "aa_gunsSP": 16,
    "sam": 17,
    "radars": 18,
    "jammers": 19,
    "aircrafts": 20,
    "helicopters": 21,
    "UAV": 22,
    "rUAV": 23,
    "navy": 24,
    "auto": 25
}

# loop and extract
for category, ul_index in bs4_objects.items():
    model = []
    photo_links = []
    state = []

    ul_element = ul[ul_index]
    cats = ul_element.find_all('li')

    for cat in cats:
        links = cat.find_all('a')

        for link in links:
            model_num = cat.text[1:].split(":")[0].replace('\xa0', ' ')
            href = link.get('href')
            text = link.get_text(strip=True)[3:]

            if href:
                photo_links.append(href)
                state.append(text)
                model.append(model_num)

    # genetaring a dataframe for each category + storing in dict
    category_df = pd.DataFrame({'model': model, 'state': state, 'photo_links': photo_links})
    category_data[category] = category_df

In [7]:
category_data['armoured']

Unnamed: 0,model,state,photo_links
0,1 BMPT Terminator,destroyed),https://i.postimg.cc/vZznvbJ8/1000-bmpt-termin...
1,25 BRM-1(K) reconnaissance vehicle,destroyed),https://postimg.cc/bGSbfRQm
2,25 BRM-1(K) reconnaissance vehicle,destroyed),https://i.postimg.cc/gj3KbRGG/fsl.png
3,25 BRM-1(K) reconnaissance vehicle,destroyed),https://i.postimg.cc/Ss7ZV5zW/d13p.png
4,25 BRM-1(K) reconnaissance vehicle,destroyed),https://i.postimg.cc/QVLrdHRK/6880.png
...,...,...,...
862,200 Unknown AFV,", damaged)",https://i.postimg.cc/DyVtZr6P/1036-unkn-afv-da...
863,200 Unknown AFV,abandoned),https://i.postimg.cc/RhRQ6vSr/dd2.jpg
864,200 Unknown AFV,abandoned),https://i.postimg.cc/c49WrLL8/1016-unkn-afv-ab...
865,200 Unknown AFV,captured),https://i.postimg.cc/pXJJrNhR/bj.png


# Data transformation & quality assessment

In [8]:
armoured = category_data['armoured']
IFV = category_data['IFV']
APC= category_data['APC']
MRAP= category_data['MRAP']
IMV= category_data['IMV']
cmd_comms= category_data['cmd_comms']
engineering= category_data['engineering']
ATGM= category_data['ATGM']
arty_support= category_data['arty_support']
arty_towed= category_data['arty_towed']
arty_SP= category_data['arty_SP']
MLRS= category_data['MLRS']
aa_guns= category_data['aa_guns']
aa_gunsSP= category_data['aa_gunsSP']
sam= category_data['sam']
radars= category_data['radars']
jammers= category_data['jammers']
aircrafts= category_data['aircrafts']
helicopters= category_data['helicopters']
UAV= category_data['UAV']
rUAV= category_data['rUAV']
navy= category_data['navy']
auto= category_data['auto']

## Functions for preprocessing the data

In [9]:
# remove number from the begining of the string

def func1(record):
    result = re.sub(r'^\d+', '', record)
    return result


# remove ")" and keep the last word

def func2(record):
    result1 = re.sub(r'\)', '', record) # remove ")"
    result2 = re.findall(r'\b\w+\b$', result1)[0] # keep just the last word
    return result2


# when counting the values of state, there is an instance "abanoned" instead of "abandoned"

def replace_state(state):
    if state == 'abanonded':
        return 'abandoned'
    else:
        return state

### tanks preprocessing

In [10]:
# Unknown T-54/55 (1, destroyed) <- removing this (1, destroyed)

def replace_tmodel(model):
    if model == ' Unknown T-54/55 (1, destroyed)':
        return 'Unknown T-54/55'
    else:
        return model
    
# replace "Obr." with "Mk" from "modification"....in the context of an equipment model, "Obr. ####" stands 
# for "обр. 2016" in Russian, which translates to "modification 2016" in English

def replace_mk(record):
    result = re.sub(r'Obr\.', 'Mk.', record)
    return result

### towed artilert preprocessing

In [11]:
# remove ")" and keep the last word

def func_ta(record):
    result1 = re.sub(r'\)', '', record) # remove ")"
    #result2 = re.findall(r'\b\w+\b$', result1)[0] # keep just the last word
    words = result1.split()  # Split the string into a list of words
    if words:
        return words[-1]  # Return the last word
    else:
        return None 
    return words[-1]

## Applying Functions

In [12]:
tanks['model'] = tanks['model'].apply(func1)
tanks['state'] = tanks['state'].apply(func2)
tanks['state'] = tanks['state'].apply(replace_state)
tanks['model'] = tanks['model'].apply(replace_tmodel)
tanks['model'] = tanks['model'].apply(replace_mk)

In [13]:
armoured['model'] = armoured['model'].apply(func1)
armoured['state'] = armoured['state'].apply(func2)

In [14]:
IFV['model'] = IFV['model'].apply(func1)
IFV['state'] = IFV['state'].apply(func2)

In [15]:
APC['model'] = APC['model'].apply(func1)
APC['state'] = APC['state'].apply(func2)

In [16]:
MRAP['model'] = MRAP['model'].apply(func1)
MRAP['state'] = MRAP['state'].apply(func2)

In [17]:
IMV['model'] = IMV['model'].apply(func1)
IMV['state'] = IMV['state'].apply(func2)

In [18]:
cmd_comms['model'] = cmd_comms['model'].apply(func1)
cmd_comms['state'] = cmd_comms['state'].apply(func2)

In [19]:
engineering['model'] = engineering['model'].apply(func1)
engineering['state'] = engineering['state'].apply(func2)

In [20]:
ATGM['model'] = ATGM['model'].apply(func1)
ATGM['state'] = ATGM['state'].apply(func2)

In [21]:
arty_support['model'] = arty_support['model'].apply(func1)
arty_support['state'] = arty_support['state'].apply(func2)

In [22]:
arty_towed['model'] = arty_towed['model'].apply(func1)
arty_towed['state'] = arty_towed['state'].apply(func_ta) # custom
arty_towed['state'] = arty_towed['state'].apply(lambda state: 'damaged' if state == 'aged' else state) # custom preprocessing

In [23]:
arty_SP['model'] = arty_SP['model'].apply(func1)
arty_SP['state'] = arty_SP['state'].apply(func2)

In [24]:
MLRS['model'] = MLRS['model'].apply(func1)
MLRS['state'] = MLRS['state'].apply(func2)

In [25]:
aa_guns['model'] = aa_guns['model'].apply(func1)
aa_guns['state'] = aa_guns['state'].apply(func2)

In [26]:
aa_gunsSP['model'] = aa_gunsSP['model'].apply(func1)
aa_gunsSP['state'] = aa_gunsSP['state'].apply(func2)

In [27]:
sam['model'] = sam['model'].apply(func1)
sam['state'] = sam['state'].apply(func2)

In [28]:
radars['model'] = radars['model'].apply(func1)
radars['state'] = radars['state'].apply(func2)

In [29]:
jammers['model'] = jammers['model'].apply(func1)
jammers['state'] = jammers['state'].apply(func2)

In [30]:
aircrafts['model'] = aircrafts['model'].apply(func1)
aircrafts['state'] = aircrafts['state'].apply(func2)

In [31]:
helicopters['model'] = helicopters['model'].apply(func1)
helicopters['state'] = helicopters['state'].apply(func2)

In [32]:
UAV['model'] = UAV['model'].apply(func1)
UAV['state'] = UAV['state'].apply(func2)

In [33]:
rUAV['model'] = rUAV['model'].apply(func1)
rUAV['state'] = rUAV['state'].apply(func2)

In [34]:
navy['model'] = navy['model'].apply(func1)
navy['state'] = navy['state'].apply(func2)

In [35]:
auto['model'] = auto['model'].apply(func1)
auto['state'] = auto['state'].apply(func2)

## Reshaping

I want to aggregate the data in another manner. For example, these categories ('armoured', 'IFV', 'APC', 'MRAP', 'IMV') will become "Infantry vehicles." Artillery support, self-propelled artillery, and towed artillery will become "artillery," and so on.

### Infantry vehicles

In [37]:
# categories
category_names = ['armoured', 'IFV', 'APC', 'MRAP', 'IMV']

# creating a dictionary to hold dataframes with category names as keys
category_dataframes = dict(zip(category_names, [armoured, IFV, APC, MRAP, IMV])) 

# adding 'category' column to each df and concatenate
infantry_vehicles = pd.concat([df.assign(category=category) for category, df in category_dataframes.items()], ignore_index=True)

In [38]:
infantry_vehicles['category'].value_counts()

IFV         2361
armoured     867
APC          321
IMV          190
MRAP          46
Name: category, dtype: int64

### Artillery

In [39]:
category_names = ['towed', 'support', 'self propelled']

category_dataframes = dict(zip(category_names, [arty_support, arty_towed, arty_SP]))

artillery = pd.concat([df.assign(category=category) for category, df in category_dataframes.items()], ignore_index=True)

In [40]:
artillery

Unnamed: 0,model,state,photo_links,category
0,1V110 BM-21 Grad battery command vehicle,destroyed,https://i.postimg.cc/VLV92mk5/433.png,towed
1,1V13(M) battery fire control center,destroyed,https://i.postimg.cc/BnQf3bzx/d134.jpg,towed
2,1V13(M) battery fire control center,destroyed,https://i.postimg.cc/c4K0H35q/3211.png,towed
3,1V13(M) battery fire control center,destroyed,https://i.postimg.cc/132LjY94/3213.png,towed
4,1V13(M) battery fire control center,destroyed,https://i.postimg.cc/CxYFzQpq/2022-03-30-1v13.jpg,towed
...,...,...,...,...
745,Unknown SPG,destroyed,https://i.postimg.cc/tT7DqT7g/1002-unkn-spg.jpg,self propelled
746,Unknown SPG,destroyed,https://i.postimg.cc/rsqnYfG3/1066-2x-unkn-spg...,self propelled
747,Unknown SPG,destroyed,https://i.postimg.cc/C55Z6LTd/1013-unkn-spg-de...,self propelled
748,Unknown SPG,damaged,https://twitter.com/UAWeapons/status/150701229...,self propelled


### Anti Aircraft

In [41]:
category_names = ['anti aircraft gun', 'self propelled AA', 'surface-to-air missile system']

category_dataframes = dict(zip(category_names, [aa_guns, aa_gunsSP, sam]))

AA = pd.concat([df.assign(category=category) for category, df in category_dataframes.items()], ignore_index=True)

In [42]:
AA

Unnamed: 0,model,state,photo_links,category
0,23mm ZU-23-2,destroyed,https://i.postimg.cc/Y24jwX90/s2.png,anti aircraft gun
1,23mm ZU-23-2,destroyed,https://i.postimg.cc/MKgzsYwB/366.png,anti aircraft gun
2,23mm ZU-23-2,destroyed,https://twitter.com/UAWeapons/status/157836418...,anti aircraft gun
3,23mm ZU-23-2,captured,https://i.postimg.cc/rwZrBrTp/4fh.png,anti aircraft gun
4,23mm ZU-23-2,captured,https://i.postimg.cc/t4z6JH5f/3d14.png,anti aircraft gun
...,...,...,...,...
160,5P85SM2-01 (launcher for S-400),destroyed,https://twitter.com/UAWeapons/status/161644531...,surface-to-air missile system
161,5P85SM2-01 (launcher for S-400),destroyed,https://twitter.com/UAWeapons/status/167850966...,surface-to-air missile system
162,5P85SD/SM (launcher for S-300 PMU(-1)),destroyed,https://twitter.com/UAWeapons/status/155121622...,surface-to-air missile system
163,5P85SD/SM (launcher for S-300 PMU(-1)),destroyed,https://twitter.com/naalsio26/status/165147582...,surface-to-air missile system


### UAVs

In [43]:
category_names = ['combat UAV', 'reconnaissance UAV']

category_dataframes = dict(zip(category_names, [UAV, rUAV]))

UAV = pd.concat([df.assign(category=category) for category, df in category_dataframes.items()], ignore_index=True)

In [44]:
UAV

Unnamed: 0,model,state,photo_links,category
0,Orion,destroyed,https://i.postimg.cc/FsNjfGyG/57.png,combat UAV
1,Orion,destroyed,https://twitter.com/LotA_IL/status/15831597758...,combat UAV
2,Orion,destroyed,https://i.postimg.cc/DzDCgRwq/1011-Orion-ucav-...,combat UAV
3,Orion,destroyed,https://twitter.com/UAWeapons/status/165782877...,combat UAV
4,Orion,destroyed,https://i.postimg.cc/G2vF5ggy/1000-orion-ucav-...,combat UAV
...,...,...,...,...
265,Supercam S350,destroyed,https://i.postimg.cc/RZjJzDkD/1042-Supercam-S3...,reconnaissance UAV
266,Supercam S350,captured,https://i.postimg.cc/9fSnXycR/46g.png,reconnaissance UAV
267,Supercam S350,captured,https://i.postimg.cc/zXcdXxRN/1046-Supercam-S3...,reconnaissance UAV
268,Supercam S350,captured,https://i.postimg.cc/k5tJ9kyT/1037-Supercam-S3...,reconnaissance UAV


In [45]:
tanks['category'] = 'tank'
cmd_comms['category'] = 'C2 vehicle'
engineering['category'] = 'engineering'
ATGM['category'] = 'anti-tank guided missile'
MLRS['category'] = 'MLRS'
radars['category'] = 'radar'
jammers['category'] = 'jammer'
aircrafts['category'] = 'aircraft'
helicopters['category'] = 'helicopter'
navy['category'] = 'navy'
auto['category'] = 'auto'