# Scraping equipment losses with sources

The data aggregated here originally comes from the OSINT project **Attack On Europe: Documenting Russian Equipment Losses During The Russian Invasion Of Ukraine**. The website is: https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html

**I have preferred to scrape this data and rearrange it in a manner that fits my needs.**

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import urllib.request
import re
import numpy as np
import pandas as pd

In [2]:
# Fetch the page inside a context manager so the HTTP response is closed as
# soon as the document has been parsed.
with urllib.request.urlopen("https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html") as page:
    # NOTE(review): no parser is named, so BeautifulSoup chooses one itself.
    # The hard-coded <ul> positions used further down presumably rely on the
    # tree produced by that parser -- confirm before pinning a specific one.
    soup = bs(page)

In [3]:
h3s = soup.find_all('h3')

# The first three <h3> elements are skipped; the remaining ones are printed,
# one heading per line (category name followed by its loss counts).
for heading in h3s[3:]:
    print(heading.text)

Tanks (2475, of which destroyed: 1618, damaged: 140, abandoned: 167, captured: 550)
Armoured Fighting Vehicles (1042, of which destroyed: 696, damaged: 30, abandoned: 47, captured: 269)
Infantry Fighting Vehicles (3007, of which destroyed: 2102, damaged: 117, abandoned: 170, captured: 618)
Armoured Personnel Carriers (369, of which destroyed: 249, damaged: 11, abandoned: 17, captured: 92)
Mine-Resistant Ambush Protected (MRAP) Vehicles (49, of which destroyed: 35, damaged: 4, abandoned: 1, captured: 9)
Infantry Mobility Vehicles (224, of which destroyed: 161, damaged: 8, abandoned: 4, captured: 51)
Command Posts And Communications Stations (256, of which destroyed: 167, damaged: 3, abandoned: 2, captured: 84)
Engineering Vehicles And Equipment (359, of which destroyed: 181, damaged: 10, abandoned: 43, captured: 125)
Self-Propelled Anti-Tank Missile Systems (42, of which destroyed: 18, damaged: 1, abandoned: 4, captured: 19)
Artillery Support Vehicles And Equipment (109, of which destro

**I have to access and scrape each category separately.**

In [4]:
# Every <ul> element of the page, in document order. Later cells pick a
# category's list out of this result by its position (e.g. ul[1] for tanks).
ul = soup.find_all("ul") # all categories

## Tanks

In [5]:
# Second <ul> of the page: used here as the list of tank entries.
tanks_ = ul[1]

# One <li> per entry. A single find_all is enough -- the result does not
# depend on which child of the list is being visited.
tank_model = tanks_.find_all('li')

model = []
photo_links = []
state = []

for item in tank_model:
    # Text before the first ":" with its first character dropped and
    # non-breaking spaces normalised. It is the same for every link of the
    # entry, so it is computed once per <li>.
    model_num = item.text[1::].split(":")[0].replace('\xa0', ' ')

    for link in item.find_all('a'):
        href = link.get('href')

        # Anchors without an href carry no photo and are skipped.
        if href:
            photo_links.append(href)
            # Link text minus its first character (presumably the opening
            # parenthesis -- the closing one is removed later by func2).
            state.append(link.get_text(strip=True)[1:])
            model.append(model_num)

tanks = pd.DataFrame({'model' : model, 'state': state, 'photo_links': photo_links})

## Automating the process for the remaining 23 categories

In [6]:
category_data = {}

# Name of each category (also used as the dataframe name) mapped to the
# position of its <ul> in `ul`. Positions run consecutively from 2 to 24.
bs4_objects = {
    "armoured": 2,
    "IFV": 3,
    "APC": 4,
    "MRAP": 5,
    "IMV": 6,
    "cmd_comms": 7,
    "engineering": 8,
    "ATGM": 9,
    "arty_support": 10,
    "arty_towed": 11,
    "arty_SP": 12,
    "MLRS": 13,
    "aa_guns": 14,
    "aa_gunsSP": 15,
    "sam": 16,
    "radars": 17,
    # Was 19, which duplicated "aircrafts" and skipped position 18, so the
    # jammers frame silently contained the aircraft rows.
    "jammers": 18,
    "aircrafts": 19,
    "helicopters": 20,
    "UAV": 21,
    "rUAV": 22,
    "navy": 23,
    "auto": 24
}

# Loop over the categories and extract one row per photo link.
for category, ul_index in bs4_objects.items():
    model = []
    photo_links = []
    state = []

    ul_element = ul[ul_index]
    cats = ul_element.find_all('li')

    for cat in cats:
        # Text before the first ":" with its first character dropped and
        # non-breaking spaces normalised; identical for every link of the
        # entry, so it is computed once per <li>.
        model_num = cat.text[1:].split(":")[0].replace('\xa0', ' ')

        for link in cat.find_all('a'):
            href = link.get('href')

            # Anchors without an href are skipped.
            if href:
                photo_links.append(href)
                state.append(link.get_text(strip=True)[1:])
                model.append(model_num)

    # Generate a dataframe for this category and store it in the dict.
    category_df = pd.DataFrame({'model': model, 'state': state, 'photo_links': photo_links})
    category_data[category] = category_df

# Data transformation & quality assessment

In [7]:
# Bind every scraped category frame to a module-level name of its own.
# The names on the left and the keys on the right are listed in the same order.
(
    armoured, IFV, APC, MRAP, IMV,
    cmd_comms, engineering, ATGM,
    arty_support, arty_towed, arty_SP, MLRS,
    aa_guns, aa_gunsSP, sam,
    radars, jammers, aircrafts, helicopters,
    UAV, rUAV, navy, auto,
) = (
    category_data[key]
    for key in (
        'armoured', 'IFV', 'APC', 'MRAP', 'IMV',
        'cmd_comms', 'engineering', 'ATGM',
        'arty_support', 'arty_towed', 'arty_SP', 'MLRS',
        'aa_guns', 'aa_gunsSP', 'sam',
        'radars', 'jammers', 'aircrafts', 'helicopters',
        'UAV', 'rUAV', 'navy', 'auto',
    )
)

## Functions for preprocessing the data

In [8]:
def func1(record):
    """Strip a run of digits from the very beginning of ``record``.

    Only the leading digits are removed; any character that follows them
    (including a space) is kept, and strings that do not start with a digit
    are returned unchanged.
    """
    return re.sub(r'^\d+', '', record)


def func2(record):
    """Reduce a scraped state string to the state word, keeping a multiplier.

    A trailing ")" is removed and the text is split on spaces and commas.
    Purely numeric tokens are counted:

    * no numeric token   -> the last token is returned;
    * one numeric token  -> the last non-numeric token is returned;
    * several            -> ``"<count> x <last non-numeric token>"``.

    The word "and" is never treated as the state.

    Returns ``None`` when the record has numeric tokens but no state word
    at all (this used to raise ``IndexError``).
    """
    record = re.sub(r'\)$', '', record)
    words = re.split(r'[ ,]+', record)

    digit_words = [word for word in words if re.match(r'^\d+$', word)]
    if not digit_words:
        return words[-1]

    non_digit_words = [word for word in words if not re.match(r'^\d+$', word) and word != "and"]
    if not non_digit_words:
        # Only numbers (and possibly "and"): there is no state to report.
        return None

    last_word = non_digit_words[-1]
    digit_count = len(digit_words)
    if digit_count == 1:
        return last_word
    return f'{digit_count} x {last_word}'

def func3(input_string):
    """Return the last run of word characters in ``input_string``.

    Trailing non-word characters are ignored. ``None`` is returned when the
    string holds no word character at all. Intended for the "arty_support"
    and "auto" categories.
    """
    found = re.search(r'(\w+)\W*$', input_string)
    return found.group(1) if found else None

def replace_state(state):
    """Correct the misspelled state value ``'abanonded'`` to ``'abandoned'``.

    Any other value is returned unchanged.
    """
    return 'abandoned' if state == 'abanonded' else state

### tanks preprocessing

In [9]:
def replace_tmodel(model):
    """Drop the "(1, destroyed)" suffix from the one unknown T-54/55 entry.

    The comparison is exact (leading space included); every other model
    string is returned unchanged.
    """
    return 'Unknown T-54/55' if model == ' Unknown T-54/55 (1, destroyed)' else model
    
def replace_mk(record):
    """Replace every "Obr." in ``record`` with "Mk.".

    In an equipment model name, "Obr. ####" stands for the Russian
    "обр. ####" ("modification ####"), e.g. "Obr. 2016" -> "Mk. 2016".
    """
    return re.sub(r'Obr\.', 'Mk.', record)

### towed artillery preprocessing

In [10]:
def func_ta(record):
    """Return the last whitespace-separated word of ``record``.

    Every ")" is removed first. ``None`` is returned when nothing is left
    after removing the parentheses and splitting on whitespace.
    """
    words = re.sub(r'\)', '', record).split()
    return words[-1] if words else None

## Applying Functions

In [11]:
# Model column: strip the leading count, tidy the unknown T-54/55 entry,
# then rename "Obr." to "Mk.". The two columns are cleaned independently.
tanks['model'] = (
    tanks['model']
    .apply(func1)
    .apply(replace_tmodel)
    .apply(replace_mk)
)

# State column: reduce to the state word, then correct the misspelled state.
tanks['state'] = tanks['state'].apply(func2).apply(replace_state)

In [12]:
# Every category except towed artillery gets the same two-step cleaning, so
# the frames are processed in one loop instead of one copy-pasted cell each.
# The frames are modified in place, exactly as the individual cells did.
standard_frames = [
    armoured, IFV, APC, MRAP, IMV,
    cmd_comms, engineering, ATGM,
    arty_support, arty_SP, MLRS,
    aa_guns, aa_gunsSP, sam,
    radars, jammers, aircrafts, helicopters,
    UAV, rUAV, navy, auto,
]

for frame in standard_frames:
    frame['model'] = frame['model'].apply(func1)
    frame['state'] = frame['state'].apply(func2)

# Towed artillery needs its own state handling.
arty_towed['model'] = arty_towed['model'].apply(func1)
arty_towed['state'] = arty_towed['state'].apply(func_ta) # custom
arty_towed['state'] = arty_towed['state'].apply(lambda state: 'damaged' if state == 'aged' else state) # custom preprocessing

## Reshaping

I want to aggregate the data in another manner. For example, these categories ('armoured', 'IFV', 'APC', 'MRAP', 'IMV') will become "Infantry vehicles." Artillery support, self-propelled artillery, and towed artillery will become "artillery," and so on.

### Infantry vehicles

In [35]:
# Category label -> cleaned dataframe, in the order the rows should be stacked.
category_dataframes = {
    'armoured': armoured,
    'IFV': IFV,
    'APC': APC,
    'MRAP': MRAP,
    'IMV': IMV,
}
category_names = list(category_dataframes)

# Tag each frame with its label in a new 'category' column, then stack them
# under a fresh 0..n-1 index.
labelled_frames = []
for category, df in category_dataframes.items():
    labelled_frames.append(df.assign(category=category))

infantry_vehicles = pd.concat(labelled_frames, ignore_index=True)

### Artillery

In [37]:
# Labels must be listed in the same order as the frames they are zipped with.
# Previously 'towed' and 'support' were swapped, so the support-vehicle rows
# were labelled 'towed' and the towed-artillery rows 'support'.
category_names = ['support', 'towed', 'self propelled']

category_dataframes = dict(zip(category_names, [arty_support, arty_towed, arty_SP]))

# Tag each frame with its label and stack them under a fresh 0..n-1 index.
artillery = pd.concat([df.assign(category=category) for category, df in category_dataframes.items()], ignore_index=True)

### Anti Aircraft

In [39]:
# Category label -> cleaned dataframe, in the order the rows should be stacked.
category_dataframes = {
    'anti aircraft gun': aa_guns,
    'self propelled AA': aa_gunsSP,
    'surface-to-air missile system': sam,
}
category_names = list(category_dataframes)

# Tag each frame with its label in a new 'category' column, then stack them
# under a fresh 0..n-1 index.
labelled_frames = []
for category, df in category_dataframes.items():
    labelled_frames.append(df.assign(category=category))

AA = pd.concat(labelled_frames, ignore_index=True)

### UAVs

In [41]:
category_names = ['combat UAV', 'reconnaissance UAV']

# The per-category frames are read from category_data (the same objects the
# cleaning cells modified in place) rather than through the names UAV/rUAV:
# this cell rebinds `UAV` to the combined frame, so re-running it would
# otherwise feed the already-combined frame back in as 'combat UAV'.
category_dataframes = dict(zip(category_names, [category_data['UAV'], category_data['rUAV']]))

# Tag each frame with its label and stack them under a fresh 0..n-1 index.
UAV = pd.concat([df.assign(category=category) for category, df in category_dataframes.items()], ignore_index=True)

### Creating the final dataset

In [43]:
# Frames that map to exactly one category each, paired with the label to
# write into their 'category' column (the frames are modified in place).
single_category_frames = [
    (tanks, 'tank'),
    (cmd_comms, 'C2 vehicle'),
    (engineering, 'engineering'),
    (ATGM, 'anti-tank guided missile'),
    (MLRS, 'MLRS'),
    (radars, 'radar'),
    (jammers, 'jammer'),
    (aircrafts, 'aircraft'),
    (helicopters, 'helicopter'),
    (navy, 'navy'),
    (auto, 'auto'),
]

for frame, label in single_category_frames:
    frame['category'] = label

In [44]:
# Stack all category frames, tanks first and auto last, into one long table.
# No new index is generated, so each frame's own row labels are carried over.
frames_in_order = [
    tanks, infantry_vehicles, artillery, AA, UAV,
    cmd_comms, engineering, ATGM, MLRS, radars, jammers,
    aircrafts, helicopters, navy, auto,
]
ru_equipment1 = pd.concat(frames_in_order)

In [45]:
# Display the combined frame. Multi-item states such as "3 x damaged" are
# still a single row at this point; they are expanded in a later cell.
ru_equipment1

Unnamed: 0,model,state,photo_links,category
0,Unknown T-54/55,destroyed,https://twitter.com/CalibreObscura/status/1670...,tank
1,T-55A,damaged,https://i.postimg.cc/jdFBJdQb/1027-t55-dam-05-...,tank
2,T-62 Mk. 1967,captured,https://i.postimg.cc/yxw0SFD6/1001-T-62-Obr-19...,tank
3,T-62M,destroyed,https://twitter.com/UAWeapons/status/154479707...,tank
4,T-62M,destroyed,https://i.postimg.cc/X72RXDmM/1024-t62m-destr-...,tank
...,...,...,...,...
2594,(Unknown) vehicle,destroyed,https://i.postimg.cc/05Tnz61Z/1025-unkn-vehicl...,auto
2595,(Unknown) vehicle,destroyed,https://i.postimg.cc/sXQx68zP/1018-unkn-vehicl...,auto
2596,(Unknown) vehicle,3 x damaged,https://twitter.com/UAWeapons/status/151422993...,auto
2597,(Unknown) vehicle,damaged,https://twitter.com/UAWeapons/status/151532766...,auto


In [46]:
def expand_multi_state_rows(frame):
    """Repeat every row whose state reads "<n> x <state>" n times.

    Each repeated copy gets the plain ``<state>`` as its state and keeps the
    row label of the row it came from. Rows whose state is not a string
    (``None``, NaN, ...) or does not have exactly the "<n> x <state>" shape
    are passed through unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'state' column.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the original and repeated rows, in order.
    """
    expanded_rows = []

    for _, row in frame.iterrows():
        state_value = row['state']

        # Only strings can carry the "<n> x <state>" shorthand. Checking the
        # type (rather than just `is not None`) keeps NaN and other
        # non-string values from crashing on .split().
        parts = state_value.split(' x ') if isinstance(state_value, str) else []

        if len(parts) == 2 and parts[0].isdigit():
            num_repeats = int(parts[0])
            single_state = parts[1]

            # One copy of the row per counted item.
            for _ in range(num_repeats):
                new_row = row.copy()
                new_row['state'] = single_state
                expanded_rows.append(new_row)
        else:
            expanded_rows.append(row)

    return pd.DataFrame(expanded_rows)


# Create a new DataFrame from the expanded rows
ru_equipment = expand_multi_state_rows(ru_equipment1)

In [47]:
# Display the expanded frame: a row whose state was "<n> x <state>" now
# appears n times, each copy keeping the original row label.
ru_equipment

Unnamed: 0,model,state,photo_links,category
0,Unknown T-54/55,destroyed,https://twitter.com/CalibreObscura/status/1670...,tank
1,T-55A,damaged,https://i.postimg.cc/jdFBJdQb/1027-t55-dam-05-...,tank
2,T-62 Mk. 1967,captured,https://i.postimg.cc/yxw0SFD6/1001-T-62-Obr-19...,tank
3,T-62M,destroyed,https://twitter.com/UAWeapons/status/154479707...,tank
4,T-62M,destroyed,https://i.postimg.cc/X72RXDmM/1024-t62m-destr-...,tank
...,...,...,...,...
2596,(Unknown) vehicle,damaged,https://twitter.com/UAWeapons/status/151422993...,auto
2596,(Unknown) vehicle,damaged,https://twitter.com/UAWeapons/status/151422993...,auto
2597,(Unknown) vehicle,damaged,https://twitter.com/UAWeapons/status/151532766...,auto
2598,(Unknown) vehicle,damaged,https://twitter.com/UAWeapons/status/152307037...,auto


In [54]:
# Destination of the final dataset.
# NOTE(review): this is an absolute path on one specific Windows machine;
# the export only works where this directory exists -- consider a relative,
# configurable location.
OUTPUT_CSV = r"C:\Users\Asus\Desktop\Py\PROJECTS\__Ru\ru harvesting\ru_equipment.csv"

ru_equipment.to_csv(OUTPUT_CSV)