# Extracting data about casualties from UN Human Rights UA (civilian + children)

In [44]:
import datetime
import re
from datetime import date

import numpy as np
import pandas as pd

In [45]:
tweets = pd.read_csv("./../data/UNHumanRightsUATweets.csv")

In [46]:
tweets.head()

Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,cashtags,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1498918622059798532,1498918622059798532,1646205000000.0,2022-03-02 08:10:22,200,,"From 24-28 Feb, we recorded 550 civilian casua...",en,['ukraine'],[],...,,,,,,[],,,,
1,1499126009085497345,1499126009085497345,1646254000000.0,2022-03-02 21:54:27,200,,"From 24 Feb—1 March, we recorded 752 civilian ...",en,['ukraine'],[],...,,,,,,[],,,,
2,1499460762330402825,1499460762330402825,1646334000000.0,2022-03-03 20:04:39,200,,"From 24 Feb—2 March, we recorded 802 civilian ...",en,['ukraine'],[],...,,,,,,[],,,,
3,1499785706222329870,1499785706222329870,1646412000000.0,2022-03-04 17:35:51,200,,"From 24 Feb—3 March, we recorded 1,006 civilia...",en,['ukraine'],[],...,,,,,,[],,,,
4,1500121254136848390,1500121254136848390,1646492000000.0,2022-03-05 15:49:12,200,,"From 24 Feb—4 March, we recorded 1,058 civilia...",en,['ukraine'],[],...,,,,,,[],,,,


In [47]:
common_beginning_mask = tweets.tweet.str.contains("from", case=False)

In [48]:
casualties_information_pattern = r".* ([0-9]+|[0-9]+,[0-9]+) killed, incl ([0-9]+|[0-9]+,[0-9]+) children; ([0-9]+|[0-9]+,[0-9]+) injured, incl ([0-9]+|[0-9]+,[0-9]+) children"
date_pattern = r"From [0-9]+ Feb—([0-9]+) (\w+).*"
date_format = "%d %B %y"

Create dataset

In [49]:
date_range = pd.date_range(date(2022, 2, 24), date.today())
columns = ["civilians_killed", "civilians_injured", "children_killed", "children_injured"]

In [50]:
casualties = pd.DataFrame(np.zeros(shape=(len(date_range), len(columns)), dtype=np.int32), index=date_range, columns=columns)

In [51]:
casualties.head()

Unnamed: 0,civilians_killed,civilians_injured,children_killed,children_injured
2022-02-24,0,0,0,0
2022-02-25,0,0,0,0
2022-02-26,0,0,0,0
2022-02-27,0,0,0,0
2022-02-28,0,0,0,0


In [52]:
def string_number_to_int(str_number):
    return int(str_number.replace(",", ""))

In [53]:
for tweet in tweets[common_beginning_mask].tweet:
    result = re.match(casualties_information_pattern, str(tweet))
    if result is None:
        continue
    date_format = "%d %B %y"
    print(tweet)
    date_result = re.match(date_pattern, str(tweet))
    print(date_result)
    try:
        day = datetime.datetime.strptime(date_result.group(1) + " " + date_result.group(2) + " 22", date_format)
        print(day)
    except Exception:
        print("ec")
        continue
    casualties.loc[day, "civilians_killed"] = string_number_to_int(result.group(1))
    casualties.loc[day, "civilians_injured"] = string_number_to_int(result.group(2))
    casualties.loc[day, "children_killed"] = string_number_to_int(result.group(3))
    casualties.loc[day, "children_injured"] = string_number_to_int(result.group(4))

From 24-28 Feb, we recorded 550 civilian casualties in context of the Russia’s military action against #Ukraine: 142 killed, incl 13 children; 408 injured, incl 26 children, mostly caused by shelling &amp; airstrikes. Real toll is much higher. Full update— https://t.co/lNWHqVm1s6  https://t.co/aOZExKWGnQ
None
ec
From 24 Feb—1 March, we recorded 752 civilian casualties in context of the Russia’s military action against #Ukraine: 227 killed, incl 15 children; 525 injured, incl 28 children, mostly caused by shelling &amp; airstrikes. Real toll is much higher. Full update— https://t.co/hy7acxKBfz  https://t.co/yiZGbpw0vV
<re.Match object; span=(0, 310), match='From 24 Feb—1 March, we recorded 752 civilian cas>
2022-03-01 00:00:00
From 24 Feb—2 March, we recorded 802 civilian casualties in context of the Russia’s military action against #Ukraine: 249 killed, incl 17 children; 553 injured, incl 30 children, mostly caused by shelling &amp; airstrikes. Real toll is much higher. Full update— ht

In [54]:
casualties

Unnamed: 0,civilians_killed,civilians_injured,children_killed,children_injured
2022-02-24,0,0,0,0
2022-02-25,0,0,0,0
2022-02-26,0,0,0,0
2022-02-27,0,0,0,0
2022-02-28,0,0,0,0
...,...,...,...,...
2022-04-28,0,0,0,0
2022-04-29,0,0,0,0
2022-04-30,0,0,0,0
2022-05-01,0,0,0,0


In [55]:
casualties.to_excel("./../data/casualtiesUNHumanRightsUATweets.xlsx")