# Extracting casualty data (civilians and children) from UN Human Rights UA tweets

In [1]:
import datetime
import re

import numpy as np
import pandas as pd

In [2]:
# Load the CSV of tweets; the path is relative to the notebook's working directory.
tweets = pd.read_csv(filepath_or_buffer="./../data/UNHumanRightsUATweets.csv")

In [3]:
# Peek at the first five rows to check the columns that were loaded.
tweets.head(n=5)

Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,cashtags,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1498918622059798532,1498918622059798532,1646205000000.0,2022-03-02 08:10:22,200,,"From 24-28 Feb, we recorded 550 civilian casua...",en,['ukraine'],[],...,,,,,,[],,,,
1,1499126009085497345,1499126009085497345,1646254000000.0,2022-03-02 21:54:27,200,,"From 24 Feb—1 March, we recorded 752 civilian ...",en,['ukraine'],[],...,,,,,,[],,,,
2,1499460762330402825,1499460762330402825,1646334000000.0,2022-03-03 20:04:39,200,,"From 24 Feb—2 March, we recorded 802 civilian ...",en,['ukraine'],[],...,,,,,,[],,,,
3,1499785706222329870,1499785706222329870,1646412000000.0,2022-03-04 17:35:51,200,,"From 24 Feb—3 March, we recorded 1,006 civilia...",en,['ukraine'],[],...,,,,,,[],,,,
4,1500121254136848390,1500121254136848390,1646492000000.0,2022-03-05 15:49:12,200,,"From 24 Feb—4 March, we recorded 1,058 civilia...",en,['ukraine'],[],...,,,,,,[],,,,


In [4]:
# Case-insensitive flag per tweet: does the text contain "from"?
# NOTE(review): no later cell shown in this notebook reads this mask - confirm it is still needed.
common_beginning_mask = tweets["tweet"].str.contains("from", case=False)

In [5]:
# Matches the statistics part of a tweet, i.e. text shaped like
# "... <N> killed, incl <N> children; <N> injured, incl <N> children".
# Groups: 1 = killed, 2 = children among the killed, 3 = injured, 4 = children among the injured.
# Every number is either plain digits or digits with a single thousands comma (e.g. "1,006").
casualties_information_pattern = r".* ([0-9]+|[0-9]+,[0-9]+) killed, incl ([0-9]+|[0-9]+,[0-9]+) children; ([0-9]+|[0-9]+,[0-9]+) injured, incl ([0-9]+|[0-9]+,[0-9]+) children"
# Matches a reporting period written as "<day> Feb—<day> <month>" (with an em dash).
# Group 1 is the end day, group 2 the word that follows it (the end month).
date_pattern = r".*[0-9]+ Feb—([0-9]+) (\w+).*"
# strptime format: day of month, full month name, two-digit year.
date_format = "%d %B %y"

Create dataset

In [6]:
# One entry per calendar day. The range starts on 23 Feb 2022, the day that is
# later filled with zeros as the starting point for interpolation.
date_range = pd.date_range(
    start=datetime.date(2022, 2, 23),
    end=datetime.date(2022, 4, 28),
    freq="D",
)
# Four cumulative counts plus a flag telling whether a row was interpolated.
columns = [
    "civilians_killed",
    "civilians_injured",
    "children_killed",
    "children_injured",
    "interpolated",
]

In [7]:
# Empty frame: one row per calendar day, every cell NaN until a value is written into it.
casualties = pd.DataFrame(np.nan, index=date_range, columns=columns)
# The flag column is going to hold True/False, not numbers. Making it object dtype
# up front means later boolean assignments do not depend on pandas silently
# upcasting a float64 column (deprecated since pandas 2.1).
casualties["interpolated"] = casualties["interpolated"].astype(object)

In [8]:
# Show the first five (still empty) rows of the frame.
casualties.head(n=5)

Unnamed: 0,civilians_killed,civilians_injured,children_killed,children_injured,interpolated
2022-02-23,,,,,
2022-02-24,,,,,
2022-02-25,,,,,
2022-02-26,,,,,
2022-02-27,,,,,


In [9]:
def string_number_to_int(str_number):
    """Convert a digit string that may contain thousands commas (e.g. "1,006") to an int."""
    digits_only = "".join(str_number.split(","))
    return int(digits_only)

In [10]:
def infer_date(whole_tweet, pattern=None, year=2022):
    """Return the calendar day a casualty tweet reports on.

    Parameters
    ----------
    whole_tweet : object with ``tweet`` and ``date`` attributes
        ``tweet`` is the tweet text; ``date`` is the posting time as a
        "%Y-%m-%d %H:%M:%S" string.
    pattern : str, optional
        Regex whose group 1 is the end day and group 2 the end month name.
        Defaults to the module-level ``date_pattern``.
    year : int, optional
        Year to attach to the day/month found in the text (the text carries none).

    Returns
    -------
    datetime.date
        The end date written in the tweet, or, when the text does not match
        ``pattern``, the day before the tweet was posted.

    Raises
    ------
    ValueError
        If the day/month found in the text (or the posting time) cannot be parsed.
    """
    if pattern is None:
        pattern = date_pattern

    # str() keeps re.match from failing when the text is not a string (e.g. NaN).
    found = re.match(pattern, str(whole_tweet.tweet))
    if found is not None:
        # 1st option: the end of the reporting period is spelled out in the tweet.
        # strptime itself raises ValueError on an unparseable day or month name,
        # so no extra exception translation is needed.
        stamp = "{} {} {}".format(found.group(1), found.group(2), year)
        return datetime.datetime.strptime(stamp, "%d %B %Y").date()

    # Fallback: assume the tweet reports on the day before it was posted.
    posted_at = datetime.datetime.strptime(whole_tweet.date, "%Y-%m-%d %H:%M:%S")
    return posted_at.date() - datetime.timedelta(days=1)


In [11]:
# Walk every tweet and copy its four counts into the row of the day it reports on.
for idx, whole_tweet in tweets.iterrows():
    tweet = whole_tweet.tweet
    # A missing text cell is read from CSV as NaN (a float); re.match would raise
    # TypeError on it, so skip anything that is not a string.
    if not isinstance(tweet, str):
        continue
    # check if the casualties' info is in the post
    result = re.match(casualties_information_pattern, tweet)
    if result is None:
        continue
    try:
        date = infer_date(whole_tweet)
    except ValueError:
        # the date in the tweet could not be parsed - ignore this tweet
        continue

    # Build the row label once instead of once per column.
    day = pd.Timestamp(date)
    # Group order in the pattern: killed, children killed, injured, children injured.
    killed, children_killed, injured, children_injured = map(string_number_to_int, result.groups())
    casualties.loc[day, "civilians_killed"] = killed
    casualties.loc[day, "children_killed"] = children_killed
    casualties.loc[day, "civilians_injured"] = injured
    casualties.loc[day, "children_injured"] = children_injured
    # The values for this day come straight from a tweet, not from interpolation.
    casualties.loc[day, "interpolated"] = False

In [12]:
# Replace the Timestamp labels with plain datetime.date objects. Going through
# pd.to_datetime keeps the cell re-runnable: once the index already holds
# date objects it no longer has a `.date` attribute of its own.
casualties.index = pd.to_datetime(casualties.index).date
# Anchor row for interpolation: all counts are zero on 23 Feb 2022.
# Value order follows the frame's columns:
# civilians_killed, civilians_injured, children_killed, children_injured, interpolated.
casualties.loc[datetime.date(2022, 2, 23)] = [0,0,0,0, False]
# 28 Feb was not picked up by the extraction loop, so its figures are entered by hand.
# NOTE(review): presumably taken from the "From 24-28 Feb ... 550 civilian casualties"
# tweet (142 + 408 = 550) - confirm against the source.
casualties.loc[datetime.date(2022, 2, 28)] = [142, 408, 13, 26, False]

In [13]:
# Rows whose flag is still missing received no explicit values above,
# so mark them as rows whose counts will come from interpolation.
flag_column = casualties["interpolated"]
casualties["interpolated"] = flag_column.fillna(True)

In [14]:
# Display the frame before interpolation: explicitly filled rows plus NaN gaps.
casualties

Unnamed: 0,civilians_killed,civilians_injured,children_killed,children_injured,interpolated
2022-02-23,0.0,0.0,0.0,0.0,False
2022-02-24,,,,,True
2022-02-25,,,,,True
2022-02-26,,,,,True
2022-02-27,,,,,True
...,...,...,...,...,...
2022-04-24,2665.0,3050.0,195.0,296.0,False
2022-04-25,2729.0,3111.0,201.0,299.0,False
2022-04-26,2787.0,3152.0,202.0,302.0,False
2022-04-27,2829.0,3180.0,205.0,303.0,False


In [15]:
# Target dtype per column: 32-bit integers for the four counts, bool for the flag.
dtypes = dict(zip(columns, [np.int32] * 4 + [bool]))

In [16]:
# Interpolate only the count columns. The True/False flag column is not numeric,
# and interpolating object-dtype data is deprecated in pandas >= 2.1.
count_columns = [name for name in casualties.columns if name != "interpolated"]
casualties[count_columns] = casualties[count_columns].astype("float64").interpolate()
# Casting to int32 truncates the fractional part of interpolated values.
# The 23 Feb anchor row is removed by label rather than by position, so running
# this cell again does not drop another (real) day.
casualties = casualties.astype(dtypes).drop(index=datetime.date(2022, 2, 23), errors="ignore")

In [17]:
# Label the index so that it is written out as a "date" column header.
casualties.index.names = ["date"]
casualties

Unnamed: 0_level_0,civilians_killed,civilians_injured,children_killed,children_injured,interpolated
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-02-24,28,81,2,5,True
2022-02-25,56,163,5,10,True
2022-02-26,85,244,7,15,True
2022-02-27,113,326,10,20,True
2022-02-28,142,408,13,26,False
...,...,...,...,...,...
2022-04-24,2665,3050,195,296,False
2022-04-25,2729,3111,201,299,False
2022-04-26,2787,3152,202,302,False
2022-04-27,2829,3180,205,303,False


In [18]:
# Write the finished daily table next to the input data.
casualties.to_csv(path_or_buf="./../data/casualtiesUNHumanRightsUATweets.csv")