In [338]:
import pandas as pd
import numpy as np

In [339]:
# Load the raw protest-event export into a DataFrame.
# NOTE(review): the file name suggests the extract spans 1900-01-01 to
# 2022-11-18 — confirm the data source and extraction date.
protests = pd.read_csv('1900-01-01-2022-11-18protests.csv')

In [340]:
# Row count of the raw frame.
protests.shape[0]

678280

In [341]:
# Inspect the dtype pandas inferred for every column.
# Note that event_date is read as object (it is not parsed as a datetime).
protests.dtypes

data_id               int64
iso                   int64
event_id_cnty        object
event_id_no_cnty      int64
event_date           object
year                  int64
time_precision        int64
event_type           object
sub_event_type       object
actor1               object
assoc_actor_1        object
inter1                int64
actor2               object
assoc_actor_2        object
inter2                int64
interaction           int64
region               object
country              object
admin1               object
admin2               object
admin3               object
location             object
latitude            float64
longitude           float64
geo_precision         int64
source               object
source_scale         object
notes                object
fatalities            int64
timestamp             int64
iso3                 object
dtype: object

In [342]:
# Count the missing values in each column.
protests.isna().sum()

data_id                  0
iso                      0
event_id_cnty            0
event_id_no_cnty         0
event_date               0
year                     0
time_precision           0
event_type               0
sub_event_type           0
actor1                   0
assoc_actor_1       212800
inter1                   0
actor2              636154
assoc_actor_2       672226
inter2                   0
interaction              0
region                   0
country                  0
admin1                   3
admin2               22807
admin3              376772
location                 0
latitude                 0
longitude                0
geo_precision            0
source                   0
source_scale             0
notes                  426
fatalities               0
timestamp                0
iso3                     0
dtype: int64

In [343]:
# for debugging 
# head = protests.head()
# interested = protests[['data_id', 'event_id_no_cnty', 'event_date', 'year', 'event_type', 'sub_event_type']]
# notes = protests[['notes']]
# note = notes.iat[6841,0]
# note

# Extract Protest Size from the Notes Column

In [348]:
def extract_size(s, window=50):
    """Pull the crowd-size label out of a trailing ``[size=...]`` tag.

    Parameters
    ----------
    s : object
        The note text. Anything that is not a string (for example the NaN
        pandas uses for a missing note) yields ``None``.
    window : int or None, default 50
        Only the last ``window`` characters of ``s`` are searched for the
        tag, so bracketed text earlier in the note is ignored. Pass ``None``
        to search the whole string.

    Returns
    -------
    str or None
        The text after the first ``=`` inside the tag, with surrounding
        whitespace, one trailing period and enclosing single quotes removed.
        ``None`` when there is no tag or the tag carries no value. If the tag
        contains no ``=`` the whole tag text (starting with ``size``) is
        returned.
    """
    if not isinstance(s, str):
        return None

    # Restrict the search to the tail of the note (same span as the slice
    # s[-window:]); rfind picks the last tag if several are present.
    lo = 0 if window is None else max(len(s) - window, 0)
    start = s.rfind("[size", lo)
    if start == -1:
        return None

    tag = s[start + 1:]  # drop the opening bracket
    end = tag.find("]")
    if end != -1:
        tag = tag[:end]
    # else: the closing bracket is missing; keep the remainder intact rather
    # than slicing with -1, which would silently chop off the last character.

    # find() returns -1 when there is no "=", so the slice then starts at 0.
    value = tag[tag.find("=") + 1:].strip()

    if value.endswith("."):
        value = value[:-1].rstrip()

    if value.startswith("'"):
        value = value[1:]
        # Only remove the final character when it really is a closing quote.
        if value.endswith("'"):
            value = value[:-1]
        value = value.strip()

    # An empty tag carries no information; report it as missing.
    return value or None

In [349]:
# Parse every note into a crowd-size label, store the labels in a new `size`
# column, and persist the augmented table.
# The column must be read back with brackets (protests['size']); the attribute
# `protests.size` is pandas' element count, not this column.
# The row index is written as the first CSV column because index=False is not
# passed to to_csv.
parsed_sizes = protests['notes'].apply(extract_size)
protests['size'] = parsed_sizes
protests.to_csv("protests_with_size.csv")

In [350]:
# Exact spellings (typos included) that mean "no size was reported" in the notes.
placeholder_labels = ["no report", "No Report", "no eport", "no reports", "no reply", "no review"]

# Rows where a size tag was found at all, then those whose tag is a real value.
size_info_reported = protests[protests['size'].notnull()]
size_reported = size_info_reported[~size_info_reported['size'].isin(placeholder_labels)]

total_rows = len(protests)
count_size = size_reported.shape[0]
# NOTE(review): event_id_no_cnty looks like the numeric suffix of event_id_cnty
# (e.g. SAF17883 -> 17883), so the same number may occur in several countries;
# confirm this is the intended key for counting unique events.
count_unique = size_reported['event_id_no_cnty'].nunique(dropna=False)
print(count_size/total_rows, 'of the dataset have actual information on size')
print(count_unique/total_rows, 'of the dataset are unique events with actual information on size')

0.25089343633897504 of the dataset have actual information on size
0.08104175266851447 of the dataset are unique events with actual information on size


In [347]:
# Display the rows that carry an actual size value (pandas truncates the rendering).
size_reported

Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,size
0,9633791,710,SAF17883,17883,11 November 2022,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,-26.1982,28.0219,1,GardaWorld; SABC News; News24 (South Africa); EWN,Other-National,"On 11 November 2022, about 200 SAMWU-led munic...",0,1668446884,ZAF,about 200
1,9633792,710,SAF17884,17884,11 November 2022,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,-29.8579,31.0292,1,Times (South Africa); GroundUp,National,"On 11 November 2022, more than 200 protesters ...",0,1668446884,ZAF,more than 200
3,9633988,710,SAF17871,17871,11 November 2022,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,-33.9253,18.4239,1,GroundUp,National,"On 11 November 2022, about 300 NUPSAW-led comm...",0,1668446884,ZAF,about 300
5,9634020,710,SAF17885,17885,11 November 2022,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,-33.9253,18.4239,1,News24 (South Africa),National,"On 11 November 2022, more than 100 protesters,...",0,1668446884,ZAF,more than 100
7,9634179,275,PSE17613,17613,11 November 2022,2022,1,Protests,Peaceful protest,Protesters (Palestine),...,31.7767,35.2342,1,Dunia Al Watan; Twitter; Quds News Network; Pa...,New media-National,"On 11 November 2022, thousands of Palestinian ...",0,1668450623,PSE,thousands
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673642,7913364,764,THA7560,7560,04 February 2010,2010,1,Protests,Peaceful protest,Protesters (Thailand),...,6.9702,100.4193,2,Prachatai,National,"On 4 February 2010, a group of more than 70 me...",0,1619205865,THA,more than 70
673701,7988427,764,THA7559,7559,29 January 2010,2010,1,Protests,Peaceful protest,Protesters (Thailand),...,13.7364,100.5239,1,Prachatai,National,"On 29 January 2010, four hundred members of th...",0,1620775298,THA,400
674017,7988384,764,THA7558,7558,08 January 2010,2010,1,Protests,Peaceful protest,Protesters (Thailand),...,13.7210,100.5389,1,Prachatai,National,"On 8 January 2010, a group of more than 30 wor...",0,1620775298,THA,more than 30
674160,9293581,324,GUI488,488,28 September 2009,2009,1,Protests,Excessive force against protesters,Protesters (Guinea),...,9.5091,-13.7122,1,HRW,Other,At a protest held at a Stadium to demonstrate ...,157,1653349783,GIN,at least 1400


In [351]:
# Number of rows with an actual size value.
size_reported.shape[0]

170176

In [352]:
# Number of distinct event_id_no_cnty values among the rows with a size value.
# NOTE(review): event_id_no_cnty looks like the numeric suffix of event_id_cnty
# (e.g. SAF17883 -> 17883), so it may repeat across countries; confirm it is
# the intended key for counting unique events.
size_reported['event_id_no_cnty'].nunique(dropna=False)

54969