In [83]:
import pandas as pd
import numpy as np

In [84]:
# Read the protests CSV export into a DataFrame.
protests = pd.read_csv(filepath_or_buffer='1900-01-01-2022-11-18protests.csv')

# Convert event_date from string to date object

In [85]:
from datetime import datetime
def convert_to_datetime(date_string):
    """Parse a date written as day, full month name and year.

    Args:
        date_string: Text such as "11 November 2022" (format "%d %B %Y").

    Returns:
        The corresponding ``datetime.date``; the time part is discarded.

    Raises:
        ValueError: If ``date_string`` does not match the expected format.
        TypeError: If ``date_string`` is not a string.
    """
    parsed = datetime.strptime(date_string, "%d %B %Y")
    return parsed.date()

In [86]:
# Replace the event_date strings with date objects, one value at a time.
# Not idempotent: convert_to_datetime expects strings, so re-running this cell
# on the already-converted column raises a TypeError.
protests["event_date"] = protests["event_date"].map(convert_to_datetime)

In [87]:
# Save the frame with converted dates; the row index is not written.
protests.to_csv(path_or_buf="protests_clean.csv", index=False)

In [88]:
# for debugging 
# head = protests.head()
# interested = protests[['data_id', 'event_id_no_cnty', 'event_date', 'year', 'event_type', 'sub_event_type']]
# notes = protests[['notes']]
# note = notes.iat[6841,0]
# note

# Extract Protest Size from the Notes Column

In [89]:
def extract_size(s):
    """Extract the value of a trailing ``[size=...]`` tag from a notes string.

    Only a tag that starts within the last 50 characters of ``s`` is
    recognised. The text between ``=`` and the closing ``]`` is returned after
    light clean-up: one trailing period inside the tag, one leading space and
    surrounding single quotes are removed.

    Args:
        s: The notes text. Any non-string value (e.g. NaN or None) is treated
            as having no size information.

    Returns:
        The size text as a string, or None when ``s`` is not a string or has no
        ``[size`` tag in its last 50 characters. An empty tag (``[size=]``)
        yields an empty string.
    """
    if not isinstance(s, str):
        return None

    # A negative start makes rfind look only at the last 50 characters.
    start = s.rfind("[size", -50)
    if start == -1:
        return None

    tag = s[start + 1:]  # drop the opening "["
    end = tag.find("]")
    # If the closing bracket is missing, keep the rest of the string rather
    # than slicing with -1, which would silently drop the last character.
    if end != -1:
        tag = tag[:end]

    if tag.endswith("."):
        tag = tag[:-1]

    # When there is no "=", find() returns -1 and the whole tag text is kept.
    value = tag[tag.find("=") + 1:]

    if len(value) > 1:
        if value[0] == " ":
            value = value[1:]
        if value[0] == "'":
            # Strip the closing quote only when it is really there, so an
            # unbalanced quote does not cost a character of the value.
            value = value[1:-1] if value.endswith("'") else value[1:]
    return value

In [90]:
# First, drop rows whose notes column is missing. The explicit .copy() makes
# the result an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
protests = protests[protests['notes'].notna()].copy()

# Add a size column with the information extracted from the notes column.
protests['size'] = protests['notes'].apply(extract_size)

In [91]:
# Spellings (typos included) used in the size tag to say that no size was reported.
# There may be other variants that are not listed here.
no_report_labels = ["no report", "No Report", "no eport", "no reports", "no reply", "no review"]

# Keep rows that have a size value other than one of the placeholders above.
# .copy() gives an independent frame, so columns can be added to size_reported
# later without pandas' SettingWithCopyWarning.
has_size = protests['size'].notna() & ~protests['size'].isin(no_report_labels)
size_reported = protests[has_size].copy()

count_size = len(size_reported)
# NOTE(review): the displayed rows also carry a country-prefixed event_id_cnty
# (e.g. SAF17883 next to 17883); if event_id_no_cnty repeats across countries,
# this under-counts unique events - confirm which key is intended.
count_unique = len(size_reported['event_id_no_cnty'].unique())
print(count_size/len(protests), 'of the dataset have actual information on size')
print(count_unique/len(protests), 'of the dataset are unique events with actual information on size')

0.2510511113012537 of the dataset have actual information on size
0.0810926836752457 of the dataset are unique events with actual information on size


In [92]:
# Number of distinct event_id_no_cnty values among the rows that report a size.
print(
    "There are",
    len(size_reported['event_id_no_cnty'].unique()),
    "unique events with some information of protest size",
)

There are 54969 unique events with some information of protest size


In [93]:
# Number of distinct size strings; a high count means the values are free text.
print(
    "There are",
    len(size_reported['size'].unique()),
    "unique values in size",
)

There are 7992 unique values in size


In [77]:
# size_reported['size_type'] = size_reported['size'].apply(type)

In [94]:
def convert_to_numerical(size_string):
    """Convert a size value to an int when it is a plain integer.

    Args:
        size_string: The extracted size text (any value ``int()`` accepts also
            works).

    Returns:
        ``int(size_string)``, or None when the value cannot be converted
        (e.g. "about 200", None, NaN or infinity).
    """
    try:
        return int(size_string)
    # Only conversion failures mean "not numerical". A bare except would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return None

In [95]:
# Integer value of each size string where it parses cleanly, missing otherwise.
size_reported['numerical_size'] = size_reported['size'].map(convert_to_numerical)

In [96]:
# Percentage of rows whose size string converted to an integer.
print(
    len(size_reported[size_reported['numerical_size'].notnull()])/len(size_reported)*100,
    "% of the size values are numerical.",
)

8.026983828506957 % of the size values are numerical.


In [97]:
def has_numbers(size_string):
    """Tell whether the given text contains at least one digit character.

    Args:
        size_string: The string to scan.

    Returns:
        True as soon as a digit is found, False if there is none (including
        for an empty string).
    """
    for character in size_string:
        if character.isdigit():
            return True
    return False

In [98]:
# Flag each size string that contains at least one digit.
size_reported['has_numbers'] = size_reported['size'].map(has_numbers)

In [103]:
# Fraction of size strings that contain at least one digit.
len(size_reported[size_reported['has_numbers'].eq(True)]) / len(size_reported)

0.6408071643474991

In [104]:
# Show the filtered frame, including the derived size, numerical_size and has_numbers columns.
size_reported

Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,...,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,size,numerical_size,has_numbers
0,9633791,710,SAF17883,17883,2022-11-11,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,1,GardaWorld; SABC News; News24 (South Africa); EWN,Other-National,"On 11 November 2022, about 200 SAMWU-led munic...",0,1668446884,ZAF,about 200,,True
1,9633792,710,SAF17884,17884,2022-11-11,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,1,Times (South Africa); GroundUp,National,"On 11 November 2022, more than 200 protesters ...",0,1668446884,ZAF,more than 200,,True
3,9633988,710,SAF17871,17871,2022-11-11,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,1,GroundUp,National,"On 11 November 2022, about 300 NUPSAW-led comm...",0,1668446884,ZAF,about 300,,True
5,9634020,710,SAF17885,17885,2022-11-11,2022,1,Protests,Peaceful protest,Protesters (South Africa),...,1,News24 (South Africa),National,"On 11 November 2022, more than 100 protesters,...",0,1668446884,ZAF,more than 100,,True
7,9634179,275,PSE17613,17613,2022-11-11,2022,1,Protests,Peaceful protest,Protesters (Palestine),...,1,Dunia Al Watan; Twitter; Quds News Network; Pa...,New media-National,"On 11 November 2022, thousands of Palestinian ...",0,1668450623,PSE,thousands,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673642,7913364,764,THA7560,7560,2010-02-04,2010,1,Protests,Peaceful protest,Protesters (Thailand),...,2,Prachatai,National,"On 4 February 2010, a group of more than 70 me...",0,1619205865,THA,more than 70,,True
673701,7988427,764,THA7559,7559,2010-01-29,2010,1,Protests,Peaceful protest,Protesters (Thailand),...,1,Prachatai,National,"On 29 January 2010, four hundred members of th...",0,1620775298,THA,400,400.0,True
674017,7988384,764,THA7558,7558,2010-01-08,2010,1,Protests,Peaceful protest,Protesters (Thailand),...,1,Prachatai,National,"On 8 January 2010, a group of more than 30 wor...",0,1620775298,THA,more than 30,,True
674160,9293581,324,GUI488,488,2009-09-28,2009,1,Protests,Excessive force against protesters,Protesters (Guinea),...,1,HRW,Other,At a protest held at a Stadium to demonstrate ...,157,1653349783,GIN,at least 1400,,True
