In [316]:
import pandas as pd
import regex as re

### Loading in the full 2025 Crowd Counting Datasets

In [317]:
# there are some unique encodings in the data set so we must use a more flexible encoding than utf-8
crowd_data = pd.read_csv(".gitignore/dataverse_files/ccc_compiled_20212024.csv", encoding='latin-1')
crowd_data.drop(columns=['source_1', 'source_2',
       'source_3', 'source_4', 'source_5', 'source_6', 'source_7', 'source_8',
       'source_9', 'source_10', 'source_11', 'source_12', 'source_13',
       'source_14', 'source_15', 'source_16', 'source_17', 'source_18',
       'source_19', 'source_20', 'source_21', 'source_22', 'source_23',
       'source_24', 'source_25', 'source_26', 'source_27', 'source_28',
       'source_29', 'source_30', 'notes', 'lat', 'lon', 'resolved_locality',
       'resolved_county', 'resolved_state', 'fips_code'], inplace=True)

  crowd_data = pd.read_csv(".gitignore/dataverse_files/ccc_compiled_20212024.csv", encoding='latin-1')


### Trimming Down to correct dates

In [318]:
crowd_data['date'] = pd.to_datetime(crowd_data['date'])
start_date = pd.to_datetime("2024-04-1")
end_date = pd.to_datetime("2024-07-31")
mask = (crowd_data['date'] >= start_date) & (crowd_data['date'] <= end_date)
crowd_data = crowd_data[mask]

### Only Taking Events realated to the campus protests

In [319]:
key_phrases = ['divest', 'israel', 'palestine', 'divestment', 'liberation', 'palestinian', 'gaza', 'genocide']
regex = '|'.join(key_phrases)
crowd_data['claims'] = crowd_data['claims'].str.lower()
mask = crowd_data['claims'].str.contains(regex)
crowd_data = crowd_data[mask]

###

### Dropping Unnecessary Columns

In [320]:
crowd_data.drop(columns = ['locality', 'state', 'location_detail', 'online', 'type',
       'title', 'macroevent', 'organizations', 'participants', 'claims',
       'claims_summary', 'claims_verbatim', 'issue_tags_summary',
       'issue_tags_verbatim', 'issue_tags', 'valence',
       'size_cat', 'property_damage', 'property_damage_any',
       'chemical_agents', 'arrests_any', 'injuries_crowd_any', 'injuries_police_any', 'participant_deaths',
       'police_deaths', 'size_text'], inplace=True)

### Parsing of mixed data types

In [321]:
def parse_mixed(value):
    #NA case
    if pd.isna(value):
        return 0
    
    #value is already a number
    if isinstance(value, int) or isinstance(value, float):
        return float(value)
    
    #unclear/unspecified
    pattern = r'unspecified|unclear'
    if re.search(pattern, value):
        return pd.NA  # you can later impute with mean/median nonzero
    
    #"above value" case (in this just do 1 more than the value for a lower bound)
    pattern = r'(?:more than|over)\s+(\d+)'
    match = re.findall(pattern, value)
    if len(match) > 0:
        return float(match[0]) + 1
    
    #single number case
    pattern = r'(\d+)'
    match = re.findall(pattern, value)
    if len(match) > 0:
        return float(match[0])
    
    #catch all case
    return pd.NA


In [333]:
crowd_data['injuries_crowd'] = crowd_data['injuries_crowd'].apply(parse_mixed)
crowd_data['arrests'] = crowd_data['arrests'].apply(parse_mixed)
crowd_data['injuries_police'] = crowd_data['injuries_police'].apply(parse_mixed)
crowd_data['participant_measures'] = crowd_data['participant_measures'].astype(str)
crowd_data['police_measures'] = crowd_data['participant_measures'].astype(str)
print(crowd_data.columns)

Index(['date', 'size_low', 'size_high', 'size_mean', 'arrests',
       'injuries_crowd', 'injuries_police', 'participant_measures',
       'police_measures'],
      dtype='object')


### Aggregate the same days together by summing the counts and concatenating the strings of measures

In [None]:
agg_dict = {
    'size_low': 'sum',
    'size_high': 'sum',
    'size_mean': 'sum',
    'arrests': 'sum',
    'injuries_crowd': 'sum',
    'injuries_police': 'sum',
    'participant_measures': lambda x: ' ; '.join(x),
    'police_measures': lambda x: ' ; '.join(x)
}

crowd_data = crowd_data.groupby('date').agg(agg_dict)

Unnamed: 0_level_0,size_low,size_high,size_mean,arrests,injuries_crowd,injuries_police,participant_measures,police_measures
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-04-01,639.0,1583.0,1111.0,1.0,1.0,0.0,nan ; nan ; round-the-clock encampment to defe...,nan ; nan ; round-the-clock encampment to defe...
2024-04-02,275.0,341.0,308.0,5.0,0.0,0.0,nan ; Weekly protest outside office of U.S. Re...,nan ; Weekly protest outside office of U.S. Re...
2024-04-03,602.0,683.0,642.0,0.0,0.0,0.0,megaphone ; noise demo outside building where ...,megaphone ; noise demo outside building where ...
2024-04-04,986.0,1638.0,1313.0,1.0,0.0,0.0,megaphones; chalked sidewalk ; round-the-clock...,megaphones; chalked sidewalk ; round-the-clock...
2024-04-05,2179.0,2725.0,2451.0,23.0,0.0,0.0,nan ; smashed laptop computer with Intel chip ...,nan ; smashed laptop computer with Intel chip ...
...,...,...,...,...,...,...,...,...
2024-07-27,1663.0,3137.0,2401.0,9.0,0.0,0.0,nan ; megaphones; marched on sidewalks ; ampli...,nan ; megaphones; marched on sidewalks ; ampli...
2024-07-28,964.0,1024.0,994.0,0.0,0.0,0.0,megaphones ; nan ; nan ; nan ; nan ; nan ; nan...,megaphones ; nan ; nan ; nan ; nan ; nan ; nan...
2024-07-29,382.0,447.0,415.0,0.0,0.0,0.0,nan ; banners on freeway overpass ; Weekly pro...,nan ; banners on freeway overpass ; Weekly pro...
2024-07-30,368.0,454.0,411.0,46.0,0.0,0.0,Weekly protest outside office of U.S. Rep. Ada...,Weekly protest outside office of U.S. Rep. Ada...


### Exporting the dataset as a csv

In [335]:
crowd_data.to_csv("palprot24.csv")

In [None]:
data = pd.read_csv("palprot24.csv", index_col=0)

Unnamed: 0_level_0,size_low,size_high,size_mean,arrests,injuries_crowd,injuries_police,participant_measures,police_measures
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-04-01,639.0,1583.0,1111.0,1.0,1.0,0.0,nan ; nan ; round-the-clock encampment to defe...,nan ; nan ; round-the-clock encampment to defe...
2024-04-02,275.0,341.0,308.0,5.0,0.0,0.0,nan ; Weekly protest outside office of U.S. Re...,nan ; Weekly protest outside office of U.S. Re...
2024-04-03,602.0,683.0,642.0,0.0,0.0,0.0,megaphone ; noise demo outside building where ...,megaphone ; noise demo outside building where ...
2024-04-04,986.0,1638.0,1313.0,1.0,0.0,0.0,megaphones; chalked sidewalk ; round-the-clock...,megaphones; chalked sidewalk ; round-the-clock...
2024-04-05,2179.0,2725.0,2451.0,23.0,0.0,0.0,nan ; smashed laptop computer with Intel chip ...,nan ; smashed laptop computer with Intel chip ...
...,...,...,...,...,...,...,...,...
2024-07-27,1663.0,3137.0,2401.0,9.0,0.0,0.0,nan ; megaphones; marched on sidewalks ; ampli...,nan ; megaphones; marched on sidewalks ; ampli...
2024-07-28,964.0,1024.0,994.0,0.0,0.0,0.0,megaphones ; nan ; nan ; nan ; nan ; nan ; nan...,megaphones ; nan ; nan ; nan ; nan ; nan ; nan...
2024-07-29,382.0,447.0,415.0,0.0,0.0,0.0,nan ; banners on freeway overpass ; Weekly pro...,nan ; banners on freeway overpass ; Weekly pro...
2024-07-30,368.0,454.0,411.0,46.0,0.0,0.0,Weekly protest outside office of U.S. Rep. Ada...,Weekly protest outside office of U.S. Rep. Ada...
