# Global Nonviolent Action Database
- Scraping here: https://nvdatabase.swarthmore.edu/
- Get details for North American campaigns only (U.S., Mexico, Canada).

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import unicodedata
import math
import regex as re
from bs4 import BeautifulSoup
from ast import literal_eval

## Functions

In [2]:
def get_country_slugs(country):
    '''
    Collect the slug of every campaign listed in one country's browse directory.

    `country` is interpolated verbatim into the browse URL built from the
    module-level `root_url`, so it must already be URL-encoded.
    Returns a list holding the 'about' attribute of each <article> element
    found across all directory pages, in page order.
    '''
    listing_url = f'{root_url}/browse/{country}/all/all/all/all'
    first_page = BeautifulSoup(requests.get(listing_url).content, 'lxml')

    # The view header reads "Showing 1-{per_page} of {total_results} results";
    # those two numbers determine how many directory pages exist.
    browse_view = first_page.find('div', {'class': 'view-browse-cases'})
    summary = browse_view.find('div', {'class': 'view-header'}).text.strip()
    total_results = int(re.search(r'(?<= of )\d*?(?= results)', summary)[0])
    per_page = int(re.search(r'(?<=-)\d*?(?= of)', summary)[0])
    page_count = math.ceil(total_results / per_page)

    print('Getting slugs... Directory page # ')

    slugs = []
    # Walk the directory pages; the site numbers them from 0
    for page_number in range(page_count):
        page_url = f'{root_url}/browse/{country}/all/all/all/all?page={page_number}'
        page_soup = BeautifulSoup(requests.get(page_url).content, 'lxml')

        slugs.extend(entry['about'] for entry in page_soup.find_all('article'))

        # Progress is reported 1-based
        print(page_number + 1, end=' ')

        # Pause between requests
        time.sleep(1)

    return slugs

In [3]:
def _field_text(soup, field_class):
    '''
    Return the stripped text of the first 'field--item' <div> inside the <div>
    carrying `field_class`, or None when either element is absent.
    '''
    container = soup.find('div', {'class': field_class})
    if container is None:
        return None
    item = container.find('div', {'class': 'field--item'})
    if item is None:
        return None
    return item.text.strip()


def _field_items(soup, field_class):
    '''
    Return the stripped text of every 'field--item' <div> inside the <div>
    carrying `field_class` (possibly an empty list), or None when that <div>
    is absent.
    '''
    container = soup.find('div', {'class': field_class})
    if container is None:
        return None
    return [item.text.strip()
            for item in container.find_all('div', {'class': 'field--item'})]


def _field_links(soup, field_class):
    '''
    Return the stripped text of every <a> inside the <div> carrying
    `field_class` (possibly an empty list), or None when that <div> is absent.
    '''
    container = soup.find('div', {'class': field_class})
    if container is None:
        return None
    return [link.text.strip() for link in container.find_all('a')]


# (output key, CSS class of the field <div>, extractor) in output order.
# The class names are copied verbatim from the page markup being matched.
_FIELDS_BEFORE_METHODS = (
    ('goals', 'field--name-field-goals', _field_text),
    ('movementcluster', 'field--name-field-movementcluster', _field_text),
    ('time_period', 'field--name-field-startime', _field_text),
    ('country', 'field--name-field-loc-country', _field_items),
    ('location', 'field--name-field-city', _field_text),
    ('location_desc', 'field--name-field-loc-remainder', _field_text),
    ('pcs', 'field--name-field-pcs-tags', _field_links),
)

_FIELDS_BEFORE_GROUPS = (
    ('segment_length', 'field--name-field-segmenth-length', _field_text),
    ('leaders', 'field--name-field-leaders', _field_text),
    ('partners', 'field--name-field-partners', _field_text),
    ('allies', 'field--name-field-external-allies', _field_text),
    ('social_elites', 'field--name-field-involvelemt', _field_text),
    ('opponents', 'field--name-field-opponents', _field_text),
    ('opp_nvresponses', 'field--name-field-nvresponses', _field_text),
    ('campaigner_violence', 'field--name-field-campaignviolence', _field_text),
    ('repressive_violence', 'field--name-field-repressviolence', _field_text),
    ('cluster', 'field--name-field-cluster', _field_items),
    ('classification', 'field--name-field-classification', _field_items),
    ('group', 'field--name-field-groupchar', _field_items),
)

_FIELDS_AFTER_GROUPS = (
    ('success_goal', 'field--name-field-procedure', _field_text),
    ('success_survival', 'field--name-field-survivalgoals', _field_text),
    ('success_growth', 'field--name-field-growth', _field_text),
    ('success_total', 'field--name-field-total-points', _field_text),
    ('success_notes', 'field--name-field-notesoutcome', _field_text),
)


def _collect_fields(soup, specs, record):
    '''
    Run each (key, field_class, extractor) spec against `soup` and store the
    result in `record`; a field that is absent from the page gets no key.
    '''
    for key, field_class, extract in specs:
        value = extract(soup, field_class)
        if value is not None:
            record[key] = value


def _parse_campaign(soup):
    '''
    Build the record dict for one parsed campaign page.
    '''
    campaign = {}

    # The title is the only mandatory element: a page without it raises
    # AttributeError here rather than producing an untitled record.
    campaign['title'] = soup.find('h1', {'class': 'page-header'}
                                  ).find('span').text.strip()

    _collect_fields(soup, _FIELDS_BEFORE_METHODS, campaign)

    # Tactic fields 1-6 are keyed by their number; tactic7 is stored as 'addl'
    methods = {}
    for segment in range(1, 7):
        tactics = _field_links(soup, f'field--name-field-tactic{segment}')
        if tactics is not None:
            methods[segment] = tactics
    additional = _field_links(soup, 'field--name-field-tactic7')
    if additional is not None:
        methods['addl'] = additional
    campaign['methods'] = methods

    _collect_fields(soup, _FIELDS_BEFORE_GROUPS, campaign)

    # Group fields 1-6, keyed by their number
    groups = {}
    for segment in range(1, 7):
        members = _field_items(soup, f'field--name-field-group{segment}')
        if members is not None:
            groups[segment] = members
    campaign['groups'] = groups

    _collect_fields(soup, _FIELDS_AFTER_GROUPS, campaign)

    return campaign


def get_campaign_details(country_slugs):
    '''
    Get details for individual campaigns

    Requests `root_url` + slug for every entry of `country_slugs`, parses the
    page and returns a DataFrame with one row per campaign. A field that is
    absent from a page is simply left out of that row (NaN in the DataFrame);
    only a missing title raises AttributeError. 'methods' and 'groups' are
    dict-valued columns and 'url' holds the address that was requested.
    Progress is printed every fifth campaign, with a one second pause after
    each request.
    '''
    campaigns = []
    total_campaigns = len(country_slugs)

    print(f'Getting {total_campaigns} campaign details... ')
    for position, slug in enumerate(country_slugs, start=1):
        url = f'{root_url}{slug}'
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'lxml')

        campaign = _parse_campaign(soup)
        campaign['url'] = url
        campaigns.append(campaign)

        if position % 5 == 0:
            print(position, end=' ')

        time.sleep(1)
        if position == total_campaigns:
            print('Done.')

    # Save to dataframe
    return pd.DataFrame(campaigns)

def dict_to_cols(df, col):
    '''
    Use to change 'methods' and 'groups' from dict to multiple columns
    https://stackoverflow.com/questions/38231591/split-explode-a-column-of-dictionaries-into-separate-columns-with-pandas

    Every key found in the dictionaries of `col` becomes a column named
    '{col}_{key}', appended after the existing columns; `col` itself is
    dropped. Missing cells (NaN / None) are treated as empty dictionaries.
    Returns a new DataFrame with the same index as `df`; `df` is not modified.
    '''
    # Treat missing cells as empty dicts so every row yields exactly one record
    records = [
        {} if pd.api.types.is_scalar(value) and pd.isna(value) else value
        for value in df[col]
    ]

    # json_normalize numbers its rows 0..n-1, so join against a positionally
    # indexed copy of df and restore the original index afterwards. Joining on
    # df's own index would misplace rows whenever it is not the default one.
    expanded = pd.json_normalize(records).add_prefix(f'{col}_')
    result = df.drop(columns=[col]).reset_index(drop=True).join(expanded)
    result.index = df.index

    return result

def add_missing_cols(df):
    '''
    Add any missing columns and reorder.

    Returns a new DataFrame holding exactly the columns listed below, in that
    order. Columns absent from `df` are created and filled with NaN; columns
    of `df` that are not listed are dropped. `df` itself is not modified.
    '''
    cols = ['title', 'goals', 'movementcluster', 'time_period', 'country', 'location', 'location_desc', 'pcs',
            'methods_1', 'methods_2', 'methods_3', 'methods_4', 'methods_5', 'methods_6', 'methods_addl',
            'segment_length', 'leaders', 'partners', 'allies', 'social_elites',
            'opponents', 'opp_nvresponses', 'campaigner_violence', 'repressive_violence',
            'cluster', 'classification', 'group',
            'groups_1', 'groups_2', 'groups_3', 'groups_4', 'groups_5', 'groups_6',
            'success_goal', 'success_survival', 'success_growth', 'success_total', 'success_notes', 'url']

    # reindex adds, drops and orders columns in one step without writing to df
    return df.reindex(columns=cols)

In [4]:
def campaigns(country_slugs):
    '''
    Scrape the campaigns behind `country_slugs` and return them as a DataFrame
    with the 'methods' and 'groups' dicts expanded into columns and the full,
    ordered column set applied.
    '''
    details = get_campaign_details(country_slugs)
    for dict_col in ('methods', 'groups'):
        details = dict_to_cols(details, dict_col)
    return add_missing_cols(details)

## Get all campaign info

In [5]:
# Base URL of the site; read as a global by get_country_slugs and get_campaign_details
root_url = 'https://nvdatabase.swarthmore.edu'

In [6]:
# Collect the campaign slugs for each North American country.
# The country name goes straight into the browse URL, hence the pre-encoded '%20'.
us_slugs = get_country_slugs('United%20States')
mexico_slugs = get_country_slugs('Mexico')
canada_slugs = get_country_slugs('Canada')

Getting slugs... Directory page # 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 Getting slugs... Directory page # 
1 Getting slugs... Directory page # 
1 2 3 

In [7]:
# Scrape every campaign page and build one DataFrame per country.
# Slow: get_campaign_details sleeps one second after each page request.
us = campaigns(us_slugs)
mexico = campaigns(mexico_slugs)
canada = campaigns(canada_slugs)

Getting 421 campaign details... 
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 225 230 235 240 245 250 255 260 265 270 275 280 285 290 295 300 305 310 315 320 325 330 335 340 345 350 355 360 365 370 375 380 385 390 395 400 405 410 415 420 Done.
Getting 12 campaign details... 
5 10 Done.
Getting 59 campaign details... 
5 10 15 20 25 30 35 40 45 50 55 Done.


## Combine and save to .csv

In [8]:
# Stack the three country frames. Each keeps its own index, so index labels repeat across countries.
all_campaigns = pd.concat([us, mexico, canada])

In [9]:
# Save to .csv
# index=False leaves the (repeating) row labels out of the file
all_campaigns.to_csv('./campaigns.csv', index=False)

## Preview

In [10]:
# Bare expression: the notebook displays the combined DataFrame as the cell output
all_campaigns

Unnamed: 0,title,goals,movementcluster,time_period,country,location,location_desc,pcs,methods_1,methods_2,...,groups_3,groups_4,groups_5,groups_6,success_goal,success_survival,success_growth,success_total,success_notes,url
0,"Granite workers strike, picket, and march agai...","Raise in pay, stopping of paycuts and layoffs",,"1 April, 1933 to 1 June, 1933",[United States],"Barre, Vermont",,[Local Community or Neighborhood-level Campaign],"[016. Picketing, 116. Generalised strike]","[016. Picketing, 116. Generalised strike]",...,"[Stonecutters' Union, Quarry Workers Union]","[Stonecutters' Union, Quarry Workers Union]","[Quarry Workers Union, ACLU, civillians in Bar...","[Quarry Workers Union, civillians in Barre]",4 out of 6 points,1 out of 1 points,1 out of 3 points,6 out of 10 points,,https://nvdatabase.swarthmore.edu/content/gran...
1,University of Kentucky Students Hunger Strike ...,"Originally, 1) Establish and fund a physical B...",,"27 March, 2019 to 2 April, 2019",[United States],"Lexington, Kentucky",University of Kentucky,,"[001. Public speeches, 159. The fast (fast of ...","[001. Public speeches, 159. The fast (fast of ...",...,,,,[President Capilouto],5 out of 6 points,1 out of 1 points,3 out of 3 points,9 out of 10 points,"Students won 7 of the 8 demands, reaching a co...",https://nvdatabase.swarthmore.edu/content/univ...
2,Workers at Harvard University-owned DoubleTree...,Workers at DoubleTree Hilton Hotel Boston dema...,,"11 March, 2013 to 7 April, 2015",[United States],"Boston, Massachusetts",,"[(Mainly or Initiated by) People of Color, (Ma...",[006. Group or mass petitions],"[009. Leaflets, pamphlets, and books, 010. New...",...,,,,[Cambridge United for Justice With Peace],6 out of 6 points,1 out of 1 points,3 out of 3 points,10 out of 10 points,The workers at DoubleTree successfully negotia...,https://nvdatabase.swarthmore.edu/content/work...
3,Vermont Migrant Farmworkers March and Picket f...,Return of pay for three workers who had quit d...,,"15 May, 2014 to 16 May, 2014",[United States],"Ferrisburgh, VT",,[(Mainly or Initiated by) People of Color],[113. Strike by resignation],,...,[Migrant Justice (joined)],[Vermont Public Radio (joined)],,,6 out of 6 points,1 out of 1 points,2 out of 3 points,9 out of 10 points,"Workers received their withheld pay, and the a...",https://nvdatabase.swarthmore.edu/content/verm...
4,Vermont Migrant Farmworkers picket and march f...,-Recognition of Human Rights by Vermont Farmer...,,"23 October, 2014 to 3 October, 2017",[United States],Vermont,,,"[001. Public speeches, 007. Slogans, caricatur...","[016. Picketing, 038. Marches, 047. Assemblies...",...,,,,,6 out of 6 points,1 out of 1 points,2 out of 3 points,9 out of 10 points,,https://nvdatabase.swarthmore.edu/content/verm...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54,Environmentalists defend old forest in Clayoqu...,"To stop the government and logging company, Ma...",,"April, 1993 to October, 1993",[Canada],"Clayoquot Sound, British Columbia",Clayoquot Peace Camp,,"[001. Public speeches, 003. Declarations by or...","[001. Public speeches, 003. Declarations by or...",...,,"[Robert Redford, Oliver Stone, Tom Cruise, Bar...",[publishing house Knopf Canada],[Nuu-chah-nulth First Nation],0 out of 6 points,1 out of 1 points,3 out of 3 points,4 out of 10 points,The protests of clear-cutting in Clayoquot Sou...,https://nvdatabase.swarthmore.edu/content/envi...
55,International campaign against the Multilatera...,Originally the goal was to expand the Multilat...,,"1996 to October, 1998","[Malaysia, United States, Netherlands, Canada,...",,,[Included Innovative Organizational Forms/Comm...,[003. Declarations by organizations and instit...,[003. Declarations by organizations and instit...,...,,,[SalAMI],"[Oxfam, Transnational National Institute and E...",6 out of 6 points,1 out of 1 points,3 out of 3 points,10 out of 10 points,"Though the campaign achieved its stated goal, ...",https://nvdatabase.swarthmore.edu/content/inte...
56,Canadian Quebecois workers general strike for ...,Workers demanded an 8% raise to match inflatio...,,"March 9, 1972 to May 20, 1972",[Canada],Quebec,,"[An Example of Paradox of Repression, Included...",[005. Declarations of indictment and intention...,[005. Declarations of indictment and intention...,...,,,,,2 out of 6 points,1 out of 1 points,3 out of 3 points,6 out of 10 points,Most sectors were successful in raising the mi...,https://nvdatabase.swarthmore.edu/content/cana...
57,Carleton University students win divestment fr...,For the Carleton University student union and ...,,"October, 1985 to March, 1987",[Canada],"Ottawa, Ontario",Carleton University,[(Mainly or Initiated by) Student Participants],[015. Group lobbying],"[015. Group lobbying, 177. Speak-in]",...,,,,,6 out of 6 points,1 out of 1 points,3 out of 3 points,10 out of 10 points,Carleton University President Beckel responded...,https://nvdatabase.swarthmore.edu/content/carl...


Next steps:
- Could get start and end dates from `time_period`. Depending on the earliest / latest dates, can bin by decade, presidency, etc.
- `pcs`: Create dummies AND/OR count of different tags
- `cluster`: count or dummies
- `methods`: Could create just a couple dummies - strike, boycott, sit-in...
- `methods` count: count of total different methods used
- `partners/ allies/ elites`: either dummy or count
- opponents / violence: dummy yes or no
- `success`: Keep only the leading number of each score (e.g. `4 out of 6 points` -> `4`).
- others - get count of number of different groups...

