<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Set-up-directories" data-toc-modified-id="Set-up-directories-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Set up directories</a></span></li><li><span><a href="#Scrape-html-files-containing-acts" data-toc-modified-id="Scrape-html-files-containing-acts-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Scrape html files containing acts</a></span></li><li><span><a href="#Compute-moving-averages" data-toc-modified-id="Compute-moving-averages-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Compute moving averages</a></span></li></ul></div>

***This notebook scrapes pre-downloaded Wikipedia HTML files to build a table of UK parliamentary acts, flags the acts related to schooling and education, and computes a moving average of the yearly number of such acts.***

In [1]:
from bs4 import BeautifulSoup
import glob
import re
import csv
import pandas as pd
import os

### Set up directories

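Ensure the current working directory (CWD) is the scripts folder (`01_scripts`) of the replication directory; all data paths below are built relative to its parent.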

In [2]:
# Display the current working directory; the data paths defined below are
# derived from its parent folder.
os.getcwd()

'/Volumes/GoogleDrive/My Drive/02_Stanford/00_Researching/16_SocialScientization/-03_HM/00_replication/01_scripts'

In [3]:
# Root of the project: the parent of the current working directory.
# Every path keeps a trailing "/" so file names can be appended directly.
directory = f"{os.path.dirname(os.getcwd())}/"
# Data folder inside the project root.
data = f"{directory}00_data/"
# Folder holding the downloaded act pages (and the files derived from them).
acts_folder = f"{data}03_acts/"

### Scrape html files containing acts

In [4]:
# Collect the unique titles of all links whose `title` attribute names an act,
# i.e. contains "Act" followed by one whitespace character and four digits.
# The pattern is a raw string (so `\s` and `\d` are regex escapes, not invalid
# string escapes) and is compiled once instead of on every file.
act_pattern = re.compile(r"Act\s{1}\d{4}")

acts = set()
files = glob.glob(acts_folder+"*htm")
for file in files:
    # Explicit encoding so the result does not depend on the platform default
    # (the CSV written from these titles is UTF-8 as well).
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
    # NOTE(review): 'lxml-xml' is BeautifulSoup's XML parser; it is kept here so
    # the scraped set stays unchanged -- confirm whether the HTML parser ('lxml')
    # was intended for these pages.
    soup = BeautifulSoup(content, 'lxml-xml')
    for a in soup.find_all("a"):
        title = a.get("title")
        # Links without a title attribute cannot name an act.
        if title is None:
            continue
        if act_pattern.search(title) is not None:
            # Drop the " (page does not exist)" suffix, if present, before
            # storing; the set de-duplicates repeated titles.
            acts.add(title.replace(" (page does not exist)", ""))

In [6]:
# Write one CSV row per act dated before 1915: its year, its title and a 0/1
# flag (`sch`) marking titles that contain "School" or "Educat".
# The year is the first 18xx/19xx number found in the title.
year_pattern = re.compile(r"(?:18|19)\d{2}")

# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate line endings emit an extra "\r".
with open(acts_folder+"sch_acts.csv", "w", encoding='utf-8', newline='') as f:
    f_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    f_writer.writerow(['year', 'act', 'sch'])
    # `acts` is a set, so iterate it sorted to make the row order reproducible
    # from run to run.
    for act in sorted(acts):
        year_match = year_pattern.search(act)
        # Titles without a recognisable 18xx/19xx year are skipped.
        if year_match is None:
            continue
        year = int(year_match.group())
        if year < 1915:
            sch = int(("School" in act) or ("Educat" in act))
            f_writer.writerow([year, act, sch])

### Compute moving averages

In [7]:
# Load the act table from the acts folder (one row per act).
sch_acts_path = acts_folder + "sch_acts.csv"
sch_acts = pd.read_csv(sch_acts_path)

In [8]:
# Split the table by year once, then derive two parallel lists from the
# (year, sub-frame) pairs: the years themselves and, for each year, the sum
# of the `sch` column.
acts_by_year = list(sch_acts.groupby('year'))
years = [yr for yr, _ in acts_by_year]
n_sch_acts = [yearly_acts['sch'].sum() for _, yearly_acts in acts_by_year]

In [9]:
# set averaged lag period
lag = 5
mov_avg = []
for i in range(len(n_sch_acts)+1):
    # keep within range
    if i < lag: 
        mov_avg.append(sum(n_sch_acts[i-i:i+1])/len(n_sch_acts[i-i:i+1]))
    if i > lag:
        mov_avg.append(sum(n_sch_acts[i-lag:i])/len(n_sch_acts[i-lag:i]))

In [10]:
# Pair each year with its moving average in a two-column frame.
moving_averages = pd.DataFrame({'year': years, 'mov_avg': mov_avg})
# Export for Stata next to the scraped data, without the row index.
stata_path = acts_folder + "mov_avg_acts.dta"
moving_averages.to_stata(stata_path, write_index=False)
# Show the dimensions, then the frame itself as the cell output.
print(moving_averages.shape)
moving_averages

(114, 2)


Unnamed: 0,year,mov_avg
0,1801,0.000000
1,1802,0.000000
2,1803,0.333333
3,1804,0.250000
4,1805,0.200000
...,...,...
109,1910,1.800000
110,1911,1.600000
111,1912,1.600000
112,1913,1.800000
