## Summary

This notebook is the runnable Python notebook version of `scraped.py`.

It scrapes webpages using the URLs stored in the Cicero dataset.

The webpages will be stored as HTML files in the output directory.


The error log will be stored as `error_log.json` in the output directory,
recording for each failed request the (1) politician ID, (2) URL, (3) politician name, and (4) error (HTTP status code or exception message).

## Import

In [None]:
import collections
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

## Load Data

In [None]:

# Path to your local copy of the Cicero CSV file; fill this in before
# running the loading cell below.
CICERO_DATA_PATH = ""

In [None]:
# load the Cicero dataset
# Rows with the wrong number of fields are skipped with a warning instead of
# aborting the load. `on_bad_lines="warn"` is the replacement for
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where it raises a TypeError).
df = pd.read_csv(CICERO_DATA_PATH, on_bad_lines="warn")



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 13: expected 55 fields, saw 60\nSkipping line 34: expected 55 fields, saw 59\nSkipping line 110: expected 55 fields, saw 60\nSkipping line 125: expected 55 fields, saw 60\nSkipping line 127: expected 55 fields, saw 60\nSkipping line 171: expected 55 fields, saw 60\nSkipping line 175: expected 55 fields, saw 65\nSkipping line 181: expected 55 fields, saw 60\nSkipping line 196: expected 55 fields, saw 60\nSkipping line 197: expected 55 fields, saw 58\nSkipping line 206: expected 55 fields, saw 60\nSkipping line 220: expected 55 fields, saw 60\nSkipping line 261: expected 55 fields, saw 60\nSkipping line 272: expected 55 fields, saw 63\nSkipping line 279: expected 55 fields, saw 60\nSkipping line 353: expected 55 fields, saw 58\nSkipping line 356: expected 55 fields, saw 59\nSkipping line 407: expected 55 fields, saw 57\nSkipping line 429: expected 55 fields, saw 60\nSkipping line 446: expected 55 fields, saw 69\nSkippi

## Scrape and Save

In [None]:
# directory that receives the scraped HTML pages and the error log
output = "./scraped_pages/"

In [None]:
# failures keyed by politician ID; the scraping loop fills each entry with
# the "url", "name" and "error" of the request that failed
error_log = collections.defaultdict(dict)

In [None]:
# make sure the output directory exists
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between checking for the directory and creating it.
os.makedirs(output, exist_ok=True)

In [None]:
# scrape the webpages
#
# For every row of `df`, download the page at `url_1` and save it as
# `<output>/<politician id>.html`. Any failure (missing URL, non-200 status,
# network / parsing / file error) is recorded in `error_log` under the
# politician id and the loop continues with the next row.
REQUEST_TIMEOUT = 2  # seconds, passed to requests.get

for _, row in tqdm(df.iterrows(), total=len(df)):
    # Drop missing (NaN) fields so that .get() returns None for them.
    data_dict = dict(row.dropna())
    politician_id = str(data_dict.get("id", None))
    url = data_dict.get("url_1", None)
    # Join only the name parts that are present: concatenating a missing
    # (None) part used to raise TypeError outside the try block, which
    # aborted the whole scraping run.
    name = " ".join(
        str(part)
        for part in (data_dict.get("first_name"), data_dict.get("last_name"))
        if part is not None
    )

    if url is None:
        # Nothing to request; log it explicitly instead of relying on the
        # exception requests would raise for a None URL.
        error_log[politician_id].update(url=url, name=name,
                                        error="missing url_1")
        continue

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            # os.path.join avoids the doubled "/" when `output` already
            # ends with a separator.
            page_path = os.path.join(output, f"{politician_id}.html")
            with open(page_path, "w", encoding="utf-8") as f:
                f.write(str(soup))
        else:
            error_log[politician_id].update(url=url, name=name,
                                            error=response.status_code)
    except Exception as e:
        # Deliberate catch-all: scraping is best-effort, so any failure for
        # one page is logged and must not stop the remaining downloads.
        error_log[politician_id].update(url=url, name=name, error=str(e))

100%|██████████| 1919/1919 [23:25<00:00,  1.37it/s]


In [None]:
# write the collected failures to error_log.json inside the output directory
with open(f"{output}/error_log.json", mode="w") as log_file:
    log_file.write(json.dumps(error_log))

In [None]:
# display the collected errors as the cell output
error_log

defaultdict(dict,
            {'323290': {'url': 'https://www.cityofvallejo.net/city_hall/city_government/mayor___city_council',
              'name': 'Katy Miessner',
              'error': 404},
             '362375': {'url': 'https://www.supervisorterralawsonremer.com/',
              'name': 'Terra Lawson-Remer',
              'error': "HTTPSConnectionPool(host='www.supervisorterralawsonremer.com', port=443): Read timed out. (read timeout=2)"},
             '342866': {'url': 'http://www.peoriagov.org/city-council/denis-cyr/',
              'name': 'Denis Cyr',
              'error': "HTTPConnectionPool(host='www.peoriagov.org', port=80): Read timed out."},
             '346298': {'url': 'https://members.parliament.uk/member/1605/contact',
              'name': 'Edward Timpson',
              'error': 403},
             '304133': {'url': 'http://www.capitol.tn.gov/senate/members/s6.html',
              'name': 'Becky Massey',
              'error': 404},
             '335134': {'url