In [8]:
import collections
import json
import pathlib
import re
from typing import Dict

import requests
from bs4 import BeautifulSoup

from src import utils
from src import generate_directory

## Fetching the contracts

Here we're grabbing the database page we want to parse, parsing it, and pulling out the relevant links and jurisdiction names.

In [2]:
# Fetch the page to scrape via the project's `utils` helper.
# NOTE(review): per the prose above this is the checkthepolice.org database
# page, already parsed — confirm against src/utils.
soup = utils.fetch_check_the_police_soup()
# Presumably maps each jurisdiction to its chunk of page markup — verify in utils.
jurisdiction_data = utils.get_jurisdiction_and_link_soup(soup)
# Presumably resolves each jurisdiction to its PDF links — verify in utils.
# NOTE(review): the next code cell repeats these three calls under different
# names, and `jurisdictions_to_pdf_map` is not referenced again in this notebook.
jurisdictions_to_pdf_map = utils.get_jurisdictions_and_pdf_links(jurisdiction_data)

## Reorganize By State

Here we're going to create the basic structure we want to use to keep track of our data, which should be organized by state and then by jurisdiction, rather than by jurisdiction alone as it is on checkthepolice.org.

In [3]:
# Re-fetch and re-parse the page so this cell does not depend on earlier cells.
soup = utils.fetch_check_the_police_soup()
jd_to_soup_map = utils.get_jurisdiction_and_link_soup(soup)
jd_to_doc_map = utils.get_jurisdictions_and_pdf_links(jd_to_soup_map)
# Merge in the hand-maintained entries; on a key clash the hard-coded entry
# replaces the scraped one (dict.update semantics).
jd_to_doc_map.update(utils.hard_coded_jurisdictions_and_links)

# Re-key the jurisdiction-level map by state.
# NOTE(review): exact output shape is defined in utils — presumably
# state -> jurisdiction -> documents; confirm.
state_jd_pdf_map = utils.transform_from_jd_to_state(jd_to_doc_map)

# Combine the scraped links with whatever utils.load_contract_paths() reports.
# NOTE(review): presumably the contract files already on disk — confirm.
existing_files_map = utils.load_contract_paths()
# `directory` is computed again, from the same inputs, in the next code cell.
directory = generate_directory.merge_dicts(state_jd_pdf_map, existing_files_map)

## Generate the file directory for the frontend

In [4]:
from src.generate_directory import merge_dicts
# Same merge as the previous cell, called through the directly imported name.
directory = merge_dicts(state_jd_pdf_map, existing_files_map)
# Debug aid — uncomment to pretty-print the merged directory:
# print(json.dumps(directory, indent=4))

In [5]:
import pandas as pd
df = pd.read_csv('new_tags.csv', encoding='latin')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form is chained assignment, which warns on
# pandas 2.x and leaves `df` untouched once Copy-on-Write is enabled.
# Missing filenames become '' so the later string slicing does not hit NaN.
df['associated filename'] = df['associated filename'].fillna('')
# One dict per CSV row, keyed by column name.
tags = df.to_dict(orient='records')
print(tags[0])

{'city/state': 'Albuquerque', 'associated filename': 'Police Union Contract.txt', 'expiration': '7/16/15', 'section': '18.1.5', 'language': "Human Resources Department files are a permanent record of an employee's performance with the City of Albuquerque. Such files will not be purged. However, employees who have been cleared of any charges shall not have reference of these charges included in their permanent personnel file.", 'category': 'Erases misconduct records', 'impact': 'Erases records if charges are dropped', 'id': 1, 'State': nan, 'Negotiating_Contract': 'Yes'}


In [18]:
# Attach every tag's category to the directory entry of the document it cites.
for tag in tags:
    # Drop the trailing 4 characters of the filename (the '.txt' extension in
    # the sample row) to get the document name used as the directory key.
    document_name = tag['associated filename'][:-4]

    if 'Bill' in tag['city/state']:
        # State-level document: the state name is everything before the last
        # four words.
        # NOTE(review): assumes the value ends in four words such as
        # "Police Bill of Rights" — confirm against new_tags.csv.
        jurisdiction = 'State'
        # Re-join with spaces so multi-word states keep their name
        # ("New Mexico", not "NewMexico").
        state = ' '.join(tag['city/state'].split(' ')[:-4])
    else:
        jurisdiction = tag['city/state']
        state = utils.CITY_TO_STATE_MAP[tag['city/state']]

    # Raises KeyError when the tag points at a document that is not in the
    # directory, which surfaces mismatches between the CSV and the contracts.
    entry = directory[state][jurisdiction][document_name]

    # Rebuild the set from whatever is stored (nothing, a set, or the list left
    # by a previous run of this cell) so the cell can be re-run safely.
    entry['tags'] = set(entry.get('tags', ())) | {tag['category']}

# Sets are not JSON-serializable; store sorted lists so the generated
# directory is stable from run to run. key=str keeps sorting safe if a
# category is missing (NaN) in the CSV.
for state in directory:
    for jd, files in directory[state].items():
        for file in files:
            if 'tags' in files[file]:
                files[file]['tags'] = sorted(files[file]['tags'], key=str)

{
    "Illinois": {
        "State": {
            "Illinois Police Bill of Rights": {
                "textUrl": "https://raw.githubusercontent.com/PoliceContracts/PoliceContracts/master/Illinois/State/Illinois%20Police%20Bill%20of%20Rights.txt",
                "pdfUrl": "https://www.checkthepolice.org/s/Illinois-PBR.pdf",
                "tags": [
                    "Gives officers unfair access to information",
                    "Restricts/Delays Interrogations"
                ]
            }
        },
        "Chicago": {
            "Police Union Contract Accountability Fact Sheet": {
                "textUrl": "https://raw.githubusercontent.com/PoliceContracts/PoliceContracts/master/Illinois/Chicago/Police%20Union%20Contract%20Accountability%20Fact%20Sheet.txt",
                "pdfUrl": "https://www.dropbox.com/s/2o3dtucoap7fw7h/Chicago%20Police%20Contract%20Police%20Accountability%20Review%207.10.15.pdf?dl=0",
                "tags": [
                    "Gives officers 

## Fixing the jurisdiction scraper

In [75]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

import re

# Root URL of the site being scraped; page URLs are built by appending a path.
BASE_URL = "https://www.checkthepolice.org"

# Splits a heading of the form "<name> <department suffix>...":
#   group 1 - everything before the department suffix
#   group 2 - the suffix that matched (e.g. "Police Department")
# Anything after the suffix is consumed by the trailing ".*" and discarded.
# NOTE(review): group 1 is greedy, so for a heading ending in
# "Metropolitan Police Department" it captures "... Metropolitan" and group 2
# is just "Police Department" — confirm whether that is intended.
CITY_REGEX_PATTERN = re.compile(
    r'(.*) (Metropolitan Police Department|Police Bureau|Sheriff\'s Office|Division of Police|Bureau of Police|Police Department).*'
)


def get_jurisdiction_and_link_soup() -> Dict[str, Tag]:
    """
    Fetch the checkthepolice.org database page and map jurisdiction names
    to the markup that describes them.

    Only the first ``div.sqs-block-content`` element on the page is scanned.
    A child tag is kept when its ``<strong>`` text contains a check mark and
    matches ``CITY_REGEX_PATTERN``; it is stored under the pattern's first
    capture group. Check-marked headings that do not match the pattern are
    reported with ``print('failed', ...)`` and skipped.

    :return: dict mapping jurisdiction name to the BeautifulSoup ``Tag``
        for that jurisdiction
    :raises requests.RequestException: if the request fails, times out or
        returns an HTTP error status
    :raises IndexError: if the page has no ``sqs-block-content`` block
    """
    db_page = f"{BASE_URL}/database"

    # A timeout keeps the notebook from hanging on a stalled connection, and
    # raise_for_status stops us from parsing an error page as if it were data.
    response = requests.get(db_page, timeout=30)
    response.raise_for_status()
    base_page_soup = BeautifulSoup(response.text, 'lxml')

    contract_soups = base_page_soup.find('div', {'class': 'sqs-block-content'})
    if contract_soups is None:
        raise IndexError(f"no 'sqs-block-content' block found on {db_page}")

    jurisdiction_to_soup_map = {}
    for jd in contract_soups.children:
        # Children can be bare text nodes (e.g. whitespace between tags);
        # those have no `.strong` attribute, so skip them.
        if not isinstance(jd, Tag):
            continue
        heading = jd.strong
        if heading is None or '✔️' not in heading.text:
            continue
        match = CITY_REGEX_PATTERN.search(heading.text)
        if match is None:
            print('failed', heading.text)
            continue
        jurisdiction_to_soup_map[match.group(1)] = jd
    return jurisdiction_to_soup_map

In [None]:
# Hand-maintained contract links, one 'Police Union Contract' PDF per
# jurisdiction. Each value below is the file-name stem of that PDF under
# https://www.checkthepolice.org/s/.
# NOTE(review): presumably these are jurisdictions the scraper does not pick
# up on its own — confirm.
_HARD_CODED_CONTRACT_SLUGS = {
    'Honolulu': 'Honolulu-Police-Contract',
    'Kansas City, MO': 'Kansas-City-MO-Police-Contract',
    'Lexington': 'Lexington-Police-Contract',
    'Mesa': 'Mesa-Police-Union-Contract',
    'Toledo': 'Toledo-Police-Contract',
}

# jurisdiction name -> {document name -> PDF URL}
hard_coded_jurisdictions_and_links = {
    jurisdiction_name: {
        'Police Union Contract': f'https://www.checkthepolice.org/s/{pdf_slug}.pdf',
    }
    for jurisdiction_name, pdf_slug in _HARD_CODED_CONTRACT_SLUGS.items()
}

## Split files that are too big

358400

In [72]:
    import glob
    import os

    # Github only indexes files up to 384kb, so setting the max to a conservative 350
    MAX_SIZE = 1024 * 350


    def split_oversized_file(path, max_size=MAX_SIZE):
        """
        Split a text file into numbered sibling files of at most ``max_size`` characters.

        A file with more than ``max_size`` characters is replaced by
        ``"<stem> 1<suffix>"``, ``"<stem> 2<suffix>"``, ... in the same
        directory, and the original is deleted. Smaller files are left alone.

        Size is measured in characters rather than bytes, because that makes
        it easier to be consistent.

        :param path: ``pathlib.Path`` of the file to check
        :param max_size: maximum number of characters per output file
        :return: list of the chunk paths written (empty if nothing was split)
        """
        with path.open() as f:
            full_text = f.read()

        if len(full_text) <= max_size:
            return []

        chunk_paths = []
        # Stepping through the text by max_size yields exactly
        # ceil(len / max_size) chunks, so a length that is an exact multiple
        # of max_size does not produce a trailing empty file.
        for number, starting_index in enumerate(range(0, len(full_text), max_size), start=1):
            # Build the name from stem + suffix rather than replacing '.txt' in
            # the whole path: the chunk name always differs from the original,
            # so a chunk can never overwrite the file that is deleted below.
            chunk_path = path.with_name(f"{path.stem} {number}{path.suffix}")
            with chunk_path.open('w') as f:
                f.write(full_text[starting_index:starting_index + max_size])
            chunk_paths.append(chunk_path)

        # Remove the old file
        path.unlink()
        return chunk_paths


    contract_paths = [pathlib.Path(p) for p in glob.glob('./contracts/*/*/*')]

    print('MAX_SIZE', MAX_SIZE)
    for path in contract_paths:
        split_oversized_file(path)
                

MAX_SIZE 358400


## Zip up files by tag

In [122]:
# Load the generated directory from disk.
directory = json.loads(pathlib.Path('directory.json').read_text())

# tag name -> list of documents carrying that tag (filled in further down)
tags = collections.defaultdict(list)

import dataclasses

from collections import defaultdict

class Directory:
    """
    Holder for the nested directory mapping.

    NOTE(review): the nesting mirrors how ``directory.json`` is walked in this
    notebook (state -> jurisdiction -> document -> properties) — confirm this
    is the intended shape for the class.
    """

    def __init__(self):
        # Create an actual mapping instance; missing states and jurisdictions
        # are created on first access, with a plain dict per jurisdiction.
        self._dir = collections.defaultdict(lambda: collections.defaultdict(dict))

    def write_json(self, filename):
        """
        Write the directory mapping to ``filename`` as indented JSON.

        :param filename: path of the JSON file to create or overwrite
        """
        with open(filename, 'w') as f:
            json.dump(self._dir, f, indent=4)

@dataclasses.dataclass
class Document:
    """A contract text file, identified by state, jurisdiction and document name."""

    state: str
    jurisdiction: str
    # Document name without the ".txt" extension; path() appends it.
    filename: str

    def path(self) -> pathlib.Path:
        """Return the document's location: ./contracts/<state>/<jurisdiction>/<filename>.txt"""
        return pathlib.Path('./contracts/') / self.state / self.jurisdiction / f"{self.filename}.txt"

    @classmethod
    def from_path(cls, path: pathlib.Path) -> "Document":
        """
        Build a Document from a path ending in <state>/<jurisdiction>/<file>.

        This is the inverse of ``path()``: a trailing ".txt" is stripped from
        the file name so that ``Document.from_path(doc.path()) == doc``.

        :param path: path whose last three components are state, jurisdiction
            and file name (any leading components are ignored)
        :raises ValueError: if the path has fewer than three components
        """
        # Use path components rather than splitting on '/', so the result does
        # not depend on the platform's separator or on leading directories.
        parts = pathlib.PurePath(path).parts
        if len(parts) < 3:
            raise ValueError(f"expected <state>/<jurisdiction>/<file>, got {str(path)!r}")
        state, jurisdiction, name = parts[-3:]
        if name.endswith('.txt'):
            name = name[:-len('.txt')]
        return cls(state, jurisdiction, name)

    def get_text(self) -> str:
        """Read and return the full text of the document from disk."""
        with self.path().open() as f:
            return f.read()

# Invert the directory: for every tag, collect the documents that carry it.
for state, jurisdictions in directory.items():
    for jurisdiction, documents in jurisdictions.items():
        for doc_name, properties in documents.items():
            # Untagged documents contribute nothing to the index.
            if 'tags' not in properties:
                continue
            for tag in properties['tags']:
                # A separate Document per (document, tag) pair.
                doc = Document(state, jurisdiction, doc_name)
                tags[tag].append(doc)

In [123]:
# Imported here so the cell does not rely on another cell having imported glob.
import glob
import zipfile

contract_paths = [pathlib.Path(p) for p in glob.glob('./contracts/*/*/*')]

# ZipFile does not create missing parent directories, so make sure the output
# folder exists before writing any archive into it.
pathlib.Path('./tag_archive').mkdir(parents=True, exist_ok=True)

# Write one archive per tag, containing every document that carries the tag.
for tag in tags:
    # '/' in a tag name would be read as a directory separator in the file name.
    zip_file_name = tag.replace('/',' or ')
    zip_file_name = f"./tag_archive/{zip_file_name}.zip"
    print(zip_file_name)
    with zipfile.ZipFile(zip_file_name, 'w') as zipf:
        for document in tags[tag]:
            print(document)
            # Stored under its relative contracts/<state>/<jurisdiction>/ path.
            zipf.write(document.path())

./tag_archive/Gives officers unfair access to information.zip
Document(state='Illinois', jurisdiction='State', filename='Illinois Police Bill of Rights')
Document(state='Illinois', jurisdiction='Chicago', filename='Police Union Contract Accountability Fact Sheet')
Document(state='Illinois', jurisdiction='Chicago', filename='Police Union Contract 1')
Document(state='Illinois', jurisdiction='Chicago', filename='Police Union Contract 2')
Document(state='Arizona', jurisdiction='Phoenix', filename='Police Union Contract')
Document(state='Arizona', jurisdiction='Chandler', filename="sergeant's")
Document(state='Arizona', jurisdiction='Chandler', filename='police officers')
Document(state='Arizona', jurisdiction='Tucson', filename='Police Union Contract')
Document(state='Indiana', jurisdiction='Fort Wayne', filename='Police Union Contract')
Document(state='Indiana', jurisdiction='Indianapolis', filename='Police Union Contract')
Document(state='Tennessee', jurisdiction='Memphis', filename='Pol

## Renaming the contracts and updating the structure

In [10]:
import json
import collections

# Load the generated directory from disk.
with open('directory.json', 'r') as f:
    directory = json.load(f)

# Column layout of one flattened row (same order as the lists built below).
doc = collections.namedtuple("DOCUMENT", ['state', 'jurisdiction', 'document', 'pdf_url', 'text_url', 'title'])

# One row per document; properties missing from the JSON become None.
results = [
    [
        state,
        jurisdiction,
        document,
        data.get('pdfUrl'),
        data.get('textUrl'),
        data.get('title'),
    ]
    for state, jurisdictions in directory.items()
    for jurisdiction, documents in jurisdictions.items()
    for document, data in documents.items()
]

import csv
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, rows are separated by blank lines on Windows.
with open('directory.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Header row; column order matches the rows in `results`.
    writer.writerow(['state', 'jurisdiction', 'document', 'pdf_url', 'text_url', 'title'])
    writer.writerows(results)

In [18]:
from pathlib import Path
base_path = Path('./contracts')
# Rows are [state, jurisdiction, document, ...]: look up each document's text
# file and rename it if it exists.
for result in results:
    path = base_path / result[0] / result[1] / f"{result[2]}.txt"
    if not path.exists():
        continue
    # State-level documents keep their name; everything else gets the
    # jurisdiction prepended.
    new_filename = path.parent / (
        f"{result[2]}.txt" if result[1] == "State" else f"{result[1]} {result[2]}.txt"
    )
    path.rename(new_filename)

In [None]:
with open('directory.csv') as f:
    