This notebook processes PA docket data that has been downloaded in JSON format and converts it into a CSV with the following columns:

* docket_no: Court docket number (replaced by a salted hash in the final combined output)
* status: Status of this docket
* gender: Offender's gender
* race: Offender's race
* county: County of the court managing this docket
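* offender_id: Hashed identifier for the offender, derived from the offender's name and date of birth (DOB)
* offense_age: Offender's age in years at the time of the offense, computed from the DOB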
* seq_no: Sequential numbering of charges
* statute: Statute code in violation
* grade: Grade of the crime
* statute_description: Statute description
* offense_date: Date of the offense
* description: Most likely the same as statute description
* offense_tracking_no: Tracking number for the offense (for offenses involving multiple offenders)
* disposition: Disposition of the charge
* sentence_date: Sentencing date (if any)
* sentence_start: Start of the sentence to be served (if any)
* sentence_type: Type of sentence meted out (if any)
* sentence_min_pd: Minimum sentence (if any)
* sentence_max_pd: Maximum sentence (if any)

In [5]:
import json
import os
import pandas as pd
import hashlib
from dateutil.relativedelta import relativedelta
from tqdm import tqdm_notebook

In [2]:
def get_bio(json_data):
    """
    Collects the docket-level and defendant-level fields into a flat dict.

    The defendant fields (gender, date of birth, race and name parts) are
    read from the first entry of ``json_data["caseParticipants"]``.
    """
    bio = {}
    bio["docket_no"] = json_data["docketNumber"]
    bio["status"] = json_data["statusName"]

    # Only the first listed participant is consulted
    participant = json_data["caseParticipants"][0]
    bio["gender"] = participant["gender"]
    bio["dob"] = participant["primaryDateOfBirth"]
    bio["race"] = participant["race"]

    full_name = participant["participantName"]
    bio["first_name"] = full_name["firstName"]
    bio["middle_name"] = full_name["middleName"]
    bio["last_name"] = full_name["lastName"]

    bio["county"] = json_data["county"]["name"]
    return bio
    
def get_offenses(json_data):
    """
    Builds a DataFrame with one row per entry of ``json_data["offenses"]``.

    Each row carries the sequence number, statute, grade, statute
    description, offense date, description and offense tracking number.
    """
    # JSON keys, listed in the same order as the output columns below
    source_keys = (
        "sequenceNumber",
        "statuteName",
        "grade",
        "statuteDescription",
        "offenseDate",
        "description",
        "otn",
    )
    rows = [
        tuple(offense[key] for key in source_keys)
        for offense in json_data["offenses"]
    ]
    column_names = ['seq_no', 'statute', 'grade', 'statute_description',
                    'offense_date', 'description', 'offense_tracking_no']
    return pd.DataFrame(rows, columns=column_names)

def get_dispositions(json_data):
    """
    Builds a DataFrame of per-charge dispositions with sentencing details.

    Only the last entry of ``json_data["dispositionEvents"]`` is read. When
    there are no disposition events the result is an empty DataFrame that
    still carries the expected columns.
    """
    column_names = ['seq_no', 'disposition', 'sentence_date', 'sentence_start',
                    'sentence_type', 'sentence_min_pd', 'sentence_max_pd']

    def process_sentencing(sentence_section):
        """Returns the five sentencing fields taken from the last sentence"""
        if len(sentence_section) > 0:
            final_sentence = sentence_section[-1]
            event_date = final_sentence["eventDate"]
            # Only the first sentence type of the last sentence is reported
            first_type = final_sentence["sentenceTypes"][0]
            return (event_date,
                    first_type["startDateTime"],
                    first_type["sentenceType"],
                    first_type["minPeriod"],
                    first_type["maxPeriod"])
        return (None, None, None, None, None)

    events = json_data["dispositionEvents"]
    if len(events) > 0:
        rows = [
            (entry["sequenceNumber"], entry["disposition"])
            + process_sentencing(entry["sentences"])
            for entry in events[-1]["offenseDispositions"]
        ]
    else:
        rows = None
    return pd.DataFrame(rows, columns=column_names)

def offense_age(row):
    """
    Computes the age of the offender at the time of the offense.

    ``row`` is any mapping-like object (e.g. a DataFrame row) exposing
    ``"offense_date"`` and ``"dob"``. Returns the whole number of years
    between the two dates, or None when either date is missing.
    """
    # pd.isna treats None, NaN and NaT alike; an identity test against pd.NaT
    # would let None/NaN through and fail on the .date() call below.
    if pd.isna(row["offense_date"]) or pd.isna(row["dob"]):
        return None

    # Number of complete years between the date of birth and the offense date
    return relativedelta(row["offense_date"].date(), row["dob"].date()).years
    
def get_records(json_data):
    """
    Pieces together all relevant pieces from the docket.

    Returns a DataFrame with one row per charge: the offense fields joined
    with their disposition/sentencing fields, plus the docket-level
    biographical fields repeated on every row. The name and DOB columns are
    replaced by a hashed ``offender_id`` and a computed ``offense_age``.
    """
    # Retrieve components of the data
    bio = get_bio(json_data)  # Biographical information
    off = get_offenses(json_data)  # Charges
    disps = get_dispositions(json_data)  # Disposition of the charges

    # Keep every charge, even when it has no disposition yet
    merged = off.merge(disps, on="seq_no", how="left")

    # Federate out the biographical data so this is de-normalized
    # (reuses the dict fetched above instead of parsing the docket again)
    for k, v in bio.items():
        merged[k] = v

    # Convert date fields into datetime; unparseable values become NaT
    merged["dob"] = pd.to_datetime(merged["dob"], errors="coerce")
    merged["offense_date"] = pd.to_datetime(merged["offense_date"], errors="coerce")
    merged["sentence_date"] = pd.to_datetime(merged["sentence_date"], errors="coerce")

    # Construct a unique ID by hashing the names and DOB (missing parts are skipped)
    # NOTE(review): no salt is applied to this hash - confirm that is acceptable.
    uid_str = "".join(filter(None, (bio["first_name"], bio["middle_name"], bio["last_name"], bio["dob"])))
    merged["offender_id"] = hashlib.sha256(uid_str.encode("utf-8")).hexdigest()[:12]

    # Compute age at time of each offense. A docket without charges gives an
    # empty frame, for which apply(axis=1) returns a DataFrame rather than a
    # Series and cannot be assigned to a single column.
    if merged.empty:
        merged["offense_age"] = None
    else:
        merged["offense_age"] = merged.apply(offense_age, axis=1)

    # Drop sensitive columns
    merged = merged.drop(columns=["first_name", "middle_name", "last_name", "dob"])

    # Re-order columns: docket/offender fields first, charge fields after.
    # Naming the leading columns avoids depending on how many were appended.
    leading = ["docket_no", "status", "gender", "race", "county",
               "offender_id", "offense_age"]
    trailing = [col for col in merged.columns if col not in leading]
    return merged[leading + trailing]

In [None]:
# Directory scanned for the downloaded docket JSON files
input_path = "data/pa_json/"
# Directory that receives the generated CSV files
output_path = "data/output/"
# Per-docket DataFrames collected by process_file until they are written out
appended_data = []

def process_file(json_file):
    """
    Parses one docket JSON file and appends its records to ``appended_data``.

    Any failure while opening, parsing or converting the file is re-raised
    after printing the offending file path.
    """
    try:
        # Decode explicitly as UTF-8 rather than relying on the locale default
        with open(json_file, encoding="utf-8") as f:
            data = json.load(f)
        appended_data.append(get_records(data))
    except Exception:
        # Show which file was being processed before propagating the error
        print(json_file)
        raise
            
# Make sure the destination exists before the first CSV is written
os.makedirs(output_path, exist_ok=True)

# Process every JSON file in input_path, flushing the accumulated records to
# a numbered CSV every 10,000 directory entries to bound memory use.
for i, input_file in enumerate(tqdm_notebook(os.listdir(input_path))):
    if input_file.endswith(".json"):
        process_file(os.path.join(input_path, input_file))
    # pd.concat raises on an empty list, so skip batches that produced nothing
    if i > 0 and i % 10000 == 0 and appended_data:
        df = pd.concat(appended_data)
        df.to_csv(f"{output_path}pa_data_{i}.csv")
        appended_data = []

# Write whatever is left over after the last full batch
if appended_data:
    df = pd.concat(appended_data)
    df.to_csv(f"{output_path}pa_data_{i}.csv")
    appended_data = []

HBox(children=(IntProgress(value=0, max=68481), HTML(value='')))

In [25]:
# Load only the batch CSVs (pa_data_<n>.csv). Other cells write
# docket_mapping.csv and pa_data_all.csv.gz into the same directory, and
# those must not be read back in when this cell is re-run.
batch_files = sorted(
    x for x in os.listdir(output_path)
    if x.startswith("pa_data_") and x.endswith(".csv")
)
pa_data = pd.concat(
    [pd.read_csv(f"{output_path}{x}", low_memory=False) for x in batch_files],
    axis=0,
)

In [40]:
# Create mapping for salted/hashed docket id
import os

# Random per-run salt; it is only held in memory, so the mapping file written
# below is the only link between hashed and real docket numbers.
salt = os.urandom(32)

# Hash each distinct docket number once and map the result onto the rows,
# instead of re-hashing the same docket for every one of its charges.
# The salt is mixed in as raw bytes rather than through its repr() text.
docket_hashes = {
    docket_no: hashlib.sha256(salt + str(docket_no).encode("utf-8")).hexdigest()[:12]
    for docket_no in pa_data["docket_no"].unique()
}
hashed_docket_id = pa_data["docket_no"].map(docket_hashes)

# Persist one (hash, real docket number) pair per docket
docket_map = pd.concat([hashed_docket_id, pa_data["docket_no"]], axis=1)
docket_map.columns = ["hash_docket_no","real_docket_no"]
docket_map.drop_duplicates().to_csv(f"{output_path}docket_mapping.csv")

In [61]:
# Swap the real docket numbers for their hashed counterparts
pa_data["docket_no"] = hashed_docket_id
# Discard the "Unnamed: 0" column, then write the combined data as gzip CSV
pa_data = pa_data.drop(columns=["Unnamed: 0"])
combined_file = f"{output_path}pa_data_all.csv.gz"
pa_data.to_csv(combined_file, compression="gzip")

In [62]:
# Total number of rows in the combined dataset
len(pa_data.index)

11537519