Environment for developing a quick method to scan through datasets and see which PPID and PID are included.

In [46]:
from scipy import stats
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

In [2]:
# relative paths to the CSV files used in this notebook
loc1 = 'data/chicagoParole.csv'
loc2 = 'data/Strategic_Subject_List_-_Historical_20240320.csv'

In [26]:
# only the header is needed to list the column names, so skip parsing the data rows
pd.read_csv(loc2, nrows=0).columns

  pd.read_csv(loc2).columns


Index(['SSL SCORE', 'PREDICTOR RAT AGE AT LATEST ARREST',
       'PREDICTOR RAT VICTIM SHOOTING INCIDENTS',
       'PREDICTOR RAT VICTIM BATTERY OR ASSAULT',
       'PREDICTOR RAT ARRESTS VIOLENT OFFENSES',
       'PREDICTOR RAT GANG AFFILIATION', 'PREDICTOR RAT NARCOTIC ARRESTS',
       'PREDICTOR RAT TREND IN CRIMINAL ACTIVITY', 'PREDICTOR RAT UUW ARRESTS',
       'SEX CODE CD', 'RACE CODE CD', 'WEAPON I', 'DRUG I', 'AGE GROUP',
       'AGE TO', 'STOP ORDER NO', 'PAROLEE I', 'LATEST DATE', 'LATEST DIST',
       'MAJORITY DIST', 'DLST', 'LATEST DIST RES', 'WEAPONS ARR CNT',
       'LATEST WEAPON ARR DATE', 'NARCOTICS ARR CNT',
       'LATEST NARCOTIC ARR DATE', 'IDOC RES CITY', 'IDOC RES STATE CODE',
       'IDOC RES ZIP CODE', 'IDOC CPD DIST', 'CPD ARREST I',
       'DOMESTIC ARR CNT', 'LATEST DOMESTIC ARR DATE', 'AGE CURR',
       'SSL LAST PTV DATE', 'TRAP STATUS', 'RAW SSL SCORE', 'HEAT SCORE',
       'RAW_HEAT_SCORE', 'STATUS I', 'PRE RAW HEAT SCORE', 'TRAP FLAGS',
       'SSL FL

In [43]:
def pii_violation(csv_filePath: str, pii_roots = ['Name', 'Date', 'Address', 'Residence', 'Code', 'Number', 'Age', 'Status', 'DOB', 'Year', 'Month', 'Day']):
    """Flag CSV columns whose names contain a word from ``pii_roots``.

    Column names are compared case-insensitively. A column is flagged when any
    of its words (the name split on spaces, underscores and hyphens) equals one
    of the roots.

    Parameters
    ----------
    csv_filePath : str
        Path of the CSV file whose header is inspected.
    pii_roots : list of str, optional
        Words that mark a column as suspected PII. Not modified.

    Returns
    -------
    tuple
        ``(suspected_pii, hitRate)``: the flagged column names in file order
        (original casing), and the fraction of columns flagged rounded to two
        decimals (``0.0`` when there are no columns).
    """
    # only the header row is needed, so do not parse the data rows
    columns = pd.read_csv(csv_filePath, nrows=0).columns

    # lowercase version of whatever was passed in or the default list
    roots = {root.lower() for root in pii_roots}

    # characters treated as word separators inside a column name
    delimiters = (' ', '_', '-')

    # container for the hit columns
    suspected_pii = []

    for col in columns:
        # make sure everything is the same regardless of casing
        lowercol = str(col).lower()

        # words obtained by splitting on each delimiter individually; this keeps
        # roots that themselves contain another delimiter matchable
        words = set()
        for delim in delimiters:
            words.update(lowercol.split(delim))

        # words obtained by splitting on all delimiters at once, so names that
        # mix delimiters (e.g. 'birth_date (utc)') are broken up completely
        normalized = lowercol
        for delim in delimiters:
            normalized = normalized.replace(delim, ' ')
        words.update(normalized.split())

        # check the lower case versions, but append the normal column
        if words & roots:
            suspected_pii.append(col)

    # hits in comparison to the columns (guard against a header with no columns)
    hitRate = round(len(suspected_pii) / len(columns), 2) if len(columns) else 0.0

    return suspected_pii, hitRate

In [54]:
class PIIScan():
    """Load a data file and flag columns whose names suggest PII.

    On construction the file is read, its column names are compared
    (case-insensitively, word by word) against a list of PII root words, and
    the results are stored on the instance.

    Attributes
    ----------
    filePath : the path that was passed in
    fileName : file name without its extension
    fileExtension : extension without the leading dot
    df : the loaded DataFrame
    features : the DataFrame's columns
    roots : lowercased PII root words
    matches : list of flagged column names (original casing)
    hitRate : fraction of columns flagged, rounded to two decimals
    """

    # characters treated as word separators inside a column name
    _DELIMITERS = (' ', '_', '-')

    def __init__(self, filePath: str, pii = ['Name', 'Date', 'Address', 'Residence', 'Code', 'Number', 'Age', 'Status', 'DOB', 'Year', 'Month', 'Day']):
        """Read ``filePath`` and run the PII column scan.

        Parameters
        ----------
        filePath : str
            Path of the file to scan. Only CSV files are supported.
        pii : list of str, optional
            Root words that mark a column as suspected PII. Not modified.

        Raises
        ------
        ValueError
            If the file extension is not supported.
        """
        # save the file path, name, and extension (mainly for report purposes)
        self.filePath = filePath
        self.fileName = Path(filePath).stem
        # Path.suffix only looks at the final path component, so dots in
        # directory names are not mistaken for an extension
        self.fileExtension = Path(filePath).suffix.lstrip('.')

        # get and save the data
        self.df = self._readFile()
        self.features = self.df.columns

        # save the lowercase version of roots passed in or used from default
        self.roots = [root.lower() for root in pii]

        # check for PII hits
        print('Checking for PII Violations...')
        self._piiViolation()

        print('Finished\n')

    def __str__(self):
        """Return a short text report of the scan results."""
        return f' File: {self.fileName} \n File Type: {self.fileExtension} \n Records: {len(self.df)} \n\n Possible PII Matches: {len(self.matches)} \n Hit Rate: {self.hitRate} \n\n Possible Matches: {self.matches} '

    def _readFile(self):
        """Load the file at ``self.filePath`` into a DataFrame.

        Raises
        ------
        ValueError
            If ``self.fileExtension`` is not a supported type. Failing here
            avoids returning ``None`` and crashing later on ``.columns``.
        """
        # compare case-insensitively so e.g. 'DATA.CSV' is accepted
        if self.fileExtension.lower() == 'csv':
            print('Reading CSV...')

            return pd.read_csv(self.filePath)

        raise ValueError(f'Extension not recognized: {self.fileExtension}')

    def _columnWords(self, column):
        """Return the set of lowercase words that make up a column name."""
        lowercol = str(column).lower()

        # split on each delimiter individually; this keeps roots that
        # themselves contain another delimiter matchable
        words = set()
        for delim in self._DELIMITERS:
            words.update(lowercol.split(delim))

        # split on all delimiters at once, so names that mix delimiters
        # (e.g. 'birth_date (utc)') are broken up completely
        normalized = lowercol
        for delim in self._DELIMITERS:
            normalized = normalized.replace(delim, ' ')
        words.update(normalized.split())

        return words

    def _piiViolation(self):
        """Populate ``self.matches`` and ``self.hitRate`` from ``self.features``."""
        roots = set(self.roots)

        # check the lower case versions, but keep the normal column name
        suspected_pii = [col for col in self.features if self._columnWords(col) & roots]

        self.matches = suspected_pii
        # guard against a file with no columns
        total = len(self.features)
        self.hitRate = round(len(suspected_pii) / total, 2) if total else 0.0

In [56]:
# default roots
# reuse the path stored in loc1 instead of repeating the literal
chicagoParole = PIIScan(loc1)
print(chicagoParole)

Reading CSV...
Checking for PII Violations...
Finished

 File: chicagoParole 
 File Type: csv 
 Records: 15630 

 Possible PII Matches: 11 
 Hit Rate: 0.37 

 Possible Matches: ['Name', 'Date of Birth', 'Veteran Status', 'Current Admission Date', 'MSR/Parole Date', 'Projected Discharge Date', 'Custody Date', 'Sentence Date', 'County of Residence', 'Residence Zip Code', 'Age'] 


In [57]:
# scan the second dataset (loc2) with the default roots
other = PIIScan(loc2)

Reading CSV...
Checking for PII Violations...
Finished



  return pd.read_csv(self.filePath)


In [58]:
# show the text report produced by PIIScan.__str__
print(other)

 File: Strategic_Subject_List_-_Historical_20240320 
 File Type: csv 
 Records: 398684 

 Possible PII Matches: 15 
 Hit Rate: 0.31 

 Possible Matches: ['PREDICTOR RAT AGE AT LATEST ARREST', 'SEX CODE CD', 'RACE CODE CD', 'AGE GROUP', 'AGE TO', 'LATEST DATE', 'LATEST WEAPON ARR DATE', 'LATEST NARCOTIC ARR DATE', 'IDOC RES STATE CODE', 'IDOC RES ZIP CODE', 'LATEST DOMESTIC ARR DATE', 'AGE CURR', 'SSL LAST PTV DATE', 'TRAP STATUS', 'STATUS I'] 
