# Data Preparation for the Visualizer

The primary purpose of this notebook is to ensure that changes to the cleaned data files (e.g., clean_northwestern.csv) do not automatically affect the visualizer. The copies of the data files used by the visualizer can only be altered by running this notebook.

Additionally, some extra columns may be added that are helpful for the visualizer. These adjustments are not intended to be a step in the data pipeline.

In [1]:
import requests
import pandas as pd

In [2]:
# Load the cleaned Northwestern (nw) and Suffolk (sf) data into dataframes.
# Only these two files are read in this cell.
nw = pd.read_csv('../../data/cleaned/clean_northwestern.csv')
sf = pd.read_csv('../../data/cleaned/clean_suffolk.csv')

# Show every column when a dataframe is displayed. The full registered option
# name is used rather than a pattern that merely happens to match it, so the
# call cannot become ambiguous if pandas adds similarly named options.
pd.set_option("display.max_columns", None)

In [3]:
# Add incident-level columns for Northwest: Incident Sex, Incident Murder.
# Each row receives the maximum `sex` / `murder` value found among the rows
# that share its ('Person ID', 'Offense Date') pair.
nw_incident_keys = ['Person ID', 'Offense Date']
for source_col, incident_col in (('sex', 'Incident Sex'), ('murder', 'Incident Murder')):
    nw[incident_col] = nw.groupby(nw_incident_keys)[source_col].transform('max')

In [4]:
# Replace the string spellings found in `guilty` with Python booleans,
# then display the distinct values that remain as a sanity check.
guilty_spellings = {'0': False, 'False': False, '1': True, 'True': True}
for spelling, as_bool in guilty_spellings.items():
    sf.loc[sf['guilty'] == spelling, 'guilty'] = as_bool
sf['guilty'].unique()

array([True, False], dtype=object)

In [5]:
# Add columns for Suffolk: Incident_Guilty_or_missing, Inc_Expungeable_Attempts_Are, Incident Sex, Incident Murder

# Convert the string spellings of `guilty` to booleans; rows that already hold
# booleans are left untouched by these comparisons.
for spelling, as_bool in (('0', False), ('False', False), ('1', True), ('True', True)):
    sf.loc[sf['guilty'] == spelling, 'guilty'] = as_bool

# Rows with neither a disposition reason nor a disposition get the sentinel -1
# written into `guilty` itself.
no_disposition = sf['Description Disposition Reason'].isnull() & sf['Disposition'].isnull()
sf.loc[no_disposition, 'guilty'] = -1

# Broadcast one aggregate per ('Person ID', 'Offense Date') group onto every row.
# NOTE(review): assuming `guilty` now holds only True/False/-1, 'max' yields -1
# only when every row of the incident is -1 -- confirm that is the intent.
sf_incident_keys = ['Person ID', 'Offense Date']
sf_incident_columns = (
    ('Incident_Guilty_or_missing', 'guilty', 'max'),
    ('Inc_Expungeable_Attempts_Are', 'ExpAtt', 'min'),
    ('Incident Sex', 'sex', 'max'),
    ('Incident Murder', 'murder', 'max'),
)
for incident_col, source_col, reducer in sf_incident_columns:
    sf[incident_col] = sf.groupby(sf_incident_keys)[source_col].transform(reducer)

In [6]:
# Write the updated dataframes to the visualizer CSV files, overwriting any
# existing copies. The row index is not written (index=False).
# `to_csv` returns None when given a path, so the result is not assigned.
nw.to_csv('../../data/cleaned/visualizer_northwestern.csv', index=False)
sf.to_csv('../../data/cleaned/visualizer_suffolk.csv', index=False)