In [115]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set_style('whitegrid')
sns.set_palette('colorblind')

In [233]:
def tidy_nanonets_extractions(filepath, write=False):
    extracted = pd.read_csv(filepath, sep=',')
    if 'col7' in extracted.columns:
        extracted = extracted.drop(['original_filename', 'file_url', 'platform_link', 'col1', 'col7'], axis=1)
    else:
        extracted = extracted.drop(['original_filename', 'file_url', 'platform_link', 'col1'], axis=1)
    extracted.columns = ['Page', 'Confinement Reason', 'Entry', 'Exit', 'Duration', 'Ethnic Origin']
    extracted = extracted[extracted['Confinement Reason'] != 'Confinement Reason']
    extracted['Correctional Facility'] = filepath.split('/')[-1].split('-')[0]
    if write:
        extracted.to_csv(f"nanonets_extracted_csvs/stage1_tidy/{filepath.split('/')[-1]}", sep='\t', index=False)
    else:
        return extracted
# tidy_nanonets_extractions('nanonets_extracted_csvs/cape_breton_correctional_facility-0-pdf.csv', True)
# tidy_nanonets_extractions('nanonets_extracted_csvs/southwest_ns_correctional_facility-0-pdf.csv', True)
# tidy_nanonets_extractions('nanonets_extracted_csvs/northeast_ns_correctional_facility-0-pdf.csv', True)
# tidy_nanonets_extractions('nanonets_extracted_csvs/central_ns_correctional_facility_chunk001-0-pdf.csv', True)
# tidy_nanonets_extractions('nanonets_extracted_csvs/central_ns_correctional_facility_chunk002-0-pdf.csv', True)

# data then manually tidied to remove summary tables in stage2_tidy (and missing last few records added to central chunk2)

In [157]:
def fix_ethnic_origin(x):
    if x not in ['CAU', 'BLA', 'IND', 'ANS', 'OTH', 'UNK', 'ARAB', "ASIA"]:
        x = 'Redacted'
    return x

southwest = pd.read_csv('nanonets_extracted_csvs/stage2_manual_tidy/southwest_ns_correctional_facility-0-pdf.csv', sep='\t')
southwest['Entry'] = pd.to_datetime(southwest['Entry'])
southwest['Exit'] = pd.to_datetime(southwest['Exit'])
southwest['Duration_calc'] = southwest['Exit'] - southwest['Entry']
print(southwest[southwest['Duration'] != southwest['Duration_calc'].dt.days])
southwest = southwest.drop('Duration_calc', axis=1)
southwest['Ethnic Origin'] = southwest['Ethnic Origin'].apply(fix_ethnic_origin)
southwest['Correctional Facility'] = 'Southwest NS Correctional Facility'
southwest = southwest.drop('Page', axis=1)
southwest.to_csv('nanonets_extracted_csvs/stage3_final_clean/southwest_correctional_data.csv', sep='\t', index=False)

Empty DataFrame
Columns: [Page, Confinement Reason, Entry, Exit, Duration, Ethnic Origin, Correctional Facility, Duration_calc]
Index: []


In [297]:
capebreton = pd.read_csv('nanonets_extracted_csvs/stage2_manual_tidy/cape_breton_correctional_facility-0-pdf.csv', sep='\t')
capebreton['Entry'] = pd.to_datetime(capebreton['Entry'])
capebreton['Exit'] = pd.to_datetime(capebreton['Exit'])
capebreton['Duration'] = capebreton['Duration'].str.strip('INLOلا').str.strip().astype(int)
capebreton['Duration_calc'] = (capebreton['Exit'] - capebreton['Entry']).dt.days
capebreton['Ethnic Origin'] = capebreton['Ethnic Origin'].apply(fix_ethnic_origin)
print(capebreton[capebreton['Duration'] != capebreton['Duration_calc']])
capebreton = capebreton.drop('Duration_calc', axis=1)
capebreton['Correctional Facility'] = "Cape Breton Correctional Facility"
capebreton.to_csv('nanonets_extracted_csvs/stage3_final_clean/capebreton_correctional_data.csv', sep='\t', index=False)

Empty DataFrame
Columns: [Page, Confinement Reason, Entry, Exit, Duration, Ethnic Origin, Correctional Facility, Duration_calc]
Index: []


In [298]:
northeast = pd.read_csv('nanonets_extracted_csvs/stage2_manual_tidy/northeast_ns_correctional_facility-0-pdf.csv', sep='\t')
northeast = northeast[~northeast['Duration'].isna()]

northeast_missing_pages = dict(zip([0,1,2,3,4,5,6], [36, 43, 74, 76, 77, 86, 109]))
northeast_missing = pd.read_csv('nanonets_extracted_csvs/stage2_manual_tidy/re_OCR_pages_with_missing_samples/northeast_ns_correctional_facility_missing_pages.csv')
northeast_missing = northeast_missing.drop(['original_filename', 'col1'], axis=1)
northeast_missing.columns = ['Page', 'Confinement Reason', 'Entry', 'Exit', 'Duration', 'Ethnic Origin']
northeast_missing = northeast_missing[northeast_missing['Confinement Reason'] != 'Confinement Reason']
northeast_missing['Page'] = northeast_missing['Page'].apply(lambda x: northeast_missing_pages[x]) - 1

northeast = pd.concat([northeast, northeast_missing])
northeast['Entry'] = pd.to_datetime(northeast['Entry'])
northeast['Exit'] = pd.to_datetime(northeast['Exit'])
northeast['Ethnic Origin'] = northeast['Ethnic Origin'].apply(fix_ethnic_origin)

northeast['Duration'] = northeast['Duration'].str.replace('00', '').str.strip('LOINلیا()').str.replace(" \) 6", "6").str.strip().astype(int)

northeast['Duration_calc'] = (northeast['Exit'] - northeast['Entry']).dt.days
print(northeast[northeast['Duration'] != northeast['Duration_calc']])
northeast['Correctional Facility'] = "Northeast NS Correctional Facility"
northeast.to_csv('nanonets_extracted_csvs/stage3_final_clean/northeast_correctional_data.csv', sep='\t', index=False)

Empty DataFrame
Columns: [Page, Confinement Reason, Entry, Exit, Duration, Ethnic Origin, Correctional Facility, Duration_calc]
Index: []


  northeast['Duration'] = northeast['Duration'].str.replace('00', '').str.strip('LOINلیا()').str.replace(" \) 6", "6").str.strip().astype(int)


In [366]:
central1 = pd.read_csv("nanonets_extracted_csvs/stage2_manual_tidy/central_ns_correctional_facility_chunk001-0-pdf.csv", sep='\t')
central2 = pd.read_csv("nanonets_extracted_csvs/stage2_manual_tidy/central_ns_correctional_facility_chunk002-0-pdf.csv", sep='\t')
central1['Page'].max()
central2['Page'] = central2['Page'] + 200
central = pd.concat([central1, central2])
central = central[central['Confinement Reason'] != 'Entry']
central_page_counts = central["Page"].value_counts()
central_page_counts = central_page_counts[central_page_counts != 16]

central_missing_pages = sorted(set(central_page_counts.index) - set([381]))
print(" ".join([str(x+1) for x in central_missing_pages]))
central_missing_pages = dict(zip(list(range(len(central_missing_pages))), central_missing_pages))
central_missing_pages
#print(set([x for x in range(max(central_page_counts.index))]) - set(central_page_counts.index))


central_missing_df = pd.read_csv('nanonets_extracted_csvs/stage2_manual_tidy/re_OCR_pages_with_missing_samples/central_ns_correctional_facility_missing_pages.csv')
central_missing_df = central_missing_df.drop(['original_filename', 'col1'], axis=1)
central_missing_df.columns = ['Page', 'Confinement Reason', 'Entry', 'Exit', 'Duration', 'Ethnic Origin']
central_missing_df = central_missing_df[central_missing_df['Confinement Reason'] != 'Confinement Reason']
central_missing_df['Page'] = central_missing_df['Page'].apply(lambda x: central_missing_pages[x])

central = pd.concat([central, central_missing_df])
central = central[~central['Entry'].isna()]
central_page_counts = central['Page'].value_counts() 
central['Confinement Reason'].value_counts()


central['Entry'] = pd.to_datetime(central['Entry'])
central['Exit'] = pd.to_datetime(central['Exit'])
central['Ethnic Origin'] = central['Ethnic Origin'].apply(fix_ethnic_origin)
central['Duration'] = central['Duration'].str.replace('8 00', '8').str.replace('00 8', '8').str.replace('\) 6', '6').str.strip('()LOINال').str.strip().astype(int)

central['Duration_calc'] = (central['Exit'] - central['Entry']).dt.days
central[central['Duration'] != central['Duration_calc']]
central['Correctional Facility'] = "Central NS Correctional Facility"
central.to_csv('nanonets_extracted_csvs/stage3_final_clean/central_correctional_data.csv', sep='\t', index=False)

228 259 272 281 283 298 315 324 329 341 355 358 368


  central['Duration'] = central['Duration'].str.replace('8 00', '8').str.replace('00 8', '8').str.replace('\) 6', '6').str.strip('()LOINال').str.strip().astype(int)
