In [1]:
# STEP 1: Create NY subset of data (NYSubmissions.csv)
    # Find all NY Facilities in the CDFC_FacilityCodes.csv and save as a set
    # Use the set to filter ComplaintFilings.csv so that we are left with only complaints made at NY facilities 
    # Save results to NYSubmissions.csv

import pandas as pd

# Load facilities data, filter for NY, and extract facility codes
allfacilities = pd.read_csv('../data/CDFC_FacilityCodes.csv')
nyfacilitycodes = set(allfacilities.loc[allfacilities['State'] == 'NY', 'Facility_Code'])

# Load submissions data and filter where CDFCLEVN is in NY facility codes
allsubmissions = pd.read_csv('../data/ComplaintFilings.csv')
nysubmissions = allsubmissions[allsubmissions['CDFCLEVN'].isin(nyfacilitycodes)]

# Save filtered submissions to a CSV file
nysubmissions.to_csv('../results/NYSubmissions.csv', index=False)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# STEP 2: Create a new enriched dataset (NYSubmissionsEnriched) by adding columns
  # Add column (subcount) that counts total number of submissions associated with the CASENBR
  # Add column (appcount) that counts total number of appeals
  # Add column (earliest_sitdtrcv) that has earliest Remedy Case Submission date received (sitdtrcv) for that CASENBR
  # Add column (latest_sdtstat) that has Date latest status assigned (sdtstat) for that CASENBR
  # Add column (days_between) that has number of days between earliest_sitdtrcv and latest_sdtstat. This is the amount of time either between first complaint and closure OR first complaint and latest appeal

# Load NYComplaintFilings.csv dataset
nysubmissions2 = pd.read_csv(r'../results/NYSubmissions.csv')

# add column that counts the total number of submissions associated with that one CASENBR
counts = nysubmissions2['CASENBR'].value_counts()
nysubmissions2['subcount'] = nysubmissions2['CASENBR'].map(counts)

# Add a column to count the total number of "Rejected" submissions for each CASENBR
rejected_counts = nysubmissions2[nysubmissions2['CDSTATUS'] == 'REJ'].groupby('CASENBR').size()
nysubmissions2['rejected_count'] = nysubmissions2['CASENBR'].map(rejected_counts).fillna(0).astype(int)

# Add a column to count the total number of "Closed Denied" and "Closed Other" submissions for each CASENBR
closed_denied_other_counts = nysubmissions2[nysubmissions2['CDSTATUS'].isin(['CLD', 'CLO'])].groupby('CASENBR').size()
nysubmissions2['closed_denied_other_count'] = nysubmissions2['CASENBR'].map(closed_denied_other_counts).fillna(0).astype(int)

# Add a column to count the total number of "Closed Granted" and "Accepted" submissions for each CASENBR
closed_denied_other_counts = nysubmissions2[nysubmissions2['CDSTATUS'].isin(['CLG', 'ACC'])].groupby('CASENBR').size()
nysubmissions2['closed_granted_accepted_count'] = nysubmissions2['CASENBR'].map(closed_denied_other_counts).fillna(0).astype(int)

# ensure 'sitdtrcv' and 'sdtstat' are in datetime format
nysubmissions2['sitdtrcv'] = pd.to_datetime(nysubmissions2['sitdtrcv'], errors='coerce')
nysubmissions2['sdtstat'] = pd.to_datetime(nysubmissions2['sdtstat'], errors='coerce')

# find the earliest sitdtrcv date for each CASENBR
earliest_dates = nysubmissions2.groupby('CASENBR')['sitdtrcv'].min().reset_index()
earliest_dates.rename(columns={'sitdtrcv': 'earliest_sitdtrcv'}, inplace=True)

# find the latest sdtstat date for each CASENBR
latest_dates = nysubmissions2.groupby('CASENBR')['sdtstat'].max().reset_index()
latest_dates.rename(columns={'sdtstat': 'latest_sdtstat'}, inplace=True)

# merge the earliest and latest dates
merged_dates = pd.merge(earliest_dates, latest_dates, on='CASENBR')

# calculate the number of days between the earliest sitdtrcv and the latest sdtstat
merged_dates['days_between'] = (merged_dates['latest_sdtstat'] - merged_dates['earliest_sitdtrcv']).dt.days

# merge the calculated dates back into the original dataset
nysubmissionsenriched = pd.merge(
    nysubmissions2,
    merged_dates,
    on='CASENBR',
    how='left'  # Retain all original rows
)

# save the updated dataset as NYSubmissionsEnriched.csv
nysubmissionsenriched.to_csv("../results/NYSubmissionsEnriched.csv", index=False)

In [3]:
# STEP 3: Create expanded dataset of NYSubmissionsEnriched.csv with codes translated for easier use 

# import code csvs
complaintcodes = pd.read_csv(r'../data/cdsub1cb_ConcatSubjectCodes.csv')
facilitycodes = pd.read_csv(r'../data/CDFC_FacilityCodes.csv')
statuscodes = pd.read_csv(r'../data/CDSTATUS_CaseStatusCodes.csv')
orglevelcodes = pd.read_csv(r'../data/ITERLVL_OrgLevelCodes.csv')
statusreasoncodes = pd.read_csv(r'../data/STATRSN_StatusReasonCodes.csv')
columncodes = pd.read_csv(r'../data/ColumnCodes.csv')
primarysubjectcodes = pd.read_csv(r'../data/CDSUB1PR _PrimarySubjectCodes.csv')



nysubmissionsrenriched2 = nysubmissionsenriched.copy()

# duplicate the cdsub1cb column to create cdsub1cbTEXT
nysubmissionsrenriched2.loc[:, 'cdsub1cbTEXT'] = nysubmissionsrenriched2['cdsub1cb']

# find and replace
nysubmissionsrenriched2.loc[:, 'cdsub1cbTEXT'] = nysubmissionsrenriched2['cdsub1cbTEXT'].map(complaintcodes.set_index('Code')['Text']).fillna(nysubmissionsrenriched2['cdsub1cbTEXT'])
nysubmissionsrenriched2.loc[:, 'CDFCLEVN'] = nysubmissionsrenriched2['CDFCLEVN'].map(facilitycodes.set_index('Facility_Code')['Facility_Name']).fillna(nysubmissionsrenriched2['CDFCLEVN'])
nysubmissionsrenriched2.loc[:, 'CDFCLRCV'] = nysubmissionsrenriched2['CDFCLRCV'].map(facilitycodes.set_index('Facility_Code')['Facility_Name']).fillna(nysubmissionsrenriched2['CDFCLRCV'])
nysubmissionsrenriched2.loc[:, 'CDOFCRCV'] = nysubmissionsrenriched2['CDOFCRCV'].map(facilitycodes.set_index('Facility_Code')['Facility_Name']).fillna(nysubmissionsrenriched2['CDOFCRCV'])
nysubmissionsrenriched2.loc[:, 'ITERLVL'] = nysubmissionsrenriched2['ITERLVL'].map(orglevelcodes.set_index('Code')['Text']).fillna(nysubmissionsrenriched2['ITERLVL'])
nysubmissionsrenriched2.loc[:, 'CDSTATUS'] = nysubmissionsrenriched2['CDSTATUS'].map(statuscodes.set_index('Code')['Text']).fillna(nysubmissionsrenriched2['CDSTATUS'])
nysubmissionsrenriched2.loc[:, 'STATRSN1'] = nysubmissionsrenriched2['STATRSN1'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(nysubmissionsrenriched2['STATRSN1'])
nysubmissionsrenriched2.loc[:, 'STATRSN2'] = nysubmissionsrenriched2['STATRSN2'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(nysubmissionsrenriched2['STATRSN2'])
nysubmissionsrenriched2.loc[:, 'STATRSN3'] = nysubmissionsrenriched2['STATRSN3'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(nysubmissionsrenriched2['STATRSN3'])
nysubmissionsrenriched2.loc[:, 'STATRSN4'] = nysubmissionsrenriched2['STATRSN4'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(nysubmissionsrenriched2['STATRSN4'])
nysubmissionsrenriched2.loc[:, 'STATRSN5'] = nysubmissionsrenriched2['STATRSN5'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(nysubmissionsrenriched2['STATRSN5'])
nysubmissionsrenriched2.loc[:, 'CDSUB1PR'] = nysubmissionsrenriched2['CDSUB1PR'].map(primarysubjectcodes.set_index('Primary Subject Code')['Primary Subject Code Translation']).fillna(nysubmissionsrenriched2['CDSUB1PR'])


nysubmissionsrenriched2.loc[:, 'sdtdue'] = nysubmissionsrenriched2['sdtdue'].fillna('rejected')

nysubmissionsrenriched2.loc[:, 'accept'] = nysubmissionsrenriched2['accept'].replace({0: 'no', 1: 'yes'})
nysubmissionsrenriched2.loc[:, 'reject'] = nysubmissionsrenriched2['reject'].replace({0: 'no', 1: 'yes'})
nysubmissionsrenriched2.loc[:, 'deny'] = nysubmissionsrenriched2['deny'].replace({0: 'no', 1: 'yes'})
nysubmissionsrenriched2.loc[:, 'grant'] = nysubmissionsrenriched2['grant'].replace({0: 'no', 1: 'yes'})
nysubmissionsrenriched2.loc[:, 'other'] = nysubmissionsrenriched2['other'].replace({0: 'no', 1: 'yes'})
nysubmissionsrenriched2.loc[:, 'submit'] = nysubmissionsrenriched2['submit'].replace({0: 'no', 1: 'yes'})
nysubmissionsrenriched2.loc[:, 'filed'] = nysubmissionsrenriched2['filed'].replace({0: 'no', 1: 'yes'})
nysubmissionsrenriched2.loc[:, 'closed'] = nysubmissionsrenriched2['closed'].replace({0: 'no', 1: 'yes'})

columncodes_dict = dict(zip(columncodes['Code'], columncodes['Text']))

nysubmissionsrenriched2.rename(columns=columncodes_dict, inplace=True)

nysubmissionsrenriched2.to_csv('../results/NYSubmissionsEnrichedExpanded.csv', index=False)

 'DHO/CDC/CONT. HOUSING APPEALS' ... 'SENTENCE COMPUTATION'
 'SENTENCE COMPUTATION' 'CLASSIFICATION MATTERS']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  nysubmissionsrenriched2.loc[:, 'CDSUB1PR'] = nysubmissionsrenriched2['CDSUB1PR'].map(primarysubjectcodes.set_index('Primary Subject Code')['Primary Subject Code Translation']).fillna(nysubmissionsrenriched2['CDSUB1PR'])
  nysubmissionsrenriched2.loc[:, 'accept'] = nysubmissionsrenriched2['accept'].replace({0: 'no', 1: 'yes'})
  nysubmissionsrenriched2.loc[:, 'reject'] = nysubmissionsrenriched2['reject'].replace({0: 'no', 1: 'yes'})
  nysubmissionsrenriched2.loc[:, 'deny'] = nysubmissionsrenriched2['deny'].replace({0: 'no', 1: 'yes'})
  nysubmissionsrenriched2.loc[:, 'grant'] = nysubmissionsrenriched2['grant'].replace({0: 'no', 1: 'yes'})
  nysubmissionsrenriched2.loc[:, 'other'] = nysubmissionsrenriched2['other'].replace({0: 'no', 1: 'yes'})
  nysubmissionsrenriched2.loc[:, 'submit'] = nys

In [4]:
# STEP 4: Create subset of data which only has unique complaints. It will be the record with the most recent status assignment

# load dataset 
nysubmissionedenriched3 = pd.read_csv(r'../results/NYSubmissionsEnriched.csv')

# sort 'sitdtrcv' (submission date) in descending order
nysubmissionedenriched3 = nysubmissionedenriched3.sort_values(by='sitdtrcv', ascending=False)

# Drop duplicate CASENBRs, keeping the most recent (highest sitdtrcv)
unique_nysubmissionedenriched1 = nysubmissionedenriched3.drop_duplicates(subset='CASENBR', keep='first')

unique_nysubmissionedenriched1.to_csv("../results/UniqueNYSubmissionedEnriched.csv", index=False)

In [5]:
# STEP 5: Create expanded dataset of NYSubmissionsEnriched.csv with codes translated for easier use 

# import code csvs
complaintcodes = pd.read_csv(r'../data/cdsub1cb_ConcatSubjectCodes.csv')
facilitycodes = pd.read_csv(r'../data/CDFC_FacilityCodes.csv')
statuscodes = pd.read_csv(r'../data/CDSTATUS_CaseStatusCodes.csv')
orglevelcodes = pd.read_csv(r'../data/ITERLVL_OrgLevelCodes.csv')
statusreasoncodes = pd.read_csv(r'../data/STATRSN_StatusReasonCodes.csv')
columncodes = pd.read_csv(r'../data/ColumnCodes.csv')
primarysubjectcodes = pd.read_csv(r'../data/CDSUB1PR _PrimarySubjectCodes.csv')

unique_nysubmissionsrenriched2 = unique_nysubmissionedenriched1.copy()

# duplicate the cdsub1cb column to create cdsub1cbTEXT
unique_nysubmissionsrenriched2.loc[:, 'cdsub1cbTEXT'] = unique_nysubmissionsrenriched2['cdsub1cb']

# find and replace
unique_nysubmissionsrenriched2.loc[:, 'cdsub1cbTEXT'] = unique_nysubmissionsrenriched2['cdsub1cbTEXT'].map(complaintcodes.set_index('Code')['Text']).fillna(unique_nysubmissionsrenriched2['cdsub1cbTEXT'])
unique_nysubmissionsrenriched2.loc[:, 'CDFCLEVN'] = unique_nysubmissionsrenriched2['CDFCLEVN'].map(facilitycodes.set_index('Facility_Code')['Facility_Name']).fillna(unique_nysubmissionsrenriched2['CDFCLEVN'])
unique_nysubmissionsrenriched2.loc[:, 'CDFCLRCV'] = unique_nysubmissionsrenriched2['CDFCLRCV'].map(facilitycodes.set_index('Facility_Code')['Facility_Name']).fillna(unique_nysubmissionsrenriched2['CDFCLRCV'])
unique_nysubmissionsrenriched2.loc[:, 'CDOFCRCV'] = unique_nysubmissionsrenriched2['CDOFCRCV'].map(facilitycodes.set_index('Facility_Code')['Facility_Name']).fillna(unique_nysubmissionsrenriched2['CDOFCRCV'])
unique_nysubmissionsrenriched2.loc[:, 'ITERLVL'] = unique_nysubmissionsrenriched2['ITERLVL'].map(orglevelcodes.set_index('Code')['Text']).fillna(unique_nysubmissionsrenriched2['ITERLVL'])
unique_nysubmissionsrenriched2.loc[:, 'CDSTATUS'] = unique_nysubmissionsrenriched2['CDSTATUS'].map(statuscodes.set_index('Code')['Text']).fillna(unique_nysubmissionsrenriched2['CDSTATUS'])
unique_nysubmissionsrenriched2.loc[:, 'STATRSN1'] = unique_nysubmissionsrenriched2['STATRSN1'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(unique_nysubmissionsrenriched2['STATRSN1'])
unique_nysubmissionsrenriched2.loc[:, 'STATRSN2'] = unique_nysubmissionsrenriched2['STATRSN2'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(unique_nysubmissionsrenriched2['STATRSN2'])
unique_nysubmissionsrenriched2.loc[:, 'STATRSN3'] = unique_nysubmissionsrenriched2['STATRSN3'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(unique_nysubmissionsrenriched2['STATRSN3'])
unique_nysubmissionsrenriched2.loc[:, 'STATRSN4'] = unique_nysubmissionsrenriched2['STATRSN4'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(unique_nysubmissionsrenriched2['STATRSN4'])
unique_nysubmissionsrenriched2.loc[:, 'STATRSN5'] = unique_nysubmissionsrenriched2['STATRSN5'].map(statusreasoncodes.set_index('Reason Code')['Text']).fillna(unique_nysubmissionsrenriched2['STATRSN5'])
unique_nysubmissionsrenriched2.loc[:, 'CDSUB1PR'] = unique_nysubmissionsrenriched2['CDSUB1PR'].map(primarysubjectcodes.set_index('Primary Subject Code')['Primary Subject Code Translation']).fillna(unique_nysubmissionsrenriched2['CDSUB1PR'])


unique_nysubmissionsrenriched2.loc[:, 'sdtdue'] = unique_nysubmissionsrenriched2['sdtdue'].fillna('rejected')

unique_nysubmissionsrenriched2.loc[:, 'accept'] = unique_nysubmissionsrenriched2['accept'].replace({0: 'no', 1: 'yes'})
unique_nysubmissionsrenriched2.loc[:, 'reject'] = unique_nysubmissionsrenriched2['reject'].replace({0: 'no', 1: 'yes'})
unique_nysubmissionsrenriched2.loc[:, 'deny'] = unique_nysubmissionsrenriched2['deny'].replace({0: 'no', 1: 'yes'})
unique_nysubmissionsrenriched2.loc[:, 'grant'] = unique_nysubmissionsrenriched2['grant'].replace({0: 'no', 1: 'yes'})
unique_nysubmissionsrenriched2.loc[:, 'other'] = unique_nysubmissionsrenriched2['other'].replace({0: 'no', 1: 'yes'})
unique_nysubmissionsrenriched2.loc[:, 'submit'] = unique_nysubmissionsrenriched2['submit'].replace({0: 'no', 1: 'yes'})
unique_nysubmissionsrenriched2.loc[:, 'filed'] = unique_nysubmissionsrenriched2['filed'].replace({0: 'no', 1: 'yes'})
unique_nysubmissionsrenriched2.loc[:, 'closed'] = unique_nysubmissionsrenriched2['closed'].replace({0: 'no', 1: 'yes'})

columncodes_dict = dict(zip(columncodes['Code'], columncodes['Text']))

unique_nysubmissionsrenriched2.rename(columns=columncodes_dict, inplace=True)

unique_nysubmissionsrenriched2.to_csv('../results/UniqueNYSubmissionsEnrichedExpanded.csv', index=False)

 'DHO/CDC/CONT. HOUSING APPEALS' ... 'DHO/CDC/CONT. HOUSING APPEALS'
 'TRANSFER - REQUEST/OBJECTION' 'STAFF/OTHERS - COMPLAINTS']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  unique_nysubmissionsrenriched2.loc[:, 'CDSUB1PR'] = unique_nysubmissionsrenriched2['CDSUB1PR'].map(primarysubjectcodes.set_index('Primary Subject Code')['Primary Subject Code Translation']).fillna(unique_nysubmissionsrenriched2['CDSUB1PR'])
  unique_nysubmissionsrenriched2.loc[:, 'accept'] = unique_nysubmissionsrenriched2['accept'].replace({0: 'no', 1: 'yes'})
  unique_nysubmissionsrenriched2.loc[:, 'reject'] = unique_nysubmissionsrenriched2['reject'].replace({0: 'no', 1: 'yes'})
  unique_nysubmissionsrenriched2.loc[:, 'deny'] = unique_nysubmissionsrenriched2['deny'].replace({0: 'no', 1: 'yes'})
  unique_nysubmissionsrenriched2.loc[:, 'grant'] = unique_nysubmissionsrenriched2['grant'].replace({0: 'no', 1: 'yes'})
  unique_nysubmissionsrenriched2.loc[:, 'other'] = unique