In [65]:
# IMPORTS

import pandas as pd
from datetime import timedelta
from scipy.spatial import cKDTree

In [66]:
# COSTANTI

# dizionario che mappa i nomi dei borough ai loro acronimi
ACRONYMS = {'BRONX': 'BX',
            'BROOKLYN': 'BK',
            'MANHATTAN': 'MN',
            'QUEENS': 'QN',
            'STATEN ISLAND': 'SI'}

# dizionario che mappa i codici dei distretti alle sub-borough area
DISTRICTS_MAP = {'BK01': 'Williamsburg/Greenpoint',
                 'BK02': 'Brooklyn Heights/Fort Greene',
                 'BK03': 'Bedford Stuyvesant',
                 'BK04': 'Bushwick',
                 'BK05': 'East New York/Starrett City',
                 'BK06': 'Park Slope/Carroll Gardens',
                 'BK07': 'Sunset Park',
                 'BK08': 'North Crown Heights/Prospect Heights',
                 'BK09': 'South Crown Heights',
                 'BK10': 'Bay Ridge',
                 'BK11': 'Bensonhurst',
                 'BK12': 'Borough Park',
                 'BK13': 'Coney Island',
                 'BK14': 'Flatbush',
                 'BK15': 'Sheepshead Bay/Gravesend',
                 'BK16': 'Brownsville/Ocean Hill',
                 'BK17': 'East Flatbush',
                 'BK18': 'Flatlands/Canarsie',
                 'BX01': 'Mott Haven/Hunts Point',
                 'BX02': 'Mott Haven/Hunts Point',
                 'BX03': 'Morrisania/Belmont',
                 'BX04': 'Highbridge/South Concourse',
                 'BX05': 'University Heights/Fordham',
                 'BX06': 'Morrisania/Belmont',
                 'BX07': 'Kingsbridge Heights/Mosholu',
                 'BX08': 'Riverdale/Kingsbridge',
                 'BX09': 'Soundview/Parkchester',
                 'BX10': 'Throgs Neck/Co-op City',
                 'BX11': 'Pelham Parkway',
                 'BX12': 'Williamsbridge/Baychester',
                 'MN01': 'Greenwich Village/Financial District',
                 'MN02': 'Greenwich Village/Financial District',
                 'MN03': 'Lower East Side/Chinatown',
                 'MN04': 'Chelsea/Clinton/Midtown',
                 'MN05': 'Chelsea/Clinton/Midtown',
                 'MN06': 'Stuyvesant Town/Turtle Bay',
                 'MN07': 'Upper West Side',
                 'MN08': 'Upper East Side',
                 'MN09': 'Morningside Heights/Hamilton Heights',
                 'MN10': 'Central Harlem',
                 'MN11': 'East Harlem',
                 'MN12': 'Washington Heights/Inwood',
                 'QN01': 'Astoria',
                 'QN02': 'Sunnyside/Woodside',
                 'QN03': 'Jackson Heights',
                 'QN04': 'Elmhurst/Corona',
                 'QN05': 'Middle Village/Ridgewood',
                 'QN06': 'Rego Park/Forest Hills',
                 'QN07': 'Flushing/Whitestone',
                 'QN08': 'Hillcrest/Fresh Meadows',
                 'QN09': 'Ozone Park/Woodhaven',
                 'QN10': 'South Ozone Park/Howard Beach',
                 'QN11': 'Bayside/Little Neck',
                 'QN12': 'Jamaica',
                 'QN13': 'Queens Village',
                 'QN14': 'Rockaways',
                 'SI01': 'North Shore',
                 'SI02': 'Mid-Island',
                 'SI03': 'South Shore'}

DEGREES_CONVERSION_CONST = 111139

In [67]:
# PREPROCESSING DEL DATASET SULLE `REQUESTS`

def preprocess_requests_data():
    """
    Esegue il preprocessing iniziale del dataset '311-2023-05.csv',
    ottenuto precedentemente dalla selezione delle richieste di servizio 311
    fatte nel mese di maggio dell'anno 2023, da cui sono state inoltre
    eliminate colonne a valori per la maggior parte nulli.
    """
    df = pd.read_csv('datasets/311-2023-05.csv')

    cols_to_normalize = ['Complaint Type', 'Descriptor', 'Incident Address',
                         'Location Type', 'Borough', 'Status', 'Channel Type']
    
    normalize_string_spaces = lambda s: ' '.join(s.split())
    def transform_district_codes(s):
        number, borough = s.split(' ', 1)
        return f"{ACRONYMS.get(borough)}{number}"
    
    df.drop(columns=['Street Name', 'Vehicle Type', 'Taxi Company Borough'], inplace=True)
    df.rename(columns={'Community Board': 'Sub-Borough Area',
                       'Open Data Channel Type': 'Channel Type'}, inplace=True)

    df = df.dropna(subset=['Location'])
    for col in df.columns:
        df[col] = df[col].fillna('unknown')
    
    for col in cols_to_normalize:
        df[col] = df[col].apply(normalize_string_spaces)
        df[col] = df[col].str.lower()
    
    df['Sub-Borough Area'] = df['Sub-Borough Area'].apply(transform_district_codes).map(DISTRICTS_MAP).str.lower()

    df[['Latitude', 'Longitude']] = df['Location'].str.extract(r'\(([^,]+), ([^,]+)\)').astype(float)
    df.drop(columns=['Location'], inplace=True)

    df['Created Date'] = pd.to_datetime(df['Created Date'])
    df = df[(df['Created Date'] >= '2023-05-23') & (df['Created Date'] <= '2023-05-30')]

    df.to_csv('datasets/311-2023-05-v2.csv', index=False)

In [68]:
preprocess_requests_data()

  df = pd.read_csv('datasets/311-2023-05.csv')


In [69]:
# COSTANTI

# lista dei path dei dataset sui distretti
DISTRICTS_DATA_PATHS = ['datasets/district-incomedistribution.csv',
                        'datasets/district-povertyrate.csv',
                        'datasets/district-racecomposition.csv',
                        'datasets/district-crimerate.csv']

# lista delle coppie di distretti i cui dati sono ripetuti
DISTRICTS_TO_COLLAPSE = [('BX01', 'BX02'),
                         ('BX03', 'BX06'),
                         ('MN01', 'MN02'),
                         ('MN04', 'MN05')]

# lista delle coppie (path, rinominazione della colonna 2021) per ciascuno dei dataset sui sub-borough
SUBBOROUGHS_DATA_INFO = [('datasets/sub-borougharea-populationdensity1000personspersquaremile.csv', 'Population Density'),
                         ('datasets/sub-borougharea-populationaged65.csv', 'Population Aged 65+'),
                         ('datasets/sub-borougharea-borninnewyorkstate.csv', 'NYS Born People'),
                         ('datasets/sub-borougharea-foreign-bornpopulation.csv', 'Foreign Born People'),
                         ('datasets/sub-borougharea-disabledpopulation.csv', 'Disabled People'),
                         ('datasets/sub-borougharea-unemploymentrate.csv', 'Unemployment Rate'),
                         ('datasets/sub-borougharea-car-freecommuteofcommuters.csv', 'Car-Free Commuters'),
                         ('datasets/sub-borougharea-householdswithchildrenunder18yearsold.csv', 'Families with Children'),
                         ('datasets/sub-borougharea-populationaged25withabachelorsdegreeorhigher.csv', 'People O25 with Bachelor'),
                         ('datasets/sub-borougharea-populationaged25withoutahighschooldiploma.csv', 'People O25 without Diploma')]

In [70]:
# PREPROCESSING DEL DATASET SUI `SUBBOROUGHS`

def preprocess_subboroughs_data():
    """
    Esegue il preprocessing dei dataset sulle sub-borough area,
    ognuno contenente uno o più attributi di interesse. I dataset
    sono poi joinati tra loro per ottenere un unico dataset sui sub-boroughs.
    """
    df_incomedistribution = pd.read_csv(DISTRICTS_DATA_PATHS[0])
    df_povertyrate = pd.read_csv(DISTRICTS_DATA_PATHS[1])
    df_racecomposition = pd.read_csv(DISTRICTS_DATA_PATHS[2])
    df_crimerate = pd.read_csv(DISTRICTS_DATA_PATHS[3])

    df_incomedistribution.rename(columns={'year': 'Year'}, inplace=True)
    df_incomedistribution = df_incomedistribution[df_incomedistribution['Year'] == '2018-2022']

    def process_districts_data(df, cols_to_drop, rename_columns):
        df = df.copy()
        df = df.iloc[6:]
        df.drop(columns=cols_to_drop, inplace=True)
        df.rename(columns={'Geography': 'Community District'}, inplace=True)
        df['Community District'] = df['Community District'].apply(lambda s: ''.join(s.split()))
        df.rename(columns=rename_columns, inplace=True)
        return df
    
    cols_to_drop = ['Name', 'Level', 'Year']
    df_crimerate = process_districts_data(df_crimerate, cols_to_drop, {'property_crime_rate': 'Property Crime Rate',
                                                                       'violent_crime_rate': 'Violent Crime Rate'})
    df_incomedistribution = process_districts_data(df_incomedistribution, cols_to_drop, {'<=$20,000': 'Low Income Population',
                                                                                         '$20,001-$40,000': 'Medium-Low Income Population',
                                                                                         '$40,001-$60,000': 'Medium Income Population',
                                                                                         '$60,001-$100,000': 'Medium-High Income Population',
                                                                                         '$100,001-$250,000': 'High Income Population',
                                                                                         '>$250,000': 'Very High Income Population'})
    df_povertyrate = process_districts_data(df_povertyrate, cols_to_drop, {'poverty_rate': 'Poverty Rate'})
    df_racecomposition = process_districts_data(df_racecomposition, cols_to_drop, {'pop_hispanic_pct': 'Hispanic Population',
                                                                                   'pop_non_hispanic_asian_pct': 'Asian Population',
                                                                                   'pop_non_hispanic_black_pct': 'Black Population',
                                                                                   'pop_non_hispanic_white_pct': 'White Population'})
    
    df_join = pd.merge(df_crimerate, df_incomedistribution, on='Community District', how='inner')
    df_join = pd.merge(df_join, df_povertyrate, on='Community District', how='inner')
    df_join = pd.merge(df_join, df_racecomposition, on='Community District', how='inner')

    for i, col in enumerate(df_join.columns):
        if i > 2:
            df_join[col] = df_join[col].str.replace('%', '').astype(float)
    
    col_pc, col_vc = 'Property Crime Rate', 'Violent Crime Rate'
    for dist in DISTRICTS_TO_COLLAPSE:
        cond1, cond2 = df_join['Community District'] == dist[0], df_join['Community District'] == dist[1]

        new_pc = (df_join.loc[cond1, col_pc].values[0] + df_join.loc[cond2, col_pc].values[0]) / 2
        new_vc = (df_join.loc[cond1, col_vc].values[0] + df_join.loc[cond2, col_vc].values[0]) / 2
        df_join.loc[cond1 | cond2, col_pc] = round(new_pc, 2)
        df_join.loc[cond1 | cond2, col_vc] = round(new_vc, 2)
    
    df_join['Community District'] = df_join['Community District'].map(DISTRICTS_MAP)
    df_join.rename(columns={'Community District': 'Sub-Borough Area'}, inplace=True)
    df_join = df_join.drop_duplicates()

    cols_to_drop = [str(year) for year in range(2000, 2021)]
    cols_to_drop.extend(['short_name', 'long_name'])

    for info in SUBBOROUGHS_DATA_INFO:
        df = pd.read_csv(info[0])
        df.drop(columns=cols_to_drop, errors='ignore', inplace=True)
        df.rename(columns={'2021': info[1]}, inplace=True)
        df_join = pd.merge(df_join, df, on='Sub-Borough Area', how='inner')
    
    subboroughs_cols = [col[1] for col in SUBBOROUGHS_DATA_INFO]
    for i, col in enumerate(subboroughs_cols):
        if i > 1:
            df_join[col] = df_join[col].astype(float) * 100
        df_join[col] = df_join[col].round(2)
    
    df_join['Sub-Borough Area'] = df_join['Sub-Borough Area'].str.lower()
    df_join.to_csv('datasets/subboroughs-ny.csv', index=False)

In [71]:
preprocess_subboroughs_data()