In [2]:
import pandas as pd
import datetime

In [6]:
def fill_gaps(df, start='2001-01-01', end='2023-10-10', hours=range(1, 25), areas=range(1, 78)):
    """Expand ``df`` so that every (date, hour, area) combination has a row.

    A complete grid of daily dates x hours x areas is built and outer-merged
    with ``df``. Combinations missing from ``df`` appear with NaN in every
    non-key column; rows of ``df`` that fall outside the grid are kept too
    (outer merge).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``hour`` and ``area`` columns. ``date`` may hold
        ``datetime.date`` objects, timestamps or parseable strings; it is
        converted to datetime64 so it matches the grid's key dtype.
    start, end : str or datetime-like
        Inclusive bounds of the daily date grid.
    hours : iterable of int
        Hour buckets. Defaults to 1..24 because the crime data stores
        ``timestamp.hour + 1``. (The previous ``range(1, 24)`` dropped hour 24.)
    areas : iterable of int
        Area ids. Defaults to 1..77 -- assumes ids match the 77 Chicago
        community areas; TODO confirm against area_reference.csv.
        (The previous ``range(1, 77)`` dropped area 77.)

    Returns
    -------
    pandas.DataFrame
        The merged frame; the caller's ``df`` is not modified.
    """
    # Vectorised cartesian product: same row order as three nested loops,
    # without millions of Python-level strptime calls.
    full_grid = pd.MultiIndex.from_product(
        [pd.date_range(start=start, end=end, freq='D'), list(hours), list(areas)],
        names=['date', 'hour', 'area'],
    ).to_frame(index=False)

    # Merge keys must share a dtype; date objects / strings would not line up
    # with the grid's datetime64 column.
    observed = df.assign(date=pd.to_datetime(df['date']))

    merged_df = full_grid.merge(observed, on=['date', 'hour', 'area'], how='outer')

    return merged_df

In [1]:
def _read_columns(path, columns):
    """Read the CSV at ``path`` and return only ``columns``, in that order."""
    return pd.read_csv(path, usecols=columns)[columns]


def _daily_area_counts(frame, value_column, count_name):
    """Count non-null ``value_column`` entries per (date, area) as ``count_name``."""
    # Normalise to midnight so the counts line up with the daily crime keys
    # even if the source column carries a time component.
    day = pd.to_datetime(frame['date']).dt.normalize()
    return (
        frame.assign(date=day)
        .groupby(['date', 'area'])[value_column]
        .count()
        .reset_index(name=count_name)
    )


def aggregate_data():
    """Build the (date, hour, area) crime table enriched with auxiliary datasets.

    Steps:
      1. Read the cleaned crime data and count incidents per
         (date, hour, area), pivoted by ``type`` (the rename below maps
         type 1 -> 'non-violent' and type 2 -> 'violent').
      2. Fill in missing (date, hour, area) combinations via ``fill_gaps``.
      3. Left-merge area reference attributes, CTA ridership (per date/area),
         Divvy trip counts (per date/hour/area), lighting counts and vacant
         building counts (both per date/area).

    Returns
    -------
    pandas.DataFrame
        Keys ``date`` (datetime64, midnight), ``hour`` (crime data uses 1..24)
        and ``area``; the area reference columns; ``non-violent`` / ``violent``
        counts; ``train_rides``, ``bike_rides``, ``lighting`` and
        ``vacant_buildings``.
    """
    #read in crime data to use as the base for aggregation
    clean_crime = pd.read_csv('../clean_datasets/clean_crime.csv')

    #clean the crime data so the aggregation process is smoother
    #(vectorised parse; dates stay datetime64 so every later merge key matches)
    crime_timestamps = pd.to_datetime(clean_crime['date'], format='%m/%d/%Y %I:%M:%S %p')
    clean_crime['hour'] = crime_timestamps.dt.hour + 1
    clean_crime['date'] = crime_timestamps.dt.normalize()

    print('clean_crime successfully read in')

    #group the crime dataset utilizing a count aggregation
    grouped_crime = clean_crime.groupby(['date', 'hour', 'type', 'area']).size().reset_index(name='count')

    #pivot the violent vs non-violent counts to display across our grouped columns
    grouped_crime = grouped_crime.pivot_table(index=['date', 'hour', 'area'], columns='type', values='count', fill_value=0).reset_index()
    grouped_crime = grouped_crime.rename(columns={1 : "non-violent", 2 : 'violent'})

    print('crime dataset successfully grouped')

    #call fill_gaps function
    grouped_crime = fill_gaps(grouped_crime)
    #guarantee a datetime64 key regardless of what fill_gaps returned
    grouped_crime['date'] = pd.to_datetime(grouped_crime['date'])

    print('fill gaps function successfully ran in')

    #read in the area reference data (select columns from the frame rather
    #than replacing the frame with a list of names)
    area_reference = _read_columns(
        '../scoring_datasets/area_reference.csv',
        ['id','cta_stations','police_stations','bus_stations','unemployment','per_capita_income',
         'no_hs_dip','gov_depend','crowded_housing','below_pov','bike_stations'],
    )

    #merge the area_reference dataset on top of the grouped_crime data
    #(merge returns a new frame, so the result must be kept)
    grouped_crime = grouped_crime.merge(area_reference, left_on='area', right_on='id', how='left')
    grouped_crime = grouped_crime.drop(columns=['id'])

    #reorder the columns so they make more sense
    desired_order = ['date', 'hour', 'area', 'cta_stations',
                 'police_stations', 'bus_stations', 'bike_stations', 'unemployment',
                 'per_capita_income', 'no_hs_dip', 'gov_depend', 'crowded_housing',
                 'below_pov', 'non-violent', 'violent']
    grouped_crime = grouped_crime[desired_order]

    print('area reference data successfully merged')

    #read in cta ridership dataset
    clean_ridership = _read_columns('../clean_datasets/clean_ridership.csv', ['date','area','rides'])

    #match datatypes of columns and then merge the two datasets together
    #NOTE(review): assumes one ridership row per (date, area); duplicates
    #would multiply crime rows in this merge -- confirm against the dataset.
    clean_ridership['date'] = pd.to_datetime(clean_ridership['date'])
    grouped_crime = grouped_crime.merge(clean_ridership, on=['date', 'area'], how='left')

    print('clean ridership data successfully merged')

    #read in divvy bike trips dataset
    clean_divvy_trips = _read_columns(
        '../clean_datasets/clean_divvy_trips.csv',
        ['id','date','station_id','station_name','area'],
    )

    #match datatypes of columns and then group the dataset to make merging together easier
    #(+1 keeps the hour bucket on the same 1..24 scale as the crime data)
    trip_timestamps = pd.to_datetime(clean_divvy_trips['date'])
    clean_divvy_trips['hour'] = trip_timestamps.dt.hour + 1
    clean_divvy_trips['date'] = trip_timestamps.dt.normalize()
    grouped_divvy = clean_divvy_trips.groupby(['date', 'hour', 'area'])['station_id'].agg('count').reset_index()

    #merge the grouped divvy data onto the base crime data
    grouped_crime = grouped_crime.merge(grouped_divvy, on=['date', 'hour', 'area'], how='left')
    grouped_crime = grouped_crime.rename(columns={'station_id' : 'bike_rides', 'rides' : 'train_rides'})

    print('clean divvy trips data successfully merged')

    #read in the lighting data, group it for aggregation, and then merge it on top of the crime data
    clean_lighting = _read_columns('../clean_datasets/clean_lighting.csv', ['date','id','lat','long','status','area'])
    grouped_lighting = _daily_area_counts(clean_lighting, 'lat', 'lighting')
    grouped_crime = grouped_crime.merge(grouped_lighting, on=['date', 'area'], how='left')

    print('clean lighting data successfully merged')

    #read in the vacant buildings data, group it for aggregation, and then merge it on top of the crime data
    clean_vacant_buildings = _read_columns('../clean_datasets/clean_vacant_buildings.csv', ['date', 'id', 'lat', 'long', 'status', 'area'])
    #its own column name: reusing 'lighting' would create a duplicate column
    grouped_vacancies = _daily_area_counts(clean_vacant_buildings, 'long', 'vacant_buildings')
    grouped_crime = grouped_crime.merge(grouped_vacancies, on=['date', 'area'], how='left')

    print('clean vacancies data successfully merged')

    return grouped_crime


In [5]:
# Keep the result in a named variable: the aggregation is expensive, and a bare
# call would only leave the frame in the transient `_` output slot.
aggregated_crime = aggregate_data()
aggregated_crime.head()