In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the area-level and district-level tables produced by the earlier
# pre-feature-selection step, one frame per CSV, in that order.
area_pre_feature_selection, district_pre_feature_selection = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/pre_training/area_pre_feature_selection.csv',
        '../../data/pre_training/district_pre_feature_selection.csv',
    )
)

In [4]:
# Preview the first rows of the area-level frame as a quick check on the load.
area_pre_feature_selection.head()

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week,area_unemployment,area_per_capita_income,area_no_hs_dip,...,district_crimes_3_hours_prev,district_crimes_6_hours_prev,district_crimes_12_hours_prev,district_crimes_24_hours_prev,area_crimes_this_hour,area_crimes_1_hours_prev,area_crimes_3_hours_prev,area_crimes_6_hours_prev,area_crimes_12_hours_prev,area_crimes_24_hours_prev
0,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.013333,0.060086,0.084677,0.126812,0.00463,0.0,0.0,0.0,0.0,0.0
1,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.013333,0.060086,0.084677,0.126812,0.00463,0.0,0.0,0.0,0.0,0.0
2,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.013333,0.060086,0.084677,0.126812,0.00463,0.0,0.0,0.0,0.0,0.0
3,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.013333,0.060086,0.084677,0.126812,0.00463,0.0,0.0,0.0,0.0,0.0
4,1,2016-01-01 01:00:00,1,1,2016,1,4,0.092179,0.193048,0.272401,...,0.008889,0.008584,0.008065,0.007246,0.00463,0.00463,0.004566,0.004484,0.004348,0.004016


In [3]:
# Features: every column except the per-hour crime value being predicted.
area_features = area_pre_feature_selection.drop(columns=['area_crimes_this_hour'])
district_features = district_pre_feature_selection.drop(columns=['district_crimes_this_hour'])

# Targets keep `year` alongside the predicted value so they can be split by
# year in the same way as the features.
area_target = area_pre_feature_selection.loc[:, ['year', 'area_crimes_this_hour']]
district_target = district_pre_feature_selection.loc[:, ['year', 'district_crimes_this_hour']]

In [4]:
# Chronological split of the area dataset: years before 2020 are used for
# training, 2020 itself is held out for testing.
area_feature_training_data = area_features.loc[area_features['year'].lt(2020)].reset_index(drop=True)
area_feature_testing_data = area_features.loc[area_features['year'].eq(2020)].reset_index(drop=True)

# The targets are split with the same year conditions so rows stay aligned
# with the feature frames after the index reset.
area_target_training_data = area_target.loc[area_target['year'].lt(2020)].reset_index(drop=True)
area_target_testing_data = area_target.loc[area_target['year'].eq(2020)].reset_index(drop=True)

In [5]:
# Chronological split of the district dataset: years before 2020 are used for
# training, 2020 itself is held out for testing.
district_feature_training_data = district_features.loc[district_features['year'].lt(2020)].reset_index(drop=True)
district_feature_testing_data = district_features.loc[district_features['year'].eq(2020)].reset_index(drop=True)

# The targets are split with the same year conditions so rows stay aligned
# with the feature frames after the index reset.
district_target_training_data = district_target.loc[district_target['year'].lt(2020)].reset_index(drop=True)
district_target_testing_data = district_target.loc[district_target['year'].eq(2020)].reset_index(drop=True)

In [6]:
# `year` was only carried on the target frames to drive the train/test split;
# remove it so each target frame holds just the predicted column.
area_target_training_data = area_target_training_data.drop(columns='year')
area_target_testing_data = area_target_testing_data.drop(columns='year')
district_target_training_data = district_target_training_data.drop(columns='year')
district_target_testing_data = district_target_testing_data.drop(columns='year')

In [7]:
# Remove the `date_hour` column from every feature frame.
area_feature_training_data = area_feature_training_data.drop(columns=['date_hour'])
area_feature_testing_data = area_feature_testing_data.drop(columns=['date_hour'])

district_feature_training_data = district_feature_training_data.drop(columns=['date_hour'])
district_feature_testing_data = district_feature_testing_data.drop(columns=['date_hour'])

In [8]:
# target encoding of district/area columns
# The per-id means are fitted on the training years only (year < 2020, the
# same cut used for the train/test split). Fitting on the full dataset would
# fold the 2020 test targets into a feature, i.e. leak the answer.
area_training_rows = area_pre_feature_selection[area_pre_feature_selection['year'] < 2020]
district_training_rows = district_pre_feature_selection[district_pre_feature_selection['year'] < 2020]

area_means = area_training_rows.groupby('area_id')['area_crimes_this_hour'].mean()
district_means = district_training_rows.groupby('district')['district_crimes_this_hour'].mean()

# Fallback for ids that appear in the test year but never in training:
# the overall training mean, instead of leaving NaN in the feature.
area_global_mean = area_training_rows['area_crimes_this_hour'].mean()
district_global_mean = district_training_rows['district_crimes_this_hour'].mean()

area_feature_training_data['area_id_target_encoded'] = area_feature_training_data['area_id'].map(area_means)
area_feature_testing_data['area_id_target_encoded'] = area_feature_testing_data['area_id'].map(area_means).fillna(area_global_mean)

district_feature_training_data['district_target_encoded'] = district_feature_training_data['district'].map(district_means)
district_feature_testing_data['district_target_encoded'] = district_feature_testing_data['district'].map(district_means).fillna(district_global_mean)

In [9]:
# frequency encoding of district/area columns
# Frequencies are fitted on the training split only, so the encoding carries
# no information about how often an id occurs in the held-out test rows.
area_freq = area_feature_training_data['area_id'].value_counts() / len(area_feature_training_data)
district_freq = district_feature_training_data['district'].value_counts() / len(district_feature_training_data)

area_feature_training_data['area_id_freq_encoded'] = area_feature_training_data['area_id'].map(area_freq)
# An id that never occurs in training has a training frequency of zero.
area_feature_testing_data['area_id_freq_encoded'] = area_feature_testing_data['area_id'].map(area_freq).fillna(0.0)

district_feature_training_data['district_freq_encoded'] = district_feature_training_data['district'].map(district_freq)
district_feature_testing_data['district_freq_encoded'] = district_feature_testing_data['district'].map(district_freq).fillna(0.0)

In [10]:
# The raw id columns have been replaced by their target/frequency encodings,
# so drop them. Reassignment is used instead of `inplace=True` to keep the
# data flow explicit, matching how the other cells drop columns.
area_feature_training_data = area_feature_training_data.drop(columns=['area_id'])
area_feature_testing_data = area_feature_testing_data.drop(columns=['area_id'])

district_feature_training_data = district_feature_training_data.drop(columns=['district'])
district_feature_testing_data = district_feature_testing_data.drop(columns=['district'])

In [11]:
def patch_datatypes(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit.

    ``float64`` columns become ``float32`` and ``int64`` columns become
    ``int32``; columns of any other dtype are left alone.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or in an assignment.
    """
    # Floats are handled first, then integers.
    downcasts = (('float64', np.float32), ('int64', np.int32))
    for wide_dtype, narrow_dtype in downcasts:
        wide_cols = df.select_dtypes(include=[wide_dtype]).columns
        df[wide_cols] = df[wide_cols].astype(narrow_dtype)

    return df

In [12]:
# Downcast 64-bit numeric columns to 32-bit, then show (rows, columns).
area_feature_training_data = patch_datatypes(area_feature_training_data)
area_feature_training_data.shape

(10659456, 84)

In [13]:
# Downcast 64-bit numeric columns to 32-bit, then show (rows, columns).
area_feature_testing_data = patch_datatypes(area_feature_testing_data)
area_feature_testing_data.shape

(2670336, 84)

In [14]:
# Downcast 64-bit numeric columns to 32-bit, then show (rows, columns).
district_feature_training_data = patch_datatypes(district_feature_training_data)
district_feature_training_data.shape

(806472, 83)

In [15]:
# Downcast 64-bit numeric columns to 32-bit, then show (rows, columns).
district_feature_testing_data = patch_datatypes(district_feature_testing_data)
district_feature_testing_data.shape

(202032, 83)

##### Selecting a Representative Sample

In [16]:
# Put each training feature frame side by side with its target column so the
# rows can be filtered and sampled as one unit.
area_training_combined = pd.concat(
    (area_feature_training_data, area_target_training_data),
    axis='columns',
)
district_training_combined = pd.concat(
    (district_feature_training_data, district_target_training_data),
    axis='columns',
)

In [17]:
# Boolean flag per row: True when the target value for that hour is above zero.
area_training_combined['crime_status'] = area_training_combined['area_crimes_this_hour'].gt(0)
district_training_combined['crime_status'] = district_training_combined['district_crimes_this_hour'].gt(0)

In [18]:
# Partition each combined frame on the boolean `crime_status` flag:
# `_false` holds rows without the flag, `_true` holds rows with it.
area_training_combined_false = area_training_combined.loc[~area_training_combined['crime_status']].reset_index(drop=True)
area_training_combined_true = area_training_combined.loc[area_training_combined['crime_status']].reset_index(drop=True)

district_training_combined_false = district_training_combined.loc[~district_training_combined['crime_status']].reset_index(drop=True)
district_training_combined_true = district_training_combined.loc[district_training_combined['crime_status']].reset_index(drop=True)

In [19]:
# Sturges' formula to determine the number of bins
def sturges_formula(n):
    """Return the bin count for ``n`` observations: ceil(log2(n) + 1), as an int."""
    raw_bin_count = 1 + np.log2(n)
    return int(np.ceil(raw_bin_count))

In [20]:
def bin_dataframe(df, exempt, bins):
    """Return a copy of ``df`` with its non-exempt columns discretised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be binned.
    exempt : container of column names
        Columns listed here are carried over unchanged.
    bins : int or sequence
        Passed straight to ``pd.cut`` for every non-exempt column.

    Returns
    -------
    pandas.DataFrame
        Same columns, in the same order, as ``df``. Each non-exempt column
        holds the integer bin code from ``pd.cut(..., labels=False)``.
    """
    columns_out = {
        name: (values if name in exempt else pd.cut(values, bins=bins, labels=False))
        for name, values in df.items()
    }
    return pd.DataFrame(columns_out)

In [21]:
# Number of bins for each subset, derived from its row count.
area_false_bins = sturges_formula(area_training_combined_false.shape[0])
area_true_bins = sturges_formula(area_training_combined_true.shape[0])

district_false_bins = sturges_formula(district_training_combined_false.shape[0])
district_true_bins = sturges_formula(district_training_combined_true.shape[0])

In [22]:
# Sizes of the crime-positive subsets; these row counts are the sample sizes
# later drawn from the crime-negative subsets.
print(area_training_combined_true.shape, district_training_combined_true.shape)

(1544337, 86) (288624, 85)


In [23]:
# Discretise the crime-negative area rows. The calendar fields, the status
# flag and the two id encodings are passed through unbinned.
area_training_combined_false_binned = bin_dataframe(
    area_training_combined_false,
    [
        'day',
        'hour',
        'year',
        'month',
        'day_of_week',
        'crime_status',
        'area_id_target_encoded',
        'area_id_freq_encoded',
    ],
    area_false_bins,
)

In [24]:
# One hashable key per row built from its binned values. The helper columns
# are excluded from the key so re-running this cell does not fold a previous
# run's `combined` / `combined_weight` values into it.
area_false_bin_values = area_training_combined_false_binned.drop(columns=['combined', 'combined_weight'], errors='ignore')
area_training_combined_false_binned['combined'] = area_false_bin_values.apply(lambda row: tuple(row), axis=1)

# Share of rows that have each key; a row's sampling weight is the share of
# its own key, so rows with a more common binned profile are drawn more often.
area_combined_false_weight = area_training_combined_false_binned['combined'].value_counts(normalize=True)
area_training_combined_false_binned['combined_weight'] = area_training_combined_false_binned['combined'].apply(lambda x: area_combined_false_weight[x])

# Draw as many crime-negative rows as there are crime-positive rows.
# The sampled frame keeps its original index labels: resetting the index here
# would turn them into 0..n-1, and the `.loc` lookup below would then return
# the first n rows of the source frame instead of the rows that were sampled.
# `random_state` makes the draw reproducible across kernel restarts.
area_training_combined_false_binned_sample = (
    area_training_combined_false_binned
    .sample(
        n=len(area_training_combined_true),
        weights=area_training_combined_false_binned['combined_weight'],
        random_state=42,
    )
    .drop(['combined', 'combined_weight'], axis=1)
)
area_training_false_sample = area_training_combined_false.loc[area_training_combined_false_binned_sample.index]

In [25]:
# Discretise the crime-negative district rows. The calendar fields, the status
# flag and the two id encodings are passed through unbinned.
# The encoded columns are named `district_target_encoded` and
# `district_freq_encoded` where they are created; the exempt list must use
# those exact names, otherwise the exemption silently does not apply.
district_training_combined_false_binned = bin_dataframe(
    district_training_combined_false,
    [
        'day',
        'hour',
        'year',
        'month',
        'day_of_week',
        'crime_status',
        'district_target_encoded',
        'district_freq_encoded',
    ],
    district_false_bins,
)

In [26]:
# One hashable key per row built from its binned values. The helper columns
# are excluded from the key so re-running this cell does not fold a previous
# run's `combined` / `combined_weight` values into it.
district_false_bin_values = district_training_combined_false_binned.drop(columns=['combined', 'combined_weight'], errors='ignore')
district_training_combined_false_binned['combined'] = district_false_bin_values.apply(lambda row: tuple(row), axis=1)

# Share of rows that have each key; a row's sampling weight is the share of
# its own key, so rows with a more common binned profile are drawn more often.
district_combined_false_weight = district_training_combined_false_binned['combined'].value_counts(normalize=True)
district_training_combined_false_binned['combined_weight'] = district_training_combined_false_binned['combined'].apply(lambda x: district_combined_false_weight[x])

# Draw as many crime-negative rows as there are crime-positive rows.
# The sampled frame keeps its original index labels: resetting the index here
# would turn them into 0..n-1, and the `.loc` lookup below would then return
# the first n rows of the source frame instead of the rows that were sampled.
# `random_state` makes the draw reproducible across kernel restarts.
district_training_combined_false_binned_sample = (
    district_training_combined_false_binned
    .sample(
        n=len(district_training_combined_true),
        weights=district_training_combined_false_binned['combined_weight'],
        random_state=42,
    )
    .drop(['combined', 'combined_weight'], axis=1)
)
district_training_false_sample = district_training_combined_false.loc[district_training_combined_false_binned_sample.index]

In [27]:
# Stack the sampled crime-negative rows on top of all crime-positive rows,
# then shuffle so the two groups are interleaved. The shuffle is seeded so
# the saved sample is identical on every Restart & Run All.
area_training_sample = pd.concat([area_training_false_sample, area_training_combined_true])
area_training_sample = area_training_sample.sample(frac=1, random_state=42).reset_index(drop=True)

In [28]:
# Stack the sampled crime-negative rows on top of all crime-positive rows,
# then shuffle so the two groups are interleaved. The shuffle is seeded so
# the saved sample is identical on every Restart & Run All.
district_training_sample = pd.concat([district_training_false_sample, district_training_combined_true])
district_training_sample = district_training_sample.sample(frac=1, random_state=42).reset_index(drop=True)

In [29]:
# Separate the balanced sample back into features and target.
# `crime_status` is computed directly from the target (target > 0) and exists
# only to drive the balancing above. It is dropped with the target: keeping it
# would leak the answer into the features and leave the training features
# with a column the testing features do not have.
area_feature_training_sample = area_training_sample.drop(columns=['area_crimes_this_hour', 'crime_status'])
area_target_training_sample = area_training_sample[['area_crimes_this_hour']]

district_feature_training_sample = district_training_sample.drop(columns=['district_crimes_this_hour', 'crime_status'])
district_target_training_sample = district_training_sample[['district_crimes_this_hour']]

In [None]:
# Write the balanced training sample (features and targets, area and district)
# to CSV without the row index.
for _frame, _csv_path in (
    (area_feature_training_sample, '../../data/pre_training/area_feature_training_sample.csv'),
    (area_target_training_sample, '../../data/pre_training/area_target_training_sample.csv'),
    (district_feature_training_sample, '../../data/pre_training/district_feature_training_sample.csv'),
    (district_target_training_sample, '../../data/pre_training/district_target_training_sample.csv'),
):
    _frame.to_csv(_csv_path, index=False)

In [None]:
# Write the held-out testing frames (features and targets, area and district)
# to CSV without the row index.
for _frame, _csv_path in (
    (area_feature_testing_data, '../../data/pre_training/area_feature_testing_data.csv'),
    (area_target_testing_data, '../../data/pre_training/area_target_testing_data.csv'),
    (district_feature_testing_data, '../../data/pre_training/district_feature_testing_data.csv'),
    (district_target_testing_data, '../../data/pre_training/district_target_testing_data.csv'),
):
    _frame.to_csv(_csv_path, index=False)