In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
area_pre_feature_selection = pd.read_csv('../../data/pre_training/area_pre_feature_selection.csv')
district_pre_feature_selection = pd.read_csv('../../data/pre_training/district_pre_feature_selection.csv')

In [3]:
area_features = area_pre_feature_selection.drop('area_crimes_this_hour', axis=1)
district_features = district_pre_feature_selection.drop('district_crimes_this_hour', axis=1)

area_target = area_pre_feature_selection[['year', 'area_crimes_this_hour']]
district_target = district_pre_feature_selection[['year', 'district_crimes_this_hour']]

In [4]:
# break the area dataset into testing and training datasets
area_feature_training_data = area_features[area_features['year'] < 2020].reset_index(drop=True)
area_feature_testing_data = area_features[area_features['year'] == 2020].reset_index(drop=True)

area_target_training_data = area_target[area_target['year'] < 2020].reset_index(drop=True)
area_target_testing_data = area_target[area_target['year'] == 2020].reset_index(drop=True)

In [5]:
# break the district dataset into testing and training datasets
district_feature_training_data = district_features[district_features['year'] < 2020].reset_index(drop=True)
district_feature_testing_data = district_features[district_features['year'] == 2020].reset_index(drop=True)

district_target_training_data = district_target[district_target['year'] < 2020].reset_index(drop=True)
district_target_testing_data = district_target[district_target['year'] == 2020].reset_index(drop=True)

In [6]:
area_target_training_data = area_target_training_data.drop('year', axis=1)
area_target_testing_data = area_target_testing_data.drop('year', axis=1)
district_target_training_data = district_target_training_data.drop('year', axis=1)
district_target_testing_data = district_target_testing_data.drop('year', axis=1)

In [7]:
area_feature_training_data = area_feature_training_data.drop('date_hour', axis=1)
area_feature_testing_data = area_feature_testing_data.drop('date_hour', axis=1)

district_feature_training_data = district_feature_training_data.drop('date_hour', axis=1)
district_feature_testing_data = district_feature_testing_data.drop('date_hour', axis=1)

In [8]:
# target encoding of district/area columns
area_means = area_pre_feature_selection.groupby('area_id')['area_crimes_this_hour'].mean()
district_means = district_pre_feature_selection.groupby('district')['district_crimes_this_hour'].mean()

area_feature_training_data['area_id_target_encoded'] = area_feature_training_data['area_id'].map(area_means)
area_feature_testing_data['area_id_target_encoded'] = area_feature_testing_data['area_id'].map(area_means)

district_feature_training_data['district_target_encoded'] = district_feature_training_data['district'].map(district_means)
district_feature_testing_data['district_target_encoded'] = district_feature_testing_data['district'].map(district_means)

In [9]:
# frequency encoding of district/area columns
area_freq = area_pre_feature_selection['area_id'].value_counts() / len(area_pre_feature_selection)
district_freq = district_pre_feature_selection['district'].value_counts() / len(district_pre_feature_selection)

area_feature_training_data['area_id_freq_encoded'] = area_feature_training_data['area_id'].map(area_freq)
area_feature_testing_data['area_id_freq_encoded'] = area_feature_testing_data['area_id'].map(area_freq)

district_feature_training_data['district_freq_encoded'] = district_feature_training_data['district'].map(district_freq)
district_feature_testing_data['district_freq_encoded'] = district_feature_testing_data['district'].map(district_freq)

In [10]:
area_feature_training_data.drop('area_id', axis=1, inplace=True)
area_feature_testing_data.drop('area_id', axis=1, inplace=True)

district_feature_training_data.drop('district', axis=1, inplace=True)
district_feature_testing_data.drop('district', axis=1, inplace=True)

In [11]:
def patch_datatypes(df):
    float_cols = df.select_dtypes(include=['float64']).columns
    df[float_cols] = df[float_cols].astype(np.float32)

    int_cols = df.select_dtypes(include=['int64']).columns
    df[int_cols] = df[int_cols].astype(np.int32)    
      
    return df

In [12]:
area_feature_training_data = patch_datatypes(area_feature_training_data)
area_feature_training_data.shape

(10659456, 84)

In [13]:
area_feature_testing_data = patch_datatypes(area_feature_testing_data)
area_feature_testing_data.shape

(2670336, 84)

In [14]:
district_feature_training_data = patch_datatypes(district_feature_training_data)
district_feature_training_data.shape

(806472, 83)

In [15]:
district_feature_testing_data = patch_datatypes(district_feature_testing_data)
district_feature_testing_data.shape

(202032, 83)

##### Selecting a Representative Sample

In [17]:
area_training_combined = pd.concat([area_feature_training_data, area_target_training_data], axis=1)
district_training_combined = pd.concat([district_feature_training_data, district_target_training_data], axis=1)

In [138]:
area_training_combined['crime_status'] = area_training_combined['area_crimes_this_hour'] > 0
district_training_combined['crime_status'] = district_training_combined['district_crimes_this_hour'] > 0

In [155]:
# Sturges' formula to determine the number of bins
def sturges_formula(n):
    return int(np.ceil(np.log2(n) + 1))

In [161]:
def bin_dataframe(df, exempt, bins):
    binned_df = pd.DataFrame()
    for col in df.columns:
        if col not in exempt:
            binned_df[col] = pd.cut(df[col], bins=bins, labels=False)
        else:
            binned_df[col] = df[col]
    
    return binned_df

In [162]:
area_bins = sturges_formula(len(area_training_combined))

In [164]:
area_training_combined_binned = bin_dataframe(area_training_combined, ['day', 'hour', 'year', 'month', 'day_of_week', 'crime_status', 'area_id_target_encoded', 'area_id_freq_encoded'], area_bins)

- Group by Crime Status
- Extract ALL or as many as possible True Crimes
- Match the amount of False Crimes randomly
- Use oversampling + Gaussian Noise if the amount of True Crimes is not nearly enough (needs a lot more)
- Use Gaussian Noise to add more and mix up some of them
- Use CNN + ENN to clean up the dataset
- Possibly revisit oversampling/GN if sample is too removed

In [171]:
area_training_combined_binned['combined'] = area_training_combined_binned.apply(lambda row: tuple(row), axis=1)

In [173]:
area_combined_weight = area_training_combined_binned['combined'].value_counts(normalize=True)

In [175]:
area_training_combined_binned['combined_weight'] = area_training_combined_binned['combined'].apply(lambda x: area_combined_weight[x])

In [395]:
area_training_combined_binned_sample = area_training_combined_binned.sample(frac=0.1, weights=area_training_combined_binned['combined_weight']).drop(['combined', 'combined_weight'], axis=1).reset_index()

  .apply(lambda x: x.sample(frac=0.1, weights=x['combined_weight']))


In [398]:
area_training_combined_binned_sample

Unnamed: 0,day,hour,year,month,day_of_week,area_unemployment,area_per_capita_income,area_no_hs_dip,area_gov_depend,area_crowded_housing,...,district_crimes_24_hours_prev,area_crimes_1_hours_prev,area_crimes_3_hours_prev,area_crimes_6_hours_prev,area_crimes_12_hours_prev,area_crimes_24_hours_prev,area_id_target_encoded,area_id_freq_encoded,area_crimes_this_hour,crime_status
0,15,4,2019,9,6,11,4,7,18,3,...,0,0,0,0,0,0,0.000312,0.039474,0,False
1,12,20,2016,2,4,14,0,10,19,12,...,6,0,0,0,3,5,0.000884,0.032895,0,False
2,11,15,2019,12,2,9,1,12,18,10,...,5,0,0,0,2,5,0.001855,0.046053,0,False
3,18,7,2019,9,2,8,1,10,19,10,...,0,0,0,0,0,0,0.000655,0.039474,0,False
4,1,5,2018,11,3,5,1,12,17,9,...,0,0,0,0,0,0,0.000832,0.023026,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065941,16,23,2016,5,0,13,2,7,19,5,...,5,0,0,0,2,4,0.001400,0.046053,0,True
1065942,17,9,2016,7,6,11,2,9,16,7,...,3,0,0,1,3,5,0.004215,0.059211,0,True
1065943,1,12,2016,6,2,21,0,12,18,9,...,4,0,0,1,2,4,0.001208,0.059211,0,True
1065944,28,12,2017,5,6,11,2,9,16,7,...,4,0,0,1,2,5,0.004215,0.059211,0,True


In [396]:
area_training_sample = area_training_combined.loc[area_training_combined_binned_sample.index]

In [399]:
area_training_combined_binned_sample['crime_status'].value_counts(normalize=True)

crime_status
False    0.85512
True     0.14488
Name: proportion, dtype: float64

In [201]:
district_bins = sturges_formula(len(district_training_combined))

In [202]:
district_training_combined_binned = bin_dataframe(district_training_combined, ['day', 'hour', 'year', 'month', 'day_of_week', 'crime_status', 'district_id_target_encoded', 'district_id_freq_encoded'], district_bins)

In [203]:
district_training_combined_binned['combined'] = district_training_combined_binned.apply(lambda row: tuple(row), axis=1)

In [204]:
district_combined_weight = district_training_combined_binned['combined'].value_counts(normalize=True)

In [205]:
district_training_combined_binned['combined_weight'] = district_training_combined_binned['combined'].apply(lambda x: district_combined_weight[x])

In [231]:
district_training_combined_binned_sample = district_training_combined_binned.sample(round(0.2 * len(district_training_combined_binned)), weights=district_training_combined_binned['combined_weight']).drop(['combined', 'combined_weight'], axis=1).reset_index()

In [232]:
district_training_sample = district_training_combined.loc[district_training_combined_binned_sample.index]

In [224]:
area_feature_training_sample = area_training_sample.drop('area_crimes_this_hour', axis=1)
area_target_training_sample = area_training_sample['area_crimes_this_hour'].to_numpy()

district_feature_training_sample = district_training_sample.drop('district_crimes_this_hour', axis=1)
district_target_training_sample = district_training_sample['district_crimes_this_hour'].to_numpy()

##### SMOTE

In [239]:
import ImbalancedLearningRegression as iblr

In [385]:
rel_ctrl_pts_rg = [
    [0, 0],    # Point 1: No relevance for target value 0
    [0.001, 1],  # Point 2: Full relevance for target values > 0
]

In [251]:
district_training_sample['crime_status'].value_counts(normalize=True)

crime_status
False    0.588478
True     0.411522
Name: proportion, dtype: float64

In [255]:
district_training_sample_ro = iblr.ro(data=district_training_sample, y='district_crimes_this_hour', rel_xtrm_type='high', rel_ctrl_pts_rg=rel_ctrl_pts_rg)

r_index: 100%|#################################################################| 47499/47499 [00:29<00:00, 1606.04it/s]


In [256]:
district_training_sample_ro['crime_status'].value_counts(normalize=True)

crime_status
True     0.545397
False    0.454603
Name: proportion, dtype: float64

In [379]:
temp2 = area_training_sample.sample(1000).reset_index(drop=True)

In [376]:
temp2['area_crimes_this_hour'].value_counts(normalize=True)

area_crimes_this_hour
0.000000    0.913
0.004630    0.060
0.009259    0.018
0.013889    0.006
0.018519    0.003
Name: proportion, dtype: float64

In [286]:
temp = district_training_sample_ro.sample(1000).reset_index(drop=True)

In [391]:
area_training_sample['crime_status'].value_counts(normalize=True)

crime_status
False    0.925401
True     0.074599
Name: proportion, dtype: float64

In [386]:
temp2_ro =iblr.ro(data=area_training_sample, y='area_crimes_this_hour', rel_xtrm_type='high', rel_ctrl_pts_rg=rel_ctrl_pts_rg)

ValueError: redefine phi relevance function: all points are 0

In [380]:
temp_gn = iblr.gn(data=temp2, y='area_crimes_this_hour', manual_perc=True, pert=0.04, perc_u=0.99, perc_o=1, rel_xtrm_type='high', rel_ctrl_pts_rg=rel_ctrl_pts_rg)

ValueError: redefine phi relevance function: all points are 0

In [308]:
temp_gn['crime_status'].value_counts(normalize=True)

crime_status
True     0.503384
False    0.496616
Name: proportion, dtype: float64

In [268]:
district_training_sample_gn = iblr.gn(data=district_training_sample_ro, y='district_crimes_this_hour', manual_perc=True, pert=0.04, perc_u=0.9, perc_o= 1.5, rel_xtrm_type='high', rel_ctrl_pts_rg=rel_ctrl_pts_rg)

synth_matrix:   0%|2                                                             | 123/36090 [00:33<2:45:32,  3.62it/s]


KeyboardInterrupt: 

In [None]:
district_training_sample_gn['crime_status'].value_counts(normalize=True)