In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from datetime import datetime

In [None]:
# Load the six two-month response extracts, in calendar order.
(
    jan_feb_responses,
    mar_apr_responses,
    may_jun_responses,
    jul_aug_responses,
    sep_oct_responses,
    nov_dec_responses,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/Responses/responsesP1(Jan-Feb).csv",
        "data/Responses/responsesP2(Mar-Apr).csv",
        "data/Responses/responsesP3(May-Jun).csv",
        "data/Responses/responsesP4(Jul-Aug).csv",
        "data/Responses/responsesP5(Sep-Oct).csv",
        "data/Responses/responsesP6(Nov-Dec).csv",
    )
]

Split responses into 111 calls vs. all other calls

In [None]:
def _split_on_111(responses):
    """Return (rows with chief complaint code 111, all remaining rows) of one period's frame."""
    is_111 = responses['chiefcomplaintcode'] == 111
    return responses[is_111], responses[~is_111]


jan_feb_responses_111, jan_feb_responses_not_111 = _split_on_111(jan_feb_responses)
mar_apr_responses_111, mar_apr_responses_not_111 = _split_on_111(mar_apr_responses)
may_jun_responses_111, may_jun_responses_not_111 = _split_on_111(may_jun_responses)
jul_aug_responses_111, jul_aug_responses_not_111 = _split_on_111(jul_aug_responses)
sep_oct_responses_111, sep_oct_responses_not_111 = _split_on_111(sep_oct_responses)
nov_dec_responses_111, nov_dec_responses_not_111 = _split_on_111(nov_dec_responses)

Put all responses for the year into one dataframe

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat is the
# supported equivalent and, like append, keeps each period's original row index.
all_responses_111 = pd.concat([
    jan_feb_responses_111,
    mar_apr_responses_111,
    may_jun_responses_111,
    jul_aug_responses_111,
    sep_oct_responses_111,
    nov_dec_responses_111,
])
all_responses_not_111 = pd.concat([
    jan_feb_responses_not_111,
    mar_apr_responses_not_111,
    may_jun_responses_not_111,
    jul_aug_responses_not_111,
    sep_oct_responses_not_111,
    nov_dec_responses_not_111,
])

In [None]:
# (rows, columns) of the combined 111 responses before the date restriction is applied.
all_responses_111.shape

In [None]:
# clean repeated dates by restricting the dates to calendar year 2018
# callstart has not been parsed at this point, so the bounds are compared as text
# (assumes ISO-formatted 'YYYY-MM-DD ...' values -- TODO confirm). The previous lower bound
# `> '2017-12-31'` was satisfied by any timestamp on 31 Dec 2017 (e.g. '2017-12-31 08:00'
# sorts after '2017-12-31'), so that day leaked in; `>= '2018-01-01'` excludes it.
all_responses_111 = all_responses_111[(all_responses_111['callstart'] >= '2018-01-01') & (all_responses_111['callstart'] < '2019-01-01')]
all_responses_not_111 = all_responses_not_111[(all_responses_not_111['callstart'] >= '2018-01-01') & (all_responses_not_111['callstart'] < '2019-01-01')]

In [None]:
# (rows, columns) of the 111 responses after the date restriction.
all_responses_111.shape

------

## Group 111 calls by LSOA and time of day during working days
Outputs one file per hour of the day with the count of 111 calls in each LSOA

In [None]:
# Parse the call start times and derive the weekday name from them.
# all_responses_111 is the result of a boolean filter, so writing columns into it with
# df[col] = ... triggers pandas' SettingWithCopyWarning; assign returns a new frame instead.
# Keyword arguments are evaluated in order, so day_of_week sees the parsed callstart.
all_responses_111 = all_responses_111.assign(
    callstart=lambda df: pd.to_datetime(df['callstart']),
    day_of_week=lambda df: df['callstart'].dt.day_name(),
)

In [None]:
# keep responses that happened on week days (drop Saturday and Sunday)
is_weekend = all_responses_111['day_of_week'].isin(['Sunday', 'Saturday'])
all_responses_111_weekdays = all_responses_111[~is_weekend]

In [None]:
# (rows, columns) of the weekday-only 111 responses.
all_responses_111_weekdays.shape

In [None]:
# Hour of day (0-23) in which each call started.
# The weekday frame is a filtered slice, so use assign instead of writing a column into it
# (avoids pandas' SettingWithCopyWarning).
all_responses_111_weekdays = all_responses_111_weekdays.assign(
    time_of_day=all_responses_111_weekdays['callstart'].dt.hour
)

In [None]:
# One CSV per hour of the day: number of weekday 111 responses (non-null incidentid) per LSOA.
for hour in range(24):
    calls_in_hour = all_responses_111_weekdays[all_responses_111_weekdays['time_of_day'] == hour]
    # Counting only the incidentid column gives the same lsoa/incidentid table
    # as counting every column and then selecting it.
    counts_per_lsoa = calls_in_hour.groupby('lsoa')['incidentid'].count().reset_index()
    counts_per_lsoa.to_csv("outputs/num_responses_111_per_lsoa_time_" + str(hour) + ".csv")

-----------

## Compare 111 calls to non-111 calls
Are there any noticeable patterns? Could we say that 111 calls are used more by locals? Is there a difference in 111 usage between females/males and different age groups?

In [None]:
# Size of the 111 group used in the comparisons that follow.
all_responses_111.shape

In [None]:
# Size of the non-111 group used in the comparisons that follow.
all_responses_not_111.shape

### Borough

In [None]:
# Responses per borough (non-null incidentid), one single-column frame per call type.
num_responses_111_per_borough = (
    all_responses_111.groupby('borough')['incidentid'].count().to_frame('num_111_responses')
)
num_responses_not_111_per_borough = (
    all_responses_not_111.groupby('borough')['incidentid'].count().to_frame('num_not_111_responses')
)

In [None]:
# Side-by-side table indexed by borough; a borough missing from one group gets NaN in that column.
borough_111_vs_not_111 = pd.concat([num_responses_111_per_borough,num_responses_not_111_per_borough], axis=1)

In [None]:
# Each borough's share (in %) of all 111 responses and of all non-111 responses,
# plus the absolute gap between the two shares.
share_of_111 = (
    borough_111_vs_not_111['num_111_responses'] / borough_111_vs_not_111['num_111_responses'].sum() * 100
)
share_of_not_111 = (
    borough_111_vs_not_111['num_not_111_responses'] / borough_111_vs_not_111['num_not_111_responses'].sum() * 100
)

borough_111_vs_not_111['per_111_responses'] = share_of_111
borough_111_vs_not_111['per_not_111_responses'] = share_of_not_111
borough_111_vs_not_111['per_abs_difference'] = (share_of_111 - share_of_not_111).abs()

In [None]:
# Persist the per-borough comparison; the borough index is written as the first CSV column.
borough_111_vs_not_111.to_csv("outputs/percentage_diff_111_vs_not_111_by_borough.csv")

In [None]:
# Display the full per-borough comparison table.
borough_111_vs_not_111

### Gender

In [None]:
# Responses per recorded sex (non-null incidentid), one single-column frame per call type.
num_responses_111_per_gender = (
    all_responses_111.groupby('sex')['incidentid'].count().to_frame('num_111_responses')
)
num_responses_not_111_per_gender = (
    all_responses_not_111.groupby('sex')['incidentid'].count().to_frame('num_not_111_responses')
)

In [None]:
# Contingency table: rows are the values of `sex`, columns are the 111 / non-111 response counts.
gender_111_vs_not_111 = pd.concat([num_responses_111_per_gender,num_responses_not_111_per_gender], axis=1)

In [None]:
# Observed counts per sex and call type.
gender_111_vs_not_111

Chi-squared test

In [None]:
# Chi-squared test of independence between sex and call type.
# `ex` holds the expected counts under independence; chi2, p and dof are stored but not displayed here.
chi2, p, dof, ex = chi2_contingency(gender_111_vs_not_111)

In [None]:
# Shape the expected counts like the observed table: same column names, same row index.
expected_df = pd.DataFrame(ex)
gender_111_vs_not_111_expected = (
    expected_df.iloc[:, [0, 1]]
    .set_axis(['num_111_responses', 'num_not_111_responses'], axis=1)
    .set_axis(gender_111_vs_not_111.index, axis=0)
)

In [None]:
# Show expected counts as whole calls. Round first: a bare astype(int) truncates toward
# zero, so e.g. an expected 10.9 would have been reported as 10 instead of 11.
gender_111_vs_not_111_expected = gender_111_vs_not_111_expected.round().astype(int)

In [None]:
# Expected counts under independence, for comparison with the observed table.
gender_111_vs_not_111_expected

### Age

In [None]:
def calculate_age_range(age):
    """Map a numeric age to its age-band label.

    Bands are '0-15', '16-29', '30-44', '45-64', '65-84' and '85+'. Upper bounds are
    inclusive, so a fractional age such as 15.5 falls into the next band ('16-29').
    Values at or below 15 (including negative ones) are labelled '0-15'.

    Parameters
    ----------
    age : int or float
        Age in years. Must not be missing.

    Returns
    -------
    str
        The label of the band containing `age`.

    Raises
    ------
    ValueError
        If `age` is missing (None/NaN). Every comparison against NaN is False, so a
        missing age used to fall through to the final branch and be labelled '85+'.
    """
    if pd.isna(age):
        raise ValueError("age must be a number, got a missing value")

    # Each branch is only reached when the previous upper bound was exceeded,
    # so the lower bounds do not need to be re-checked.
    if age <= 15:
        return '0-15'
    elif age <= 29:
        return '16-29'
    elif age <= 44:
        return '30-44'
    elif age <= 64:
        return '45-64'
    elif age <= 84:
        return '65-84'
    else:
        return '85+'

In [None]:
# remove all null ages (keep only rows where age is present)
has_age_111 = all_responses_111['age'].notna()
has_age_not_111 = all_responses_not_111['age'].notna()
all_responses_111_age = all_responses_111[has_age_111]
all_responses_not_111_age = all_responses_not_111[has_age_not_111]

In [None]:
# 111 responses that have an age recorded.
all_responses_111_age.shape

In [None]:
# Non-111 responses that have an age recorded.
all_responses_not_111_age.shape

In [None]:
# Label every response with its age band.
# Both frames are subsets of larger frames, so build new frames with assign rather than
# writing a column into them (avoids pandas' SettingWithCopyWarning). The function is
# passed to apply directly; wrapping it in a lambda added nothing.
all_responses_111_age = all_responses_111_age.assign(
    age_range=all_responses_111_age['age'].apply(calculate_age_range)
)
all_responses_not_111_age = all_responses_not_111_age.assign(
    age_range=all_responses_not_111_age['age'].apply(calculate_age_range)
)

In [None]:
# Responses per age band (non-null incidentid), one single-column frame per call type.
num_responses_111_per_age_range = (
    all_responses_111_age.groupby('age_range')['incidentid'].count().to_frame('num_111_responses')
)
num_responses_not_111_per_age_range = (
    all_responses_not_111_age.groupby('age_range')['incidentid'].count().to_frame('num_not_111_responses')
)

In [None]:
# Contingency table: rows are age bands, columns are the 111 / non-111 response counts.
age_111_vs_not_111 = pd.concat([num_responses_111_per_age_range,num_responses_not_111_per_age_range], axis=1)

In [None]:
# Observed counts per age band and call type.
age_111_vs_not_111

Chi-squared test

In [None]:
# Chi-squared test of independence between age band and call type.
# Reuses (and overwrites) the chi2, p, dof and ex names from the gender test.
chi2, p, dof, ex = chi2_contingency(age_111_vs_not_111)

In [None]:
# Shape the expected counts like the observed table: same column names, same row index.
expected_df = pd.DataFrame(ex)
age_expected_data = {'num_111_responses': expected_df.iloc[:,0], 'num_not_111_responses': expected_df.iloc[:,1]}
age_111_vs_not_111_expected = pd.DataFrame(data = age_expected_data)
age_111_vs_not_111_expected.index = age_111_vs_not_111.index
# Round before casting: a bare astype(int) truncates toward zero, so e.g. an expected
# 10.9 would have been reported as 10 instead of 11.
age_111_vs_not_111_expected = age_111_vs_not_111_expected.round().astype(int)

In [None]:
# Expected counts under independence, for comparison with the observed table.
age_111_vs_not_111_expected

-----

## Ward Outliers per Day
Were there any wards that had an unusually high number of incidents on a particular day?

In [None]:
# All responses of the year, regardless of chief complaint code.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat is the
# supported equivalent and, like append, keeps each period's original row index.
all_responses = pd.concat([
    jan_feb_responses,
    mar_apr_responses,
    may_jun_responses,
    jul_aug_responses,
    sep_oct_responses,
    nov_dec_responses,
])

In [None]:
# (rows, columns) of all responses before restricting to 2018.
all_responses.shape

In [None]:
# get only 2018 data
# callstart is compared as text here (assumes ISO-formatted 'YYYY-MM-DD ...' values --
# TODO confirm). The previous lower bound `> '2017-12-31'` was satisfied by any timestamp
# on 31 Dec 2017 (e.g. '2017-12-31 08:00' sorts after '2017-12-31'); those rows would then
# be counted under day-of-year 365 together with 31 Dec 2018.
all_responses = all_responses[(all_responses['callstart'] >= '2018-01-01') & (all_responses['callstart'] < '2019-01-01')]

In [None]:
# (rows, columns) of all responses after restricting to 2018.
all_responses.shape

In [None]:
# Day number within the year (1 = 1 January) of each call start.
# all_responses is a filtered slice, so use assign instead of writing a column into it
# (avoids pandas' SettingWithCopyWarning).
all_responses = all_responses.assign(
    day_of_year=pd.to_datetime(all_responses['callstart']).dt.dayofyear
)

In [None]:
# Number of incidents (non-null incidentid) per ward and day of year, as a flat table.
responses_by_ward_and_day = (
    all_responses
    .groupby(['wardID', 'day_of_year'])['incidentid']
    .count()
    .reset_index(name='num_incidents')
)

In [None]:
# Example day for a first look at the distribution: day-of-year 26, i.e. 26 January.
subset_days = responses_by_ward_and_day[responses_by_ward_and_day['day_of_year'] == 26]

In [None]:
# Box plot of the per-ward incident counts on the example day, drawn on an explicit Axes.
fig, ax = plt.subplots()
subset_days.boxplot(column='num_incidents', grid=False, ax=ax)
ax.set_title("Distribution of incidents in wards on January 26")

In [None]:
# Group the ward/day counts by day so each day's distribution across wards can be summarised.
group_by_day_of_year = responses_by_ward_and_day.groupby('day_of_year')

In [None]:
# Per-day summary statistics (count, mean, std, min, quartiles, max) of incidents per ward;
# the '25%' and '75%' columns are used for the outlier fences.
stats_incidents_per_day_all_wards = group_by_day_of_year['num_incidents'].describe().reset_index()

In [None]:
# Per-day interquartile range and upper Tukey fence (Q3 + 1.5 * IQR) of incidents per ward.
first_quartile = stats_incidents_per_day_all_wards['25%']
third_quartile = stats_incidents_per_day_all_wards['75%']

stats_incidents_per_day_all_wards['IQR'] = third_quartile - first_quartile
stats_incidents_per_day_all_wards['outlier_upper_limit'] = (
    third_quartile + 1.5 * stats_incidents_per_day_all_wards['IQR']
)

In [None]:
# Flag every (ward, day) whose incident count exceeds that day's upper Tukey fence.
#
# The fence of each day is looked up once and mapped onto the ward/day rows. The previous
# loop over range(1, 366) fetched the fence with .iloc[0], which raises IndexError for any
# day without recorded responses, and rescanned the whole table for every day.
upper_limit_by_day = stats_incidents_per_day_all_wards.set_index('day_of_year')['outlier_upper_limit']
upper_limit_per_row = responses_by_ward_and_day['day_of_year'].map(upper_limit_by_day)

# Stable sort by day keeps the existing ward order within each day, so the result lists
# come out in the same (day, then ward) order as the day-by-day loop produced.
outlier_rows = (
    responses_by_ward_and_day[responses_by_ward_and_day['num_incidents'] > upper_limit_per_row]
    .sort_values('day_of_year', kind='stable')
)
outlier_days = outlier_rows['day_of_year'].tolist()
outlier_wards = outlier_rows['wardID'].tolist()

# One [day, ward, True] record per outlier occurrence.
data_outliers_per_day = [[day, ward, True] for day, ward in zip(outlier_days, outlier_wards)]

# Number of days on which each ward was an outlier, keyed by ward in first-seen order.
outliers_per_ward = {}
for outlier_ward in outlier_wards:
    outliers_per_ward[outlier_ward] = outliers_per_ward.get(outlier_ward, 0) + 1

data_outliers_per_ward = [[ward, num_outliers] for ward, num_outliers in outliers_per_ward.items()]

In [None]:
# Tabulate the per-ward outlier counts.
# Note: this rebinds `outliers_per_ward` from the dict built earlier to a DataFrame.
outliers_per_ward = pd.DataFrame(data=data_outliers_per_ward, columns=['wardID','num_days_as_outlier'])
outliers_per_ward.head()

In [None]:
# One row per (day, ward) outlier occurrence; is_outlier is True on every row.
outliers_per_day = pd.DataFrame(data=data_outliers_per_day, columns=['day_of_year','wardID','is_outlier'])
outliers_per_day.shape

In [None]:
# Preview of the per-day outlier records.
outliers_per_day.head()

In [None]:
# The five wards that were flagged as outliers on the most days.
outliers_per_ward.sort_values(by=['num_days_as_outlier'], ascending=False).head()

### Create daily London plots showing the day's outlier wards

In [None]:
# Ward boundary polygons for London, read from the 2011 statistical GIS boundaries shapefile.
london_wards = gpd.read_file("data/statistical-gis-boundaries-london-2011/ESRI/London_Ward_CityMerged.shp")

In [None]:
# Render figures inline in the notebook output.
%matplotlib inline
# Quick visual check that the ward boundaries loaded; the axes frame is hidden.
london_wards_plot = london_wards.plot()
london_wards_plot.set_axis_off()

In [None]:
# header=None reads every row of the file as data and numbers the columns 0..n-1.
# NOTE(review): if the CSV has a header row, it ends up as the first data row -- confirm.
london_wards_profiles = pd.read_csv("data/ward-profiles-excel-version.csv", header=None)

In [None]:
# Inspect the raw ward profiles to locate the ward name and ward code columns.
london_wards_profiles.head()

In [None]:
# Keep only the first three columns (named ward_name, wardID and GSS_CODE in the next step).
london_wards_profiles = london_wards_profiles.iloc[:,[0,1,2]]

In [None]:
# Name the columns so they line up with the merge keys used below (wardID, GSS_CODE).
london_wards_profiles.columns = ['ward_name','wardID','GSS_CODE']

In [None]:
# Check the renamed ward lookup table.
london_wards_profiles.head()

In [None]:
# Attach wardID (the code used in the response data) to each ward polygon via GSS_CODE.
# Left join: polygons without a match in the profiles keep a NaN wardID.
london_wards_with_old_code = pd.merge(london_wards, london_wards_profiles[['wardID','GSS_CODE']], on=['GSS_CODE'], how='left')

In [None]:
# Add the number of outlier days to each ward polygon; wards never flagged get NaN here.
london_wards_with_incident_outliers = pd.merge(london_wards_with_old_code, outliers_per_ward, on=['wardID'], how='left')

In [None]:
# Wards that were never an outlier get a count of 0.
# NOTE(review): fillna(0) applies to every column, so a ward with no wardID match also
# ends up with wardID 0 -- confirm that is intended.
london_wards_with_incident_outliers = london_wards_with_incident_outliers.fillna(0)

In [None]:
# Export the ward polygons with their outlier counts as a shapefile.
# NOTE(review): the ESRI Shapefile format limits field names to 10 characters, so
# 'num_days_as_outlier' is presumably truncated on write -- confirm downstream readers cope.
london_wards_with_incident_outliers.to_file("outputs/london_wards_with_incident_outliers.shp")

In [None]:
# Preview the ward polygons with the attached wardID.
london_wards_with_old_code.head()

In [None]:
# The two columns needed to draw the daily maps: polygon geometry and ward code.
london_wards_with_old_code[['geometry','wardID']].head()

In [None]:
# Full grid of (day of year, ward) pairs: every ward appears once for each of the 365 days.
data_days_wards = [
    [day_number, ward_code]
    for day_number in range(1, 366)
    for ward_code in london_wards_with_old_code['wardID']
]

days_vs_wards = pd.DataFrame(data=data_days_wards, columns=['day_of_year','wardID'])



In [None]:
# Preview the day/ward grid.
days_vs_wards.head()

In [None]:
# Mark the grid cells that were outliers; (day, ward) pairs without a match get NaN in is_outlier.
merge_days_vs_wards_outliers = pd.merge(days_vs_wards, outliers_per_day, on=['day_of_year','wardID'],how='left')

In [None]:
# Pairs that did not match an outlier record are not outliers: replace their NaN with False.
# fillna applies to every column, so a missing wardID in the grid is also replaced by False.
merge_days_vs_wards_outliers = merge_days_vs_wards_outliers.fillna(False)
merge_days_vs_wards_outliers.head()

In [None]:
# Calendar date of each day-of-year in 2018 (e.g. day 26 -> 2018-01-26).
# The column must be assigned on the frame itself: assigning to `.head()['date']` wrote
# into a temporary five-row copy, so the frame never received a 'date' column.
# Values are built as 'YYYYDDD' strings (2018001 .. 2018365) and parsed with %Y%j.
merge_days_vs_wards_outliers['date'] = pd.to_datetime(
    (2018 * 1000 + merge_days_vs_wards_outliers['day_of_year']).astype(str),
    format='%Y%j',
).dt.date
merge_days_vs_wards_outliers.head()

In [None]:
# Create a plot for each day of the year showing the outliers for that day
# The geometry lookup is the same for every day, so slice it once outside the loop.
ward_geometries = london_wards_with_old_code[['geometry','wardID']]

for day in range(1,366):
    # Outlier flags of every ward on this day, joined to the ward polygons.
    day_pattern = merge_days_vs_wards_outliers[merge_days_vs_wards_outliers['day_of_year'] == day]
    merge_day_pattern_and_geometry = pd.merge(day_pattern, ward_geometries, on='wardID', how='left')
    # pd.merge with a plain DataFrame on the left returns a plain DataFrame; wrap it so it can be mapped.
    merge_day_pattern_and_geometry = gpd.GeoDataFrame(merge_day_pattern_and_geometry)

    # Build the title from the day number itself (e.g. day 26 -> "26 January 2018") rather
    # than reading it back from a 'date' column, which is not reliably present on the
    # merged frame and made this loop fail with a KeyError.
    image_title = datetime.strptime("2018" + str(day).zfill(3), "%Y%j").strftime("%d %B %Y")
    image_file_path = "outputs/outliers_per_day/day_" + str(day) + ".png"

    fig, ax = plt.subplots(1)
    merge_day_pattern_and_geometry.plot(column='is_outlier', ax=ax)
    ax.axis('off')
    fig.suptitle(image_title)
    fig.savefig(image_file_path)
    # Close each figure after saving so 365 figures do not accumulate in memory.
    plt.close(fig)

    