In [None]:
import os
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

%pylab inline

In [None]:
# TfL API credentials and endpoint.
# The key is read from the TFL_API_KEY environment variable so the secret is
# never stored in the notebook; an empty string means "no key supplied".
tfl_key = os.environ.get("TFL_API_KEY", "")
tfl_url  = "https://api.tfl.gov.uk/" # base URL
tflpayload = {'app_key': tfl_key} # Dictionary to hold request parameters (TfL expects 'app_key')

In [None]:
# Example of a single Journey Planner request and where the values of interest sit in the JSON.
# startloc is string of latlong coordinates of postcode sector centroid
# endloc is string of latlong coordinates of hospital
# NOTE(review): startloc and endloc are not assigned in any cell above this one,
# so on a fresh kernel this cell raises NameError unless they are defined first.

# NOTE(review): tfl_url already ends with "/" and requrl starts with "/", so the
# final URL contains "//"; tflpayload is also not passed to requests.get here.
requrl = "/Journey/JourneyResults/" + startloc + "/to/" + endloc
getreq = requests.get(tfl_url + requrl) # Do the request
jp = getreq.json()

# json path to travel duration:
# (not displayed: only the last expression of a cell is shown)
jp['journeys'][0]['duration']

# json path to travel fare:
jp['journeys'][0]['fare']['totalCost']

In [None]:
def travelDuration(tfl_key, startloc, endloc, df, iRange):
    """Write the TfL journey duration for one start/end pair into ``df``.

    Requests the Journey Planner result for ``startloc`` -> ``endloc`` and
    stores the duration of the first returned journey in
    ``df['travelDuration']`` for every row label in ``iRange``.

    Parameters
    ----------
    tfl_key : str
        TfL API key; sent as the ``app_key`` query parameter when non-empty.
    startloc, endloc : str
        Location strings placed in the request path (e.g. "lat,lon").
    df : pandas.DataFrame
        Frame with a ``travelDuration`` column; modified in place.
    iRange : iterable
        Row labels of ``df`` to fill. Labels not present in ``df.index`` are
        skipped.

    Returns
    -------
    None
        ``df`` is left untouched when there is nothing to fill or when the
        response does not contain ``journeys[0].duration``.
    """
    rows = [i for i in iRange if i in df.index]
    if not rows:
        return  # nothing to fill, so do not spend an API call

    tfl_url = "https://api.tfl.gov.uk/"
    # No leading slash here: tfl_url already ends with one.
    requrl = "Journey/JourneyResults/" + startloc + "/to/" + endloc
    tflpayload = {'app_key': tfl_key} if tfl_key else None

    # The URL does not depend on the row, so a single request serves all rows.
    getreq = requests.get(tfl_url + requrl, params=tflpayload)
    jp = getreq.json()
    try:
        duration = jp['journeys'][0]['duration']
    except (KeyError, IndexError, TypeError):
        # Response carried no usable journey (e.g. an error payload).
        return

    # .loc writes into df itself; df['travelDuration'][i] = ... is chained
    # assignment and may only modify a temporary copy.
    df.loc[rows, 'travelDuration'] = duration

In [None]:
staffInPost = pd.read_csv("CUSP London Data Dive 2019/data/Workforce Data/LAS_Staff_in_Post_070319.csv")

In [None]:
staffInPost.columns

## Summary Statistics

In [None]:
# Head-count by gender, followed by the overall number of staff records.
genderCol = staffInPost['Gender']
nMale = int((genderCol == 'Male').sum())
nFemale = int((genderCol == 'Female').sum())
print("Number of Males: %i" % nMale)
print("Number of Females: %i" % nFemale)
print("Total Number of Employees: %i" % staffInPost.shape[0])

In [None]:
staffInPost["LengthOfService(YearsFloat)"] = staffInPost["LengthOfService(Years)"] + staffInPost["LengthOfService(Months)"]/12

In [None]:
staffInPost.columns

In [None]:
# Share of all staff records falling in each age band, printed as a real
# percentage (the previous output was a 0-1 fraction labelled "Percentage").
ageBandShare = staffInPost["AgeBand"].value_counts().sort_index() / len(staffInPost)
for ageBand, share in ageBandShare.items():
    print("Percentage of employees " + str(ageBand) + " : %.2f%%" % (100 * share))

In [None]:
# Distribution of contracted weekly hours across all staff in post.
hoursFig, hoursAx = plt.subplots(figsize=(12, 8))
hoursAx.hist(staffInPost['ContractHours'], bins=30)
hoursAx.set_xlabel("Contract Hours / Week")
hoursAx.set_ylabel("Counts")
hoursAx.set_title("Histogram")

### Investigating start months as a factor for leaving:

In [None]:
# Parse the start dates to datetime64. infer_datetime_format is deprecated
# (and has no effect) in pandas 2.x, so it is no longer passed.
# NOTE(review): if the CSV stores dates day-first (dd/mm/yyyy), pass dayfirst=True
# or an explicit format= here — confirm against the raw file.
staffInPost['LatestStartDate'] = pd.to_datetime(staffInPost['LatestStartDate'])

In [None]:
staffInPost['StartMonth'] = staffInPost['LatestStartDate'].dt.month

In [None]:
# Start-month histograms for all staff and for three tenure cohorts, drawn on a
# 2x2 grid from one loop instead of four copy-pasted panels.
moreThan5 = staffInPost[staffInPost['LengthOfService(YearsFloat)'] > 5]
moreThan10 = staffInPost[staffInPost['LengthOfService(YearsFloat)'] > 10]
moreThan15 = staffInPost[staffInPost['LengthOfService(YearsFloat)'] > 15]

cohorts = [
    ("All", staffInPost),
    ("More than 5 Years", moreThan5),
    ("More than 10 Years", moreThan10),
    ("More than 15 Years", moreThan15),
]

# Months are the integers 1-12: use one bin per month, centred on the integer.
# bins=30 spread twelve values over thirty bins and produced empty gaps.
monthBins = np.arange(0.5, 13.5, 1)

fig, panels = plt.subplots(2, 2, figsize=(15, 15))
for panel, (cohortTitle, cohort) in zip(panels.ravel(), cohorts):
    # Drop unparsed dates so missing months cannot break the binning.
    panel.hist(cohort['StartMonth'].dropna(), bins=monthBins)
    panel.set_xticks(range(1, 13))
    panel.set_xlabel("Starting Months")
    panel.set_ylabel("Counts")
    panel.set_title(cohortTitle)

In [None]:
# Number of staff records per employee category, in order of first appearance.
categoryCol = staffInPost['EmployeeCategory']
for label in categoryCol.unique():
    nInCategory = int((categoryCol == label).sum())
    print(label + ": %i" % nInCategory)

In [None]:
# Length-of-service distributions: full-time staff (top) and part-time staff (bottom).
fullTime = staffInPost[staffInPost['EmployeeCategory'] == 'Full Time']
partTime = staffInPost[staffInPost['EmployeeCategory'] == 'Part Time']

fig = plt.figure(figsize=(12,13))
panelSpecs = (
    (211, fullTime, 45, "Full Time Staff"),
    (212, partTime, 40, "Part Time Staff"),
)
for position, staffSubset, nBins, heading in panelSpecs:
    # add_subplot makes the new axes current, so plt.xticks/yticks act on it.
    ax = fig.add_subplot(position)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    ax.hist(staffSubset['LengthOfService(YearsFloat)'], bins=nBins, color="#3F5D7D")
    ax.set_xlabel("Length of Service (Years)", fontsize=14)
    ax.set_ylabel("Number of Staff", fontsize=14)
    ax.set_title(heading, fontsize=16)
    # Hide the top and right frame lines.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

plt.subplots_adjust(hspace=0.3)

In [None]:
serviceLocations = pd.read_csv("CUSP London Data Dive 2019/data/Workforce Data/workplace location table.csv")
serviceLocations.columns

In [None]:
staffWorkLocations = pd.merge(left=staffInPost, right=serviceLocations, left_on='Location', right_on='esrLocationFull')
staffWorkLocations.columns

In [None]:
leavers = pd.read_csv("CUSP London Data Dive 2019/data/Workforce Data/LAS_Leavers_070319.csv")
leavers.columns

In [None]:
# Load the CSV Liam created with the coordinates of the postcode sector centroids,
# rename its 'name' column to the join key and discard the saved index column.

pcSectorCentroids = (
    pd.read_csv('co-ords4.csv')
    .rename(columns={'name': 'PostcodeSector'})
    .drop(columns='Unnamed: 0')
)
pcSectorCentroids.head()

In [None]:
# Join the staff/work-location frame to the postcode-sector centroids on 'PostcodeSector'.
# how='outer' keeps unmatched rows from both sides; the cells below inspect the
# sectors whose 'centroids_str' is null after this join.
# NOTE(review): 'outer' also keeps centroid rows that match no staff record —
# confirm that is intended rather than how='left'.
staffLocations = staffWorkLocations.merge(pcSectorCentroids, how='outer', on='PostcodeSector')

In [None]:
len(staffLocations[staffLocations['centroids_str'].isnull()]['PostcodeSector'].unique())

In [None]:
staffLocations[staffLocations['centroids_str'].isnull()]['PostcodeSector'].unique()

In [None]:
# drop FALSE, W17 2, N39T 6, BT postcodes and manually insert other coordinates:

manualCentroids = {
    'SW11 7': '51.49177,-0.1370286',
    'SW11 8': '51.5047865,-0.1541728',
    'KT14 3': '51.3418376,-0.4998108',
    'CM6 4': '51.8593837,0.1529848',
    'WD18 O': '51.6426322,-0.4480033',
    'W6 2': '51.4976485,-0.2581849',
    'NW9 4': '51.5835152,-0.2870196',
    'TN24 2': '51.148175,0.8501124',
    'DA11 O': '51.4504867,0.3502812',
}
for sector, latlon in manualCentroids.items():
    # .loc assigns into staffLocations itself. The previous chained form
    # staffLocations[mask]['centroids_str'] = ... wrote to a temporary copy,
    # so none of these coordinates were actually stored.
    staffLocations.loc[staffLocations['PostcodeSector'] == sector, 'centroids_str'] = latlon

droppedSectors = ['FALS E', 'W17 2', 'N39T 6', 'BT31 9', 'BT20 4']
# isin() is False for missing sectors, so rows with a null PostcodeSector are kept,
# exactly as with the chained != filters.
staffLocations = staffLocations[~staffLocations['PostcodeSector'].isin(droppedSectors)]

In [None]:
# Load the CSV Liam created with the coordinates of the service locations,
# rename the coordinate / postcode columns and discard the separate lat and lon columns.

labels = ['Latitude', 'Longitude']
stationCoords = (
    pd.read_csv('station_pc_latlong.csv')
    .rename(columns={'Lat,long': 'WorkLatLon', 'Postcode': 'postcode'})
    .drop(columns=labels)
)
stationCoords.head()

In [None]:
# Cast the 'postcode' join key to str in both frames ahead of the merge on 'postcode'.
# NOTE(review): astype(str) turns missing values into the literal string 'nan';
# if either frame has null postcodes they would then match each other in the merge — verify.
stationCoords['postcode'] = stationCoords['postcode'].astype(str)
staffLocations['postcode'] = staffLocations['postcode'].astype(str)

In [None]:
stationCoords.drop_duplicates('postcode', keep='first', inplace=True)

In [None]:
staffFinal = staffLocations.merge(right=stationCoords, how='inner', on='postcode')
len(staffFinal)

In [None]:
staffFinal.rename(columns={'location': 'WorkLocation', 'locationaddress': 'WorkAddress', 'postcode': 'WorkPostcode',
                              'esrLocationFull': 'WorkLocationFull', 'centroids_str': 'ResLatLon'}, inplace=True)

In [None]:
# Initialise with NaN rather than '' so any row that never receives a duration
# is still convertible by the later astype(float).
staffFinal['travelDuration'] = np.nan

In [None]:
# extracting duration from api
# One Journey Planner lookup per distinct (residence, workplace) coordinate pair;
# the duration of the first returned journey is stored, NaN when none is available.
tflJourneyUrl = "https://api.tfl.gov.uk/Journey/JourneyResults/"
tflpayload = {'app_key': tfl_key} if tfl_key else {}

journeyCache = {}  # (startloc, endloc) -> duration, successful lookups only
for i in staffFinal.index:  # every row, instead of a hard-coded range(0, 5712)
    startloc = staffFinal.at[i, 'ResLatLon']
    endloc = staffFinal.at[i, 'WorkLatLon']

    if pd.isnull(startloc) or pd.isnull(endloc):
        # No coordinates to route between: skip the API call.
        staffFinal.at[i, 'travelDuration'] = np.nan
        continue

    journey = (str(startloc), str(endloc))
    if journey not in journeyCache:
        requrl = tflJourneyUrl + journey[0] + "/to/" + journey[1]
        try:
            getreq = requests.get(requrl, params=tflpayload, timeout=30)
            jp = getreq.json()
            journeyCache[journey] = jp['journeys'][0]['duration']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # Network failure, non-JSON body, or a response without journeys.
            # Not cached, so a later row with the same pair will retry.
            staffFinal.at[i, 'travelDuration'] = np.nan
            continue

    # .at writes into staffFinal itself (staffFinal[col][i] = ... is chained assignment).
    staffFinal.at[i, 'travelDuration'] = journeyCache[journey]

In [None]:
staffFinal.tail()

In [None]:
staffFinal.to_csv('staffFinal_v3.csv', index=False)

In [None]:
# Split the "lat,lon" residence string once and keep the two parts as separate columns.
resParts = staffFinal['ResLatLon'].str.split(pat=',', expand=True)
staffFinal['ResLat'] = resParts[0]
staffFinal['ResLon'] = resParts[1]
staffFinal.head()

In [None]:
# Split the "lat,lon" workplace string once and keep the two parts as separate columns.
workParts = staffFinal['WorkLatLon'].str.split(pat=',', expand=True)
staffFinal['WorkLat'] = workParts[0]
staffFinal['WorkLon'] = workParts[1]
staffFinal.head()

In [None]:
staffFinal.to_csv('staffFinal_v4.csv', index=False)

In [None]:
staffFinal.travelDuration = staffFinal.travelDuration.astype(float)

In [None]:
staffFinal.travelDuration.mean()

In [None]:
len(staffFinal)

In [None]:
# dropping all travel durations of more than 2 hours (120 minutes) and NaNs
# (this cell removes the NaN rows and reports how many rows remain)
labels = ['travelDuration']
staffFinal = staffFinal.dropna(subset=labels)
staffFinal.shape[0]

In [None]:
staffFinal = staffFinal[staffFinal['travelDuration'] <= 120]
len(staffFinal)

In [None]:
# Mean travel duration per workplace coordinate.
meanByLocation = staffFinal.groupby('WorkLatLon')['travelDuration'].mean()
locationMeanTravel = meanByLocation.rename('meanDuration').reset_index()
locationMeanTravel.head()

In [None]:
# Standard deviation of travel duration per workplace coordinate, joined onto the means.
stdByLocation = staffFinal.groupby('WorkLatLon')['travelDuration'].std()
locationStdTravel = stdByLocation.rename('stdDuration').reset_index()
locationTravel = pd.merge(locationMeanTravel, locationStdTravel, on='WorkLatLon')
locationTravel.head()

In [None]:
# Number of non-null travel durations per workplace coordinate, joined onto the summary.
countByLocation = staffFinal.groupby('WorkLatLon')['travelDuration'].count()
locationCount = countByLocation.rename('counts').reset_index()
locationTravel = pd.merge(locationTravel, locationCount, on='WorkLatLon')
locationTravel.head()

In [None]:
# Split the "lat,lon" key into two columns, then show the ten locations with the
# highest mean travel duration.
locationParts = locationTravel['WorkLatLon'].str.split(pat=',', expand=True)
locationTravel['WorkLat'] = locationParts[0]
locationTravel['WorkLon'] = locationParts[1]
locationTravel.sort_values(by='meanDuration', ascending=False).head(10)

In [None]:
# Distinct work-location names seen at each workplace coordinate, joined onto the summary.
namesByLocation = staffFinal.groupby('WorkLatLon')['WorkLocation'].unique()
locationNames = namesByLocation.rename('locationNames').reset_index()
locationTravel = pd.merge(locationTravel, locationNames, on='WorkLatLon')
locationTravel.head(10)

In [None]:
locationTravel.locationNames = locationTravel.locationNames.astype(str)

In [None]:
locationTravel.to_csv('locationTravelDuration_v2.csv', index=False)

In [None]:
locationTravel.head(10)

In [None]:
staffFinal.columns

In [None]:
staffFinal.columns

In [None]:
staffFinal['WorkLocation'].unique()

In [None]:
import matplotlib.pyplot as plt

# Scatter of travel duration (x) against length of service (y) for the rows in staffFinal.
ax = plt.figure(figsize=(12, 9)).add_subplot(111)
xValues = staffFinal['travelDuration']
yValues = staffFinal['LengthOfService(YearsFloat)']
ax.scatter(xValues, yValues)
ax.set_title('Staff In Post', fontsize=14)
ax.set_xlabel('travel duration (mins)', fontsize=13)
ax.set_ylabel('length of service (years)', fontsize=13)
plt.show()

In [None]:
staffFinal.AgeBand.unique()

In [None]:
# Check that the row labelled 0 holds the '<=20 Years' band (prints True or False).
print(bool(staffFinal['AgeBand'][0] == '<=20 Years'))

In [None]:
staffFinal['AgeBandIndex'] = ''

In [None]:
def ageBandIndex(elem):
    """Return the 1-based ordinal position of an age-band label.

    The bands run from "<=20 Years" (1) to ">=71 Years" (12). Any value that
    equals none of the known labels yields ``np.nan``.
    """
    orderedBands = (
        "<=20 Years",
        "21-25",
        "26-30",
        "31-35",
        "36-40",
        "41-45",
        "46-50",
        "51-55",
        "56-60",
        "61-65",
        "66-70",
        ">=71 Years",
    )
    # Compare with == in band order, exactly as an if/elif chain would.
    for position, label in enumerate(orderedBands, start=1):
        if elem == label:
            return position
    return np.nan

In [None]:
staffFinal['AgeBandIndex'] = staffFinal['AgeBand'].apply(ageBandIndex)

In [None]:
# Correlate numeric/boolean columns only: in pandas 2.x DataFrame.corr() no
# longer drops text columns silently and raises on them instead.
staffFinal.select_dtypes(include=['number', 'bool']).corr()

In [None]:
staffFinal.columns

In [None]:
staffInPost.PositionTitle.unique()

In [None]:
# Count position titles mentioning any of the three keywords. A regex alternation
# is required: the Python expression "emergency" or "paramedic" or "ambulance"
# evaluates to just "emergency". na=False treats missing titles as non-matches.
len(staffInPost[staffInPost['PositionTitle'].str.lower().str.contains("emergency|paramedic|ambulance", na=False)])

In [None]:
ambulanceStation = staffInPost[staffInPost['Department'].str.lower().str.contains("ambulance station")]

In [None]:
len(ambulanceStation)

In [None]:
ambulanceStation['PositionTitle'].unique()

In [None]:
len(ambulanceStation[ambulanceStation['PositionTitle'].str.lower().str.contains("paramedic")])

In [None]:
len(ambulanceStation[ambulanceStation['PositionTitle'].str.lower().str.contains("technician")])

In [None]:
len(ambulanceStation[ambulanceStation['PositionTitle'].str.lower().str.contains("ambulance crew")])

In [None]:
len(staffInPost)

In [None]:
len(staffInPost[staffInPost['PositionTitle'].str.lower().str.contains("dispatcher")])

## Profile of the LAS Staff In Post
- Total of 5721 staff
- 2906 are based at an Ambulance Station (a bit more than half)

### Within the staff working at an ambulance station:
- 1691 are paramedics
- 904 are ambulance crew members
- 304 are technicians
- the rest are managers/leaders

In [None]:
leavers.Nationality.unique()

In [None]:
len(leavers)

In [None]:
len(leavers[leavers['Nationality'].isnull()])

In [None]:
# Leavers with a recorded nationality. Taken as an explicit copy because later
# cells add columns to it, which would otherwise trigger SettingWithCopyWarning.
leaversNoNA = leavers.dropna(axis=0, subset=["Nationality"]).copy()

In [None]:
len(leaversNoNA)

In [None]:
len(leaversNoNA[leaversNoNA['Nationality'].str.lower().str.contains("british")])

In [None]:
len(leaversNoNA) - len(leaversNoNA[leaversNoNA['Nationality'].str.lower().str.contains("british")])

In [None]:
398/1871

In [None]:
def nonBritish(elem):
    """Flag a nationality value: 0 when it equals 'British', 1 for anything else."""
    return 0 if elem == 'British' else 1

In [None]:
leaversNoNA['nonBritish'] = leaversNoNA['Nationality'].map(nonBritish)

In [None]:
def relocation(elem):
    """Flag whether a leaving reason mentions relocation.

    Returns 1 when the substring "relocation" occurs in ``elem`` and 0
    otherwise. Values that do not support ``in`` (NaN or None for a missing
    leaving reason) are treated as "not relocation" and return 0 instead of
    raising ``TypeError``.
    """
    try:
        return 1 if "relocation" in elem else 0
    except TypeError:
        # Missing reason: float NaN / None are not containers.
        return 0

In [None]:
leaversNoNA['relocation'] = leaversNoNA['Leaving Reason'].str.lower().map(relocation)

In [None]:
# Correlate numeric/boolean columns only: in pandas 2.x DataFrame.corr() no
# longer drops text columns silently and raises on them instead.
leaversNoNA.select_dtypes(include=['number', 'bool']).corr()

In [None]:
len(leaversNoNA[(leaversNoNA.relocation==1)&(leaversNoNA.nonBritish==0)])

In [None]:
len(leaversNoNA[leaversNoNA.nonBritish==0])

In [None]:
1871/2988

In [None]:
from beautifultable import BeautifulTable

In [None]:
# Print a 2x2 table (plus totals) of leaving reason — relocated vs other —
# against British / non-British nationality.
# NOTE(review): every count below is a hand-typed literal, presumably copied
# from earlier cell outputs; the table will not update if the data changes.
table = BeautifulTable()
table.column_headers = ["", "British", "Non-British", "Total"]
table.append_row(["Relocated", 311, 249, 560])
table.append_row(["Other", 1162, 149, 1311])
table.append_row(["Total", 1473, 398, 1871])
print(table)

In [None]:
staffFinal.columns

In [None]:
tdmean = staffFinal.travelDuration.mean()

In [None]:
tdmean

In [None]:
# Histogram of travel duration with mean, median and standard deviation in a text box.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(staffFinal.travelDuration, bins=20, color="#3F5D7D")

tdmean = staffFinal.travelDuration.mean()
tdmedian = staffFinal.travelDuration.median()
tdsigma = staffFinal.travelDuration.std()
textstr = '\n'.join((
    r'$\mu=%.2f$' % (tdmean, ),
    r'$\mathrm{median}=%.2f$' % (tdmedian, ),
    r'$\sigma=%.2f$' % (tdsigma, )))
props = dict(boxstyle='round', facecolor='lightblue', alpha=0.3)

# Position the box in THIS figure's axes coordinates (top-right corner).
# The earlier call used the `ax` left over from the scatter-plot cell, so the
# text was positioned relative to a different figure and needed y=1.7 to show up.
ax.text(0.97, 0.95, textstr, transform=ax.transAxes, fontsize=14,
        verticalalignment='top', horizontalalignment='right', bbox=props)
ax.set_title('Distribution of Travel Duration \n', fontsize=16)
ax.set_xlabel('Time in Minutes', fontsize=14)
ax.set_ylabel('Number of Staff', fontsize=14)
plt.show()