In [None]:
# Simulation imports
from reporting import simulation
from reporting.simulation import NORTH, EAST, SOUTH, WEST
from reporting.plot_outbreak import plot_outbreak

In [None]:
# Library imports
import geopandas
import matplotlib
import shapely
from matplotlib import pyplot




# Day 1
## All the field reports have come in and it's time to analyse them

In [None]:
# Run the day-1 report gathering step from the `reporting.simulation` module.
# NOTE(review): presumably this fills the `reporting_report` table that is
# queried further down — confirm in reporting/simulation.py.
simulation.gather_reports_for_day_1()

### Let's grab the data directly from the database

In [None]:
from psycopg2 import connect

# Load every row of the `reporting_report` table from the local `johnsnow`
# database into a GeoDataFrame, using the `location` column as the geometry.
#
# The connection is only needed for this one query, so it is closed in a
# `finally` block: without that, each re-run of this cell would leave another
# idle database session open. (A psycopg2 connection used as a context manager
# only ends the transaction; it does not close the connection, hence the
# explicit close().)
connection = connect(dbname='johnsnow')
try:
    cases = geopandas.read_postgis(
        "select * from reporting_report", connection,
        geom_col='location',
        crs={'init': 'epsg:4326'}
    )
finally:
    connection.close()


In [None]:
# Show five randomly picked reports, limited to the columns worth reading
case_preview_columns = ['doctor_name', 'patient_name', 'diagnosis', 'location']
cases[case_preview_columns].sample(n=5)

### Now it's time to visualise them

In [None]:
# Default figure size (width, height in inches) for every plot that follows
matplotlib.rcParams.update({'figure.figsize': [16.0, 12.0]})

In [None]:
# First look: every case as a large, mostly transparent red dot
plot = cases.plot(
    color="red",
    marker="o",
    markersize=64,
    alpha=0.2,
)

### Let's distinguish different diagnoses

In [None]:
# Same scatter, but coloured by the value of the `diagnosis` column
plot = cases.plot(
    column='diagnosis',
    marker="o",
    markersize=64,
    alpha=0.5,
)

### This data is useless without context. Let's map it

In [None]:
# Import London's rivers from the shapefile directory shipped alongside this
# notebook (the path is relative, so it depends on the working directory).
rivers = geopandas.read_file('../open-street-map-data/london-rivers_shp/')

In [None]:
# Load the full London road layer
roads = geopandas.read_file('../open-street-map-data/london-roads_shp/')

# Keep only the major road classes so the base map is not swamped by minor streets
major_highway_classes = ['trunk', 'primary', 'secondary', 'tertiary']
is_major_road = roads['highway'].isin(major_highway_classes)
mains = roads[is_major_road]

In [None]:
# Layer rivers, main roads and cases onto one shared set of axes.
# Every layer is given the same `axis` explicitly; the returned axes are kept
# under the same names as before.
figure, axis = pyplot.subplots()

rivers_plot = rivers.plot(color='blue', alpha=0.3, ax=axis)
map_plot = mains.plot(color="black", alpha=0.2, ax=axis)
cases_plot = cases.plot(
    column="diagnosis",
    marker="o",
    markersize=64,
    alpha=0.5,
    ax=axis,
)

pyplot.show()

### What's going on? I have a hunch...

In [None]:
# Load the OpenStreetMap layer of medical facilities
medical = geopandas.read_file('../open-street-map-data/london-medical_shp/')

# Boolean mask picking out the hospitals; .copy() gives `hospitals` its own
# data so later changes to it do not touch `medical`
is_hospital = medical['amenity'] == 'hospital'
hospitals = medical[is_hospital].copy()

In [None]:
# Same base map as before (rivers, main roads, cases), all drawn on one shared axis
figure, axis = pyplot.subplots()

rivers_plot = rivers.plot(color='blue', alpha=0.3, ax=axis)
map_plot = mains.plot(color="black", alpha=0.2, ax=axis)
cases_plot = cases.plot(
    column="diagnosis",
    marker="o",
    markersize=64,
    alpha=0.5,
    ax=axis,
)

# Mark each hospital with a large, translucent green cross ("P" is matplotlib's filled plus)
final = hospitals.plot(
    color="green",
    marker="P",
    markersize=1000,
    alpha=0.4,
    ax=axis,
)

pyplot.show()

### We need to improve our analysis

In [None]:
# Radius of the zone drawn around each hospital, in the units of the layer's CRS.
# NOTE(review): if the hospitals layer is in lon/lat degrees, 0.001 is roughly
# 100 m north-south but less east-west at London's latitude — confirm the CRS.
HOSPITAL_ZONE_RADIUS = 0.001

# Rebuild `hospitals` from `medical` before buffering so this cell is idempotent:
# buffering the already-buffered geometry on a re-run would enlarge every zone
# a little more each time the cell is executed.
hospitals = medical[medical['amenity'] == 'hospital'].copy()

# Replace each hospital's geometry with a polygon reaching HOSPITAL_ZONE_RADIUS around it
hospitals['geometry'] = hospitals['geometry'].buffer(HOSPITAL_ZONE_RADIUS)
hospitals.plot()

In [None]:
# Spatial join: pair each case with the hospital zone it falls within, so the
# result carries the hospital's columns alongside the case's own.
# NOTE(review): newer geopandas releases spell this argument `predicate`
# instead of `op` — confirm against the installed version before changing it.
hospital_cases = geopandas.sjoin(left_df=cases, right_df=hospitals, op='within')

hospital_cases.plot(
    column="diagnosis",
    marker="o",
    markersize=64,
    alpha=0.5,
)

![Joins](spatial-join.png)

In [None]:
# Each joined row now carries the hospital's `name` next to the case details;
# show one at random.
hospital_preview_columns = ['doctor_name', 'patient_name', 'diagnosis', 'location', 'name']
hospital_cases[hospital_preview_columns].sample(n=1)

### Let's narrow down to cases that didn't happen at hospitals

In [None]:
# Corners of the whole study area, ending back on the starting corner
study_area_corners = [
    (WEST, NORTH),
    (EAST, NORTH),
    (EAST, SOUTH),
    (WEST, SOUTH),
    (WEST, NORTH),
]
bounding_box = shapely.geometry.Polygon(study_area_corners)

# Merge all hospital zones into one geometry and cut it out of the study area.
# What remains has a hole for every hospital zone, like a slice of Swiss cheese.
all_hospital_zones = hospitals.unary_union
minus_hospital_zones = bounding_box.difference(all_hospital_zones)

# Wrap that single geometry in a one-row GeoDataFrame
area_of_interest = geopandas.GeoDataFrame(
    {
        'geometry': [minus_hospital_zones],
        'name': ['Area of interest'],
    },
    crs={'init': 'epsg:4326'},
)
area_of_interest.plot()

In [None]:
# A second spatial join, this time pairing cases with the area of interest,
# i.e. the study area with the hospital zones cut out.
# NOTE(review): newer geopandas releases spell this argument `predicate`
# instead of `op` — confirm against the installed version before changing it.
non_hospital_cases = geopandas.sjoin(
    left_df=cases,
    right_df=area_of_interest,
    op='within',
)
non_hospital_cases.plot(
    column="diagnosis",
    marker="o",
    markersize=64,
    alpha=0.5,
)

### And let's pull everything together

In [None]:
# Helper imported from `reporting.plot_outbreak`.
# NOTE(review): presumably it redraws the combined map built step by step in
# the cells before this one — confirm in reporting/plot_outbreak.py.
plot_outbreak()

# Day 2
### Not so peaceful

In [None]:
# Run the day-2 report gathering step from `reporting.simulation`, then call
# the outbreak plotting helper again.
# NOTE(review): presumably the plot reflects the newly gathered reports — confirm.
simulation.gather_reports_for_day_2()
plot_outbreak()

### If I were John Snow, I would investigate...

# Day 3

In [None]:
# Run the day-3 report gathering step from `reporting.simulation`, then call
# the outbreak plotting helper again.
# NOTE(review): presumably the plot reflects the newly gathered reports — confirm.
simulation.gather_reports_for_day_3()
plot_outbreak()

# 28 days later

In [None]:
# Run the day-28 report gathering step from `reporting.simulation`, then call
# the outbreak plotting helper again.
# NOTE(review): presumably the plot reflects the newly gathered reports — confirm.
simulation.gather_reports_for_day_28()
plot_outbreak()

# What else could we do?

## Use clustering algorithms to automatically detect outbreaks

Clustering algorithms could detect related diagnoses and give advance warning without human monitoring.

-----------

## Import data from APIs to find the source of an outbreak

Once we've spotted a cluster, we could import data on the kinds of businesses in the area to help identify its source, e.g. the premises behind a food poisoning outbreak.

-----------

## Use historic data to train an AI to recognise outbreaks

If we build up enough examples of outbreaks we could train a classifier to spot them.

-----------
