## Lab infection map
This notebook plots the locations of published laboratory-acquired infections (LAIs) listed in the [ABSA database](https://my.absa.org/LAI). The entries were mostly submitted through web forms, so I clean up the location and agent names before geocoding and mapping them.

In [18]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim
from time import sleep
import pickle
import plotly.express as px
import numpy as np

In [19]:
# Exact-match lookup from a raw "Location where LAI / exposure occurred" string
# to a single place name that is passed on to the geocoder. Entries naming
# several places are collapsed to one of them, and misspellings are corrected.
# The lookup is applied after 'U.S.A.' / 'U.S.A' have been rewritten to 'USA'
# in the raw location column, so keys containing those spellings never match.
# Every key appears exactly once (a repeated key would silently shadow the
# earlier entry).
location_cleaner = {
    'Multiple cities in China': 'China',
    'Pennsulvania and New York in USA': 'Pennsylvania, USA',
    'Fort Dietrick, Maryland, USA': 'Fort Detrick, Maryland, USA',
    'Laboratory for Mycology, Wadsworth Center for Laboratories and Research, New York State Department of Health, Albany, USA': 'Albany, NY',
    'A university in Hunan Province, China': 'Hunan Province, China',
    'Unknown, contributed by author in Boston, USA': 'Boston, MA, USA',
    'Shimoga and Uttara Kannada District in India': 'Uttara Kannada District, India',
    'Geelong, outside Melbourne, Australia': 'Melbourne, Australia',
    'National Institute of Virology in Beijing - China': 'Beijing, China',
    'Allegheny County, Pittsburgh, PA- USA': 'Pittsburgh, PA, USA',
    'United States, South Africa': 'South Africa',
    'Ft. Dietrick, Maryland, USA': 'Fort Detrick, Maryland, USA',
    'Woden Valley, Camberra, Australia.': 'Woden Valley, Canberra, Australia',
    'southern Israel': 'Israel',
    'Defence Research Establishment Suffield, Canada': 'Suffield, Canada',
    'Turku, Finland.': 'Turku, Finland',
    'Virginia, Pennsylvania and Texas (USA)': 'Virginia',
    'Case 1: Broward County, FL, USA and Case 2: Miami-Dade County, FL, USA': 'Broward County, FL, USA',
    'Istambul, Turkey': 'Istanbul, Turkey',
    'Bethesda, Maryland is the location of the first author; the other authors are in 4 other states in the USA.': 'Bethesda, MD, USA',
    'Boston, Massachusets, USA': 'Boston, Massachusetts, USA',
    'Blacksburg, VA and Randolph County': 'Blacksburg, VA, USA',
    'national parks, USA': 'USA',
    'multiple cities, China': 'China',
    'Wiltshire England': 'Wiltshire, England',
    'Cleveland, Ohio. USA': 'Cleveland, OH, USA',
    'CO, WA, CA, MD, TX': 'Colorado, USA',
    'Pensacola Florida, USA': 'Pensacola, Florida, USA',
    'Nairobi, Kenya and Oxford, UK': 'Nairobi, Kenya',
    'California, Colorado, Florida, Georgia, Iowa, Maine, Michigan, Missouri, North Carolina, New Hampshire, New Jersey, New Mexico, New York, Oklahoma, Texas, Washington': 'USA',
    'Korea': 'South Korea',
    'Minesota, USA': 'Minnesota, USA',
    'Chicago, Il': 'Chicago, IL, USA',
    'London. England': 'London, England',
    'Boston Massachusetts': 'Boston, MA, USA',
    'England/Wales': 'Great Britain',
    'Maryland, U.S.': 'Maryland, USA',
    'Not clear, but mostlikely Auburn Univeristy, Alabama USA': 'Auburn, Alabama, USA',
    '1980 case unknown, the second date: Caldwell Veterinary Teaching Center , Idaho': 'Idaho',
    'Most likley USA, no city given.': 'USA',
    'San Francisco, California. USA': 'San Francisco, California, USA',
    'LA, California. USA': 'Los Angeles, CA, USA',
    'Canadian Forces Base, Suffield, Alberta.': 'Suffield, Alberta, Canada',
    'Pensacola, Florida. USA': 'Pensacola, FL, USA',
    '49 states, USA': 'USA',
    'a "country [clinical] laboratory" in Australia': 'Australia',
    'Finland (multiple laboratories)': 'Finland',
    'Washington, D.C. (USA)': 'Washington, DC, USA',
    'United States in NIH-funded laboratories': 'United States',
    'Chicago, IL. USA': 'Chicago, IL, USA',
    'U.K., Wales and Scotland': 'Great Britain',
    'Unknown City, USA': 'USA',
    'Unknown (likely Washington DC or Bethesda, MD, USA)': 'Bethesda, MD, USA',
    'USAMRIID': 'Fort Detrick, Maryland',
    '4 Canadian cities (exact cities not specified)': 'Canada',
    'Unknown, USA': 'USA',
    'CDC': 'Atlanta, Georgia, USA',
    'Unknown (likely La Jolla, CA, USA)': 'La Jolla, CA, USA',
    'Richland, WA.': 'Richland, WA',
    'East Bay, California': 'Oakland, California',
    'Buenos Aires (?), Argentina': 'Buenos Aires, Argentina',
    'U. of Pretoria, 15km North of Pretoria, South Africa': 'Pretoria, South Africa',
    'Several states in USA: Illinois, Indiana, Kentucky, Maine, Massachusetts, Missouri, New Hampshire, New Jersey, Ohio, Pennsylvania, Virginia, Washington, and Wisconsin.': 'USA',
    '38 States in USA : AK, AL, AZ, CA, CO, FL, GA, IA, ID, IL, IN, KS, KY, MA, MD, MI, MN, MO, NC, ND, NE, NH, NJ, N , NV, NY, OH, OK, OR, PA, SC, SD, TN, TX, UT, WA, WI, and WY': 'USA',
    'China, University and hospital settings': 'China',
    'British Laboratories (200 England, 24 Scotland, 11 Wales and 5 Northern Ireland)': 'Great Britain',
    'Massachusetts, Boston, USA': 'Boston, Massachusetts, USA',
    'Philadelphia, PA, USA (authors)': 'Philadelphia, PA, USA',
    'Reading, Berkshire Co., England': 'Reading, Berkshire, England',
    'San Francisco, CA. USA': 'San Francisco, CA, USA',
    'Köln, Federal Republic of Germany': 'Köln, Germany',
    'Indiana, USA; Minnesota, USA': 'Indiana, USA',
    'Southern Maine, USA': 'Maine, USA',
    'Vanderburgh, Indiana. USA': 'Vanderburgh, Indiana, USA',
    'California USA; Massachusetts USA': 'California, USA',
    'Houston, Texas. USA': 'Houston, Texas, USA',
    'Alabama, Michigan, United States': 'Alabama, United States',
    'unknown, USA': 'USA',
    'Recife, PE, Brazil and Paranati, Matto Grosso, Brazil': 'Recife, PE, Brazil',
    '50 states, the District of Columbia, Puerto Rico, the Virgin Islands, Guam, American Samoa, and the Northern Mariana Islands were surveyed': 'USA',
    'Wroclaw region, Poland': 'Wroclaw, Poland',
    'Sverdlovsk, Union of Soviet Socialist Republics': 'Sverdlovsk',
    # Only reachable for un-normalised input; the 'USA' spelling below is the
    # one that matches after the U.S.A. -> USA rewrite.
    'Piladelphia, U.S.A.': 'Philadelphia, USA',
    'Piladelphia, USA': 'Philadelphia, USA',
    'Various states and provinces in the USA and Canada': 'USA',
    'Cayenne, French Guiyana': 'French Guiana',
    'Nice Cedex, France': 'Nice, France',
    'Winnipeg, Manitoba, Canada; Brandon, Manitoba, Canada': 'Winnipeg, Manitoba, Canada',
    'Maringa, Parana State, Brazil': 'Maringa, Brazil',
    'Germany, Austria and Switzerland': 'Germany',
    'Beidenkopf, Germany': 'Biedenkopf, Germany',
    'Riyahdh, Saudi Arabia': 'Riyadh, Saudi Arabia',
    'Hatfield, Pretoria, South Africia': 'Pretoria, South Africa',
    'Balboa Heights, Panama Canal Zone': 'Panama Canal Zone',
    'Institute of Preventive Medicine, National Defence University, Taipei-Taiwan': 'Taipei, Taiwan',
    'Hokuriku district, Japan': 'Hokuriku, Japan',
    'Tolouse, France': 'Toulouse, France',
}

In [20]:
# Exact-match lookup from a raw "Agent(s) involved" string to the label used
# in the cleaned output. Most entries only fix capitalisation; a few shorten
# long free-text descriptions or expand abbreviations.
# Every key appears exactly once (a repeated key would silently shadow the
# earlier entry).
agent_cleaner = {
    'influenza A (H1N1)': 'Influenza A (H1N1)',
    'herpes B virus': 'Herpes B virus',
    'Case 1 - Leptospira interrogans serogroup Icterohaemorrhagiae strain CF1, Case 2 - Leptospira interrogans serogroup Sejroe serovar hardjo': 'Case 1 - Leptospira interrogans serogroup Icterohaemorrhagiae strain CF1',
    'verotoxin-producing Escherichia coli (VTEC) (O157:H7)': 'Verotoxin-producing Escherichia coli (VTEC) (O157:H7)',
    'E.coli O157:H7': 'Escherichia coli O157:H7',
    'Neisseria Meningitidis. group C': 'Neisseria Meningitidis, group C',
    'Salmonella typhi, Salmonella paratyphi A, Salmonella paratyphi B (the abstract did not specify which caused the 2 laboratory associated cases)': 'Salmonella typhi, Salmonella paratyphi A, Salmonella paratyphi B',
    'simian foamy virus': 'Simian foamy virus',
    'simian foamy virus, human foamy virus': 'Simian foamy virus, Human foamy virus',
    'vesicular stomatitis virus': 'Vesicular stomatitis virus',
    'vaccina virus recombined with hepatitis C virus protein': 'Vaccina virus recombined with Hepatitis C virus protein',
    'recombinant WR strain vaccinia virus expressing VSV(IND serotype) protein N': 'Recombinant WR strain vaccinia virus expressing VSV(IND serotype) protein N',
    'hantavirus': 'Hantavirus',
    'human immunodeficiency virus, HIV': 'Human Immunodeficiency Virus, HIV',
    'Hantavirus: SinNombre(SNV) and Andes': 'Hantavirus: Sin Nombre(SNV) and Andes',
    'arboviruses': 'Arboviruses',
    'chikunguna virus, Dugbe virus, Wesselbron virus, dengue virus': 'Chikungunya virus, Dugbe virus, Wesselbron virus, Dengue virus',
    'arboviruses, Western Equine Encephalitis, Venezuelan Equine Encephalitis, Chikungunya, Mayaro, Louping Ill, Tick-borne encephalitis, Saint Louis encephalitis, Kyansasur Forest, West Nile': 'Arboviruses, Western Equine Encephalitis, Venezuelan Equine Encephalitis, Chikungunya, Mayaro, Louping Ill, Tick-borne encephalitis, Saint Louis encephalitis, Kyansasur Forest, West Nile',
    'vaccinia virus': 'Vaccinia virus',
    'E.coli 0157': 'Escherichia coli 0157',
    'ringworm, Bacillus anthracis, Brucella spp., Erysipelothrix rhusiopathiae, Leptospira spp., Newcastle virus, Chlamydia psittaci, Orf virus, Salmonella, Streptococcus suis, M. tuberculosis, M. bovis': 'Ringworm, Bacillus anthracis, Brucella spp., Erysipelothrix rhusiopathiae, Leptospira spp., Newcastle virus, Chlamydia psittaci, Orf virus, Salmonella, Streptococcus suis, M. tuberculosis, M. bovis',
    'human colonic adenocarcinoma cell line': 'Human colonic adenocarcinoma cell line',
    'Escherichia coli O157 infection': 'Escherichia coli O157',
    'TB, hepatitis A, hepatitis B, non-A non-B hepatitis, rubella, mycoplasma, Campylobacter, paratyphus, Salmonella, Chicken pox (Varicella zoster)': 'Tuberculosis, hepatitis A, hepatitis B, non-A non-B hepatitis, rubella, mycoplasma, Campylobacter, paratyphus, Salmonella, Chicken pox (Varicella zoster)',
    'lymphocytic choriomeningitis virus, Machupo, Sabia, hantavirus, Puumala virus, SARS, influenza B, Coxsackievirus': 'Lymphocytic choriomeningitis virus, Machupo, Sabia, hantavirus, Puumala virus, SARS, influenza B, Coxsackievirus',
    'parvovirus B19, HIV-1': 'Parvovirus B19, HIV-1',
    'foot and mouth disease virus': 'Foot and mouth disease virus',
    'herpes B virus, vaccinia, Lassa': 'Herpes B virus, vaccinia, Lassa',
    'cowpox virus': 'Cowpox virus',
    'norovirus': 'Norovirus',
    'mimivirus': 'Mimivirus',
    'dengue virus': 'Dengue virus',
    'polio virus': 'Polio virus',
    'polio': 'Polio',
    'cell-culture-adapted Hantavirus': 'Cell-culture-adapted Hantavirus',
    'rabies virus, modified live rabies virus': 'Rabies virus, modified live rabies virus',
    'fowl plague virus': 'Fowl plague virus',
    'Ebola virus Zaire, also Machupo, Japanese Encephalitis Virus, Dengue, Lassa, Junin, Rift Valley Fever': 'Ebola virus Zaire, Machupo, Japanese Encephalitis Virus, Dengue, Lassa, Junin, Rift Valley Fever',
    'influenza B virus': 'Influenza B',
    'orf virus': 'Orf virus',
    'HIV, human immunodeficiency virus type 1, HIV-1': 'HIV',
    'vaccinia': 'Vaccinia',
    'bovine spongiform encephalopathy, BSE, prion': 'Bovine spongiform encephalopathy prion',
    'Suspension of the live yeast form of Sporothrix schenckii': 'Sporothrix schenckii',
    'bovine spongiform encephalopathy prion, BSE, mad cow disease, vCJD, variant Creutzfeldt-Jakob disease': 'Bovine spongiform encephalopathy prion, variant Creutzfeldt-Jakob disease',
    'Only first initial was provided- V (suspected to be Marburg Virus)': 'Marburg Virus',
    'prion, BSE, bovine spongiform encephalopathy, CJD, Creutzfeld-Jakob Disease': 'Bovine spongiform encephalopathy prion, variant Creutzfeldt-Jakob disease',
}

In [21]:
def get_geocode(location):
    """Look up ``location`` with the notebook's geocoder, consulting the cache first.

    Relies on the notebook-level names ``locations_to_ignore``,
    ``location_cache`` and ``geolocator``, which must be defined before the
    first call.

    Args:
        location: Cleaned location string to look up.

    Returns:
        ``None`` if ``location`` is listed in ``locations_to_ignore`` or the
        lookup raises; otherwise the cached value for ``location`` or, on a
        cache miss, whatever ``geolocator.geocode`` returns (that value is
        stored in ``location_cache`` before being returned).
    """
    if location in locations_to_ignore:
        return None

    if location in location_cache:
        return location_cache[location]

    try:
        geocode = geolocator.geocode(location, timeout=3)
    except Exception as e:
        # Best effort: report the failure and carry on with the next row.
        # Failures are not cached, so a later run will retry them.
        print(f"Location {location} errored with error {e}")
        return None
    finally:
        # Pause after every request -- failed ones included -- so that a run of
        # errors or timeouts does not turn into back-to-back requests.
        sleep(1)

    location_cache[location] = geocode
    return geocode


In [22]:
def get_year(s):
    """Return the first run of four consecutive digits in ``s`` as an int.

    Args:
        s: Free-text date string.

    Returns:
        The integer value of the first four-digit match, or ``None`` when
        ``s`` contains no four consecutive digits.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'\d{4}', s)
    return int(match.group()) if match else None

## Read data

In [23]:
# Raw text copy/pasted from the two pages at https://my.absa.org/LAI.
# The context manager closes the file even if reading fails.
with open("raw_data.txt", "r") as f:
    lines = f.readlines()

In [24]:
# One list per extracted field; entries are appended in file order.
date_strs = []
locations = []
agents = []
bsls = []

# (line label, index at which the value starts, list that collects the values).
# For the first three fields the value starts one character after the label,
# skipping the separator. For the BSL field it starts directly after the label,
# so the separator is kept here; that column is stripped during cleaning.
_field_specs = (
    ('Date(s) of LAI / exposure:', 27, date_strs),
    ('Location where LAI / exposure occurred:', 40, locations),
    ('Agent(s) involved:', 19, agents),
    ('Biological Safety Level (BSL) for work being performed?:', 56, bsls),
)

for line in lines:
    for label, value_start, values in _field_specs:
        if line.startswith(label):
            # Remove only the line terminator: a final line without a trailing
            # newline must not lose its last real character.
            values.append(line[value_start:].rstrip('\n'))
            break


## Clean data

In [25]:
# Nominatim geocoder client; get_geocode sends its lookups through this object.
# NOTE(review): "MyApp" is a generic user agent -- consider an application-specific
# value; confirm against the geocoding service's usage policy.
geolocator = Nominatim(user_agent="MyApp")

In [26]:
# Geocoding results saved by earlier runs, keyed by cleaned location string.
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# this notebook wrote itself.
try:
    with open('location_cache.pickle', 'rb') as cache_file:
        location_cache = pickle.load(cache_file)
except FileNotFoundError:
    # First run (or the cache was deleted): start empty and let get_geocode
    # fill it in.
    location_cache = {}

# Placeholder location strings that get_geocode never sends to the geocoder.
locations_to_ignore = ['Unknown', 'unknown', 'Various', 'Worldwide', 'Various locations', 'not stated', '',
                       'not known', 'multiple', 'Not stated']

In [27]:
# One row per parsed record; the four source lists are aligned by position.
raw_columns = {
    'orig_date_str': date_strs,
    'orig_location': locations,
    'orig_agent': agents,
    'bsl': bsls,
}
df = pd.DataFrame(raw_columns)

df['year'] = df['orig_date_str'].apply(get_year)
df['cleaned_agent'] = df['orig_agent'].replace(agent_cleaner)

# Collapse both dotted spellings to 'USA' before the lookup. The longer form
# has to be handled first, otherwise 'U.S.A.' would end up as 'USA.'.
df['orig_location'] = (
    df['orig_location']
    .str.replace('U.S.A.', 'USA', regex=False)
    .str.replace('U.S.A', 'USA', regex=False)
)
df['cleaned_location'] = df['orig_location'].replace(location_cleaner)

In [28]:
# Geocode every cleaned location, then persist the (possibly extended) cache so
# later runs can reuse the results. The context manager guarantees the pickle
# is flushed and the file handle closed.
df['geocode'] = df['cleaned_location'].apply(get_geocode)
with open('location_cache.pickle', 'wb') as cache_file:
    pickle.dump(location_cache, cache_file)

In [29]:
def _geocode_field(field):
    """Build an accessor returning ``geocode.<field>``, or None for a falsy geocode."""
    def accessor(geocode):
        return getattr(geocode, field) if geocode else None
    return accessor


# Unpack the geocoder result into plain columns.
df['geocode_location'] = df['geocode'].apply(_geocode_field('address'))
df['latitude'] = df['geocode'].apply(_geocode_field('latitude'))
df['longitude'] = df['geocode'].apply(_geocode_field('longitude'))

# Trim surrounding whitespace from the BSL text and label missing years.
df['bsl'] = df['bsl'].apply(str.strip)
df['year'] = df['year'].fillna('Unknown')

In [30]:
# Save the full cleaned table (all columns currently in df) to an Excel
# workbook, without the index column.
# NOTE(review): the 'geocode' column holds the geocoder's result objects --
# confirm the Excel writer serialises them as intended.
df.to_excel('cleaned_data.xlsx', index=False)

### Map
This builds an interactive Plotly map; the accompanying blog post uses a Datawrapper map instead.

In [31]:
# Drop events whose location is too generic to plot meaningfully (for example a
# whole large country), along with events that have no coordinates.
too_generic = {'worldwide', 'Canada', 'USA', 'United States', 'US', 'United States of America', 'Quebec, Canada'}
is_specific = ~df['cleaned_location'].isin(too_generic)
has_coordinates = df['latitude'].notnull()
df = df[is_specific & has_coordinates]

In [32]:
# Fixed seed so that re-running the notebook reproduces the same map and the
# same exported coordinates.
JITTER_SEED = 0
# Standard deviation of the jitter, in the same units as latitude/longitude.
JITTER_STD = 0.06
_jitter_rng = np.random.default_rng(JITTER_SEED)

df['size'] = 300 # The exact number doesn't matter
# Nudge each point by a small random offset so events that share a location do
# not sit exactly on top of one another.
df['jittered_latitude'] = df['latitude'] + JITTER_STD * _jitter_rng.standard_normal(len(df))
df['jittered_longitude'] = df['longitude'] + JITTER_STD * _jitter_rng.standard_normal(len(df))

In [33]:
# Columns shown in the hover box (True) or explicitly hidden from it (False).
hover_fields = {
    "cleaned_location": True,
    "cleaned_agent": True,
    "year": True,
    "jittered_latitude": False,
    "jittered_longitude": False,
    "size": False,
}
# Human-readable names for the columns that appear in the hover box.
display_labels = {
    'cleaned_agent': 'Agent',
    'cleaned_location': 'Location',
    'year': 'Year',
}

fig = px.scatter_mapbox(
    df,
    lat="jittered_latitude",
    lon="jittered_longitude",
    hover_name="cleaned_agent",
    hover_data=hover_fields,
    labels=display_labels,
    color_discrete_sequence=["#C4C967"],
    zoom=2,
    height=400,
    opacity=1,
    size='size',
    size_max=10,
)
# OpenStreetMap tiles, with the map filling the whole figure (no margins).
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()

In [34]:
# Export only the columns listed here to the map workbook.
map_export_columns = ['year', 'cleaned_agent', 'cleaned_location', 'jittered_latitude', 'jittered_longitude']
df[map_export_columns].to_excel('for_map.xlsx', index=False)