# Code for SwDS dissertation 1 (Part 1)
#### Part 2 of the code can be found in 'R_code.Rmd'

#### Yilun Dong (s1994256), July/2020

In [1]:
# imports
import pandas as pd
import numpy as np
import folium

In [2]:
# Load the study workbook: first worksheet, with row 0 supplying the column names
envdf = pd.read_excel(
    'Data for enteric virus MSc.xlsx',
    sheet_name=0,
    header=0,
)

In [3]:
# helper function
def add_points(m, df, colour):
    """Draw one circle marker on the map ``m`` for every row of ``df``.

    Each row's position comes from its 'LATITUDE' and 'LONGITUDE' columns and
    the circle radius is the integer value found in the third column of
    ``df`` (the callers in this file store the per-coordinate row count
    there). Circles are outlined and filled with ``colour``; the popup text
    is the row's index label.
    """
    for label in df.index:
        position = [df['LATITUDE'][label], df['LONGITUDE'][label]]
        # The size column is addressed by position because its name is
        # whatever the caller's reset_index() produced.
        size = int(df.loc[label, df.columns[2]])
        marker = folium.CircleMarker(
            location=position,
            radius=size,
            popup=str(label),
            color=colour,
            fill=True,
            fill_color=colour,
        )
        marker.add_to(m)

In [4]:
# Remove NAs in `is_coinf`: keep only the rows where that column was recorded
envdf_filtered = envdf[envdf['is_coinf'].notna()]

# Grouping keys reused for the map layers and for Table 1
coordinate_keys = ['LATITUDE', 'LONGITUDE']
city_keys = ['CentrallyCity', 'ProvincialCity']

# coordinates of infections: one row per distinct coordinate pair, with the
# number of rows at that coordinate in the third column
coordinates_filtered = envdf_filtered.groupby(coordinate_keys).size().reset_index()

# geographical distribution of infections (Table 1)
# Stored under a name: a bare expression in the middle of a cell is evaluated
# and then thrown away, so the table would otherwise never be available.
infections_by_city = envdf_filtered.groupby(city_keys).size().reset_index()

# coordinates of coinfections
# NOTE(review): a row counts as a coinfection when `is_coinf` > 1, so the
# column is presumably a count rather than a 0/1 flag -- confirm against the data.
envdf_filtered_coinf = envdf_filtered[envdf_filtered['is_coinf'] > 1]
coordinates_filtered_coinf = envdf_filtered_coinf.groupby(coordinate_keys).size().reset_index()

# geographical distribution of coinfections (Table 1)
coinfections_by_city = envdf_filtered_coinf.groupby(city_keys).size().reset_index()

# set initial position of the map: mean of the distinct infection coordinates
lat_initial_filtered = coordinates_filtered['LATITUDE'].mean()
lon_initial_filtered = coordinates_filtered['LONGITUDE'].mean()

In [5]:
# Figure 1: blue markers for every infection coordinate, then red markers
# for the coinfection coordinates, on a map centred on the mean coordinate
map_centre = [lat_initial_filtered, lon_initial_filtered]
m_1 = folium.Map(location=map_centre, zoom_start=5)
add_points(m=m_1, df=coordinates_filtered, colour='blue')
add_points(m=m_1, df=coordinates_filtered_coinf, colour='red')
m_1

In [6]:
# Find the most/least frequent viruses (Table 2)

# Positions of the per-virus columns inside `envdf`
# NOTE(review): 55..86 is tied to this workbook's column layout -- confirm if the sheet changes.
virus_positions = range(55, 87)

# One row per virus column: its name and the number of rows where it equals 1
frequency_df = pd.DataFrame({
    'virus_name': [envdf.columns[pos] for pos in virus_positions],
    'frequency': [(envdf.iloc[:, pos] == 1).sum() for pos in virus_positions],
})

# Rank from most to least frequent and renumber the rows from 0
frequency_df = (
    frequency_df
    .sort_values(by=['frequency'], ascending=False)
    .reset_index(drop=True)
)

# Each virus's share of the summed frequencies
frequency_df['proportion'] = frequency_df['frequency'] / frequency_df['frequency'].sum()

frequency_df

Unnamed: 0,virus_name,frequency,proportion
0,Rotavirus,151,0.2106
1,Norovirus,89,0.124128
2,Alphatorquevirus,72,0.100418
3,Sapovirus,57,0.079498
4,Betatorquevirus,53,0.073919
5,Enterovirus,52,0.072524
6,Mastadenovirus,51,0.07113
7,Picobirnavirus,37,0.051604
8,Parechovirus,34,0.04742
9,Unclassified virus,26,0.036262


In [7]:
# distribution of common virus (Table 3)

# City columns used as grouping keys, and the virus columns tabulated here
city_keys = ['CentrallyCity', 'ProvincialCity']
common_viruses = ['Rotavirus', 'Norovirus', 'Sapovirus',
                  'Kobuvirus', 'Mastadenovirus', 'Mamastrovirus']

common_virus_df = envdf[city_keys + common_viruses]

# For every virus: keep the rows where its column equals 1 and count them per
# (CentrallyCity, ProvincialCity) pair. One comprehension replaces six
# copy-pasted pipelines so the filter and grouping cannot drift apart.
common_city_counts = {
    virus: common_virus_df[common_virus_df[virus] == 1]
    .groupby(city_keys).size().reset_index()
    for virus in common_viruses
}

# Per-virus tables under their original names
rota_df = common_city_counts['Rotavirus']
noro_df = common_city_counts['Norovirus']
sapo_df = common_city_counts['Sapovirus']
kobu_df = common_city_counts['Kobuvirus']
mastadeno_df = common_city_counts['Mastadenovirus']
mamastro_df = common_city_counts['Mamastrovirus']

In [8]:
# distribution of least frequent virus and unclassified viruses (Table 4)

# City columns used as grouping keys, and the virus columns tabulated here
city_keys = ['CentrallyCity', 'ProvincialCity']
uncommon_viruses = ['Porprismacovirus', 'Protoparvovirus', 'Alphapapillomavirus',
                    'Gemycircularvirus', 'Husavirus', 'Unclassified virus']

uncommon_virus_df = envdf[city_keys + uncommon_viruses]

# For every virus: keep the rows where its column equals 1 and count them per
# (CentrallyCity, ProvincialCity) pair. One comprehension replaces six
# copy-pasted pipelines so the filter and grouping cannot drift apart.
uncommon_city_counts = {
    virus: uncommon_virus_df[uncommon_virus_df[virus] == 1]
    .groupby(city_keys).size().reset_index()
    for virus in uncommon_viruses
}

# Per-virus tables under their original names
porprismaco_df = uncommon_city_counts['Porprismacovirus']
protoparvo_df = uncommon_city_counts['Protoparvovirus']
alphapapilloma_df = uncommon_city_counts['Alphapapillomavirus']
gemycircular_df = uncommon_city_counts['Gemycircularvirus']
husavirus_df = uncommon_city_counts['Husavirus']
uc_df = uncommon_city_counts['Unclassified virus']