## Import pandas, NumPy and matplotlib

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Load the k13 data into a pandas DataFrame

In [2]:
## Read the k13 surveyor spreadsheet into a DataFrame
k13_xls_path = 'data/k13 surveyor data 02.xls'
k13 = pd.read_excel(k13_xls_path)

In [3]:
## Preview the column headers together with the first three rows
k13.head(n=3)

Unnamed: 0,study id,site,mutation,present,tested,country,continent,lon,lat,year,title,authors,pubMedId,val,estLoc
0,PSHHG,Chin State,wildtype,60,62,Myanmar,Asia,92.848,21.3,2014,Spread of artemisinin-resistant Plasmodium fa...,"Tun KM, Imwong M, Lwin KM, Win AA, Hlaing TM, ...",25704894,False,0
1,PSHHG,Chin State,P574L,2,62,Myanmar,Asia,92.848,21.3,2014,Spread of artemisinin-resistant Plasmodium fa...,"Tun KM, Imwong M, Lwin KM, Win AA, Hlaing TM, ...",25704894,True,0
2,PSHHG,Kachin State,wildtype,19,30,Myanmar,Asia,97.391632,25.371031,2013,Spread of artemisinin-resistant Plasmodium fa...,"Tun KM, Imwong M, Lwin KM, Win AA, Hlaing TM, ...",25704894,False,0


In [4]:
## Report the number of rows in the raw data.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are %d rows" % len(k13))

There are 1038 rows


In [5]:
## List every distinct country in the raw data.
## The text is built by concatenation and printed as one argument so the output
## is the same on Python 2 and Python 3 (including non-ASCII country names).
print("The complete list of countries is: \n\n" + ', '.join(k13['country'].unique().tolist()))


The complete list of countries is: 

Myanmar, Bangladesh, Thailand, Lao People's Democratic Republic, Viet Nam, Nigeria, India, Democratic Republic of the Congo, Cambodia, Kenya, Mali, Gambia, Ghana, Burkina Faso, Uganda, Malawi, Tanzania, Côte d'Ivoire, Gabon, Senegal, Angola, China, Ethiopia, Guyana, Comoros, Eritrea, Equatorial Guinea, Afghanistan, Benin, Guinea, Guinea-Bissau, Liberia, Mauritania, Niger, Sierra Leone, Cameroon, Togo, Chad, Congo, Burundi, Rwanda, Sudan, Somalia, South Sudan, Mozambique, South Africa, Zambia, Zimbabwe, Madagascar, Iran, Nepal, Indonesia, Philippines, Papua New Guinea, Solomon Islands, Colombia, Ecuador, Venezuela, French Guiana, Peru


### Separate Myanmar, Thailand, Cambodia, Laos and Vietnam

In [6]:
## Keep only the rows whose country is one of the five selected countries
selected_countries = [
    'Myanmar',
    'Thailand',
    'Cambodia',
    "Lao People's Democratic Republic",
    'Viet Nam',
]
k13_countries = k13.loc[k13['country'].isin(selected_countries)].copy()

In [7]:
## Report the number of rows left after the country filter.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are now %d rows" % len(k13_countries))

There are now 608 rows


In [8]:
## List the distinct countries that remain after the country filter.
## One concatenated argument keeps the output the same on Python 2 and Python 3.
print("The list of countries is now: \n\n" + ", ".join(k13_countries['country'].unique().tolist()))

The list of countries is now: 

Myanmar, Thailand, Lao People's Democratic Republic, Viet Nam, Cambodia


### Get only the true values

Remove any row where val = False

In [9]:
## Keep only the rows whose `val` flag is True (independent copy)
val_is_true = k13_countries['val'] == True
k13_countries_val = k13_countries.loc[val_is_true].copy()

In [10]:
## Report the number of rows left after keeping val == True.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are now %d rows" % len(k13_countries_val))

There are now 337 rows


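### Include only the validated and candidate mutations

Remove any rows where the mutation is classified as "Not associated", i.e. keep only the validated mutations (such as N458Y, R539T, I543T or C580Y) and the candidate mutations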

From [WHO pdf](http://apps.who.int/iris/bitstream/handle/10665/255213/WHO-HTM-GMP-2017.9-eng.pdf)

| K13 MUTATION 	| CLASSIFICATION	|
|---------------|-------------------|
| E252Q 		| Not associated	|
| P441L 		| Candidate			|
| F446I 		| Candidate			|
| G449A 		| Candidate			|
| N458Y 		| **Validated**		|
| Y493H 		| **Validated**		|
| G538V 		| Candidate			|
| R539T 		| **Validated**		|
| I543T 		| **Validated**		|
| P553L 		| Candidate			|
| R561H 		| Candidate			|
| V568G 		| Candidate			|
| P574L 		| Candidate			|
| A578S 		| Not associated	|
| C580Y 		| **Validated**		|
| A675V 		| Candidate			|


In [11]:
## Sorted list of the distinct mutation labels among the remaining rows
listOfMutations = sorted(k13_countries_val.mutation.unique().tolist())

## One concatenated argument keeps the output the same on Python 2 and Python 3.
print("The complete list of mutations is: \n" + ", \n".join(listOfMutations))

The complete list of mutations is: 
A481V, 
A675V, 
C580C/Y, 
C580Y, 
D584V, 
E252Q, 
F446I, 
F614L, 
G449A/D, 
G538V, 
H719N, 
I543T, 
M476I, 
N458Y, 
N525D, 
N537I, 
P441L, 
P553L, 
P574L, 
R539R/T, 
R539T, 
R561H/C, 
V568G


Note: Y493H and A578S are not present in the list of mutations 

In [12]:
## Create a function to classify the mutations

## Classification of each K13 mutation label, following the WHO table above.
## The slash-separated labels (e.g. "C580C/Y") are listed exactly as they
## appear in the data set and get the class of the mutation they contain.
_K13_MUTATION_CLASSES = {
    "E252Q": "Not associated",
    "P441L": "Candidate",
    "F446I": "Candidate",
    "G449A": "Candidate",
    "G449A/D": "Candidate",
    "N458Y": "Validated",
    "Y493H": "Validated",
    "G538V": "Candidate",
    "R539T": "Validated",
    "R539R/T": "Validated",
    "I543T": "Validated",
    "P553L": "Candidate",
    "R561H": "Candidate",
    "R561H/C": "Candidate",
    "V568G": "Candidate",
    "P574L": "Candidate",
    "A578S": "Not associated",
    "C580Y": "Validated",
    "C580C/Y": "Validated",
    "A675V": "Candidate",
}


def classifyMutation(mutation):
    """Return the WHO classification of a K13 mutation label.

    Parameters
    ----------
    mutation : str
        Mutation label as it appears in the ``mutation`` column,
        e.g. ``"C580Y"`` or ``"R539R/T"``.

    Returns
    -------
    str
        ``"Validated"``, ``"Candidate"`` or ``"Not associated"``. Any label
        that is not in the lookup table (including missing values) is
        reported as ``"Not associated"``.
    """
    return _K13_MUTATION_CLASSES.get(mutation, "Not associated")


In [13]:
## Classify every mutation label and store the result in a new column
mutation_classes = k13_countries_val["mutation"].apply(classifyMutation)
k13_countries_val["classification"] = mutation_classes

In [14]:
## Drop the "Not associated" rows, leaving the validated AND candidate mutations
is_associated = k13_countries_val["classification"] != "Not associated"
k13_countries_val_mutations = k13_countries_val[is_associated]

In [15]:
## Small lookup table: each remaining mutation label with its classification
filteredMutations = sorted(k13_countries_val_mutations.mutation.unique().tolist())
mutations_df = pd.DataFrame({"mutation": filteredMutations})
mutations_df["classification"] = mutations_df["mutation"].apply(classifyMutation)

## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("The list of mutations is now: \n")
mutations_df

The list of mutations is now: 



Unnamed: 0,mutation,classification
0,A675V,Candidate
1,C580C/Y,Validated
2,C580Y,Validated
3,F446I,Candidate
4,G449A/D,Candidate
5,G538V,Candidate
6,I543T,Validated
7,N458Y,Validated
8,P441L,Candidate
9,P553L,Candidate


In [16]:
## Report the number of rows left after the classification filter.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are now %d rows" % len(k13_countries_val_mutations))

There are now 291 rows


### Exclude sample size less than 20

In [17]:
## Discard rows with fewer than 20 tested samples (keep tested >= 20)
has_enough_samples = k13_countries_val_mutations["tested"] >= 20
k13_countries_val_mutations_size = k13_countries_val_mutations[has_enough_samples]

In [18]:
## Report the number of rows left after the sample-size filter.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are now %d rows" % len(k13_countries_val_mutations_size))

There are now 235 rows


In [19]:
## Sanity check: the `min` of `tested` reported below must be at least 20
k13_countries_val_mutations_size["tested"].describe()

count    235.000000
mean      94.289362
std      113.665319
min       27.000000
25%       43.000000
50%       67.000000
75%      107.000000
max      882.000000
Name: tested, dtype: float64

### Exclude prevalence less than 5%

Prevalence is present / tested

In [20]:
## Prevalence as a fraction (present / tested); keep rows at 5% or above
prevalence_fraction = (
    k13_countries_val_mutations_size.present / k13_countries_val_mutations_size.tested
)
k13_countries_val_mutations_size_prevalence = k13_countries_val_mutations_size[
    prevalence_fraction >= 0.05
]

In [21]:
## Report the number of rows left after the prevalence filter.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are now %d rows" % len(k13_countries_val_mutations_size_prevalence))

There are now 120 rows


In [22]:
## Recompute the prevalence in percent to verify that every row is at 5% or above
present_counts = k13_countries_val_mutations_size_prevalence["present"]
tested_counts = k13_countries_val_mutations_size_prevalence["tested"]
k13_prevalence = (present_counts / tested_counts) * 100

In [23]:
## Summarise the prevalence values (in percent); the reported min should be at least 5
k13_prevalence.describe()

count    120.000000
mean      22.490639
std       21.591921
min        5.000000
25%        7.607466
50%       12.500000
75%       26.707317
max       88.095238
dtype: float64

In [26]:
## Keep only the coordinate (lon/lat), year and pubMedId columns, as an independent copy
kept_columns = ["lon", "lat", "year", "pubMedId"]
reduced = k13_countries_val_mutations_size_prevalence[kept_columns].copy()

In [29]:
## Preview the first ten rows of the reduced table
reduced.head(10)

Unnamed: 0,lon,lat,year,pubMedId
3,97.391632,25.371031,2013,25704894
9,97.391632,25.371031,2014,25704894
16,98.121568,16.040823,2013,25704894
23,98.121568,16.040823,2013,25704894
27,98.121568,16.040823,2013,25704894
38,98.121568,16.040823,2014,25704894
50,94.923889,24.888889,2013,25704894
72,98.711517,16.29706,2013,25704894
81,106.908012,11.755278,2011,25075834
84,106.908012,11.755278,2011,25075834


In [28]:
## Report the number of rows in the reduced table.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are now %d rows" % len(reduced))

There are now 120 rows


In [47]:
## Keep one row per (lon, lat, pubMedId) combination; the first occurrence wins.
## NOTE(review): "year" is not part of the key, so rows for other years at the
## same site/publication are dropped -- confirm this is intended.
reduced = reduced.drop_duplicates(subset=["lon", "lat", "pubMedId"], keep='first')

In [48]:
## Preview the first ten rows after removing duplicates
reduced.head(10)

Unnamed: 0,lon,lat,year,pubMedId
3,97.391632,25.371031,2013,25704894
16,98.121568,16.040823,2013,25704894
50,94.923889,24.888889,2013,25704894
72,98.711517,16.29706,2013,25704894
81,106.908012,11.755278,2011,25075834
103,98.568001,16.714899,2011,25075834
118,102.60778,12.85811,2011,25075834
126,104.845462,14.00858,2011,25075834
131,103.918999,12.5383,2011,25075834
154,96.52182,20.330143,2012,25075834


In [49]:
## Report the number of rows left after removing duplicates.
## Single-argument print(...) gives the same output on Python 2 and Python 3.
print("There are now %d rows" % len(reduced))

There are now 42 rows


## Export to csv

In [50]:
## Write the reduced table to CSV, leaving out the DataFrame index
output_csv_path = "data/k13_by_year_reduced.csv"
reduced.to_csv(output_csv_path, index=False)