# Clean Metadata
This notebook processes raw metadata provided by GISAID into the format required by Augur. It also parses additional information (region, country, and host category) into separate columns.

## Setup

### Imports

In [1]:
import numpy as np
import pandas as pd
import plotnine as p9
import re

# Custom cleaning functions
from data.prep_data.clean_seqs import clean_strain_names
from data.prep_data.clean_seqs import deduplicate


### Files

#### Inputs

In [2]:
# Path to the raw GISAID metadata export (Excel file) that this notebook reads.
input_gisaid = 'data/raw/antarctica_metadata.xls'

#### Outputs



In [3]:
# Destination of the cleaned, tab-delimited metadata file written for Augur.
export_augur = 'data/clean/antarctica_metadata_clean.tsv'

### Parameters

## Load raw data from GISAID

In [4]:
# `pd.read_excel` already returns a DataFrame, so it is used directly
# instead of being re-wrapped in `pd.DataFrame(...)`.
meta_raw = pd.read_excel(input_gisaid)
meta_raw.head()

Unnamed: 0,Isolate_Id,PB2 Segment_Id,PB1 Segment_Id,PA Segment_Id,HA Segment_Id,NP Segment_Id,NA Segment_Id,MP Segment_Id,NS Segment_Id,HE Segment_Id,...,PB2 INSDC_Upload,PB1 INSDC_Upload,PA INSDC_Upload,HA INSDC_Upload,NP INSDC_Upload,NA INSDC_Upload,MP INSDC_Upload,NS INSDC_Upload,HE INSDC_Upload,P3 INSDC_Upload
0,EPI_ISL_18439562,EPI2780114|A/Brown_skua/Bird_Island/128287/202...,EPI2780115|A/Brown_skua/Bird_Island/128287/202...,EPI2780113|A/Brown_skua/Bird_Island/128287/202...,EPI2780117|A/Brown_skua/Bird_Island/128287/202...,EPI2780110|A/Brown_skua/Bird_Island/128287/202...,EPI2780116|A/Brown_skua/Bird_Island/128287/202...,EPI2780112|A/Brown_skua/Bird_Island/128287/202...,EPI2780111|A/Brown_skua/Bird_Island/128287/202...,,...,,,,,,,,,,
1,EPI_ISL_18439563,EPI2780122|A/Brown_skua/Bird_Island/128288/202...,EPI2780123|A/Brown_skua/Bird_Island/128288/202...,EPI2780121|A/Brown_skua/Bird_Island/128288/202...,EPI2780125|A/Brown_skua/Bird_Island/128288/202...,EPI2780118|A/Brown_skua/Bird_Island/128288/202...,EPI2780124|A/Brown_skua/Bird_Island/128288/202...,EPI2780120|A/Brown_skua/Bird_Island/128288/202...,EPI2780119|A/Brown_skua/Bird_Island/128288/202...,,...,,,,,,,,,,
2,EPI_ISL_18439564,EPI2780130|A/Brown_skua/Bird_Island/128289/202...,EPI2780131|A/Brown_skua/Bird_Island/128289/202...,EPI2780129|A/Brown_skua/Bird_Island/128289/202...,EPI2780133|A/Brown_skua/Bird_Island/128289/202...,EPI2780126|A/Brown_skua/Bird_Island/128289/202...,EPI2780132|A/Brown_skua/Bird_Island/128289/202...,EPI2780128|A/Brown_skua/Bird_Island/128289/202...,EPI2780127|A/Brown_skua/Bird_Island/128289/202...,,...,,,,,,,,,,


Keep only the columns we are interested in.

In [5]:
# GISAID columns carried forward; every other column of the raw export is dropped.
retained_cols = [
    'Isolate_Name',
    'Collection_Date',
    'Isolate_Id',
    'Location',
    'Host',
    'Submitting_Lab',
    'Originating_Lab']

# Take an explicit copy: later cells relabel columns and add new ones, and
# those edits should act on an independent frame rather than on a selection
# of `meta_raw` (avoids view-vs-copy ambiguity and the associated warnings).
meta_working = meta_raw[retained_cols].copy()

Standardize column name formatting:
- all lowercase
- underscore for word separation (already true)

In [6]:
# Lower-case every column label (word separators are already underscores).
meta_working.columns = list(map(str.lower, meta_working.columns))

Provide [required columns](https://docs.nextstrain.org/projects/ncov/en/wdl-optionals/analysis/data-prep.html#required-metadata) for Nextstrain.

In [7]:
# Rename the GISAID fields to the column names Nextstrain requires.
meta_working = meta_working.rename(
    columns={'isolate_name': 'strain', 'collection_date': 'date'})

# Every record in this dataset carries the same virus label.
meta_working['virus'] = 'avian_flu'

## Parse metadata

### Parse region and country

In [8]:
# The GISAID location string is ' / '-delimited; the first piece is used as
# the region and the second as the country.
split_columns = meta_working['location'].str.split(' / ', expand=True)

# `expand=True` only creates as many columns as the longest split, so a
# dataset whose locations never contain a second piece has no column 1.
# `.get` falls back to NaN in that case instead of raising KeyError.
meta_working['region'] = split_columns.get(0, default=np.nan)
meta_working['country'] = split_columns.get(1, default=np.nan)

In [9]:
# Preview the first rows to check the newly added region and country columns.
meta_working.head()

Unnamed: 0,strain,date,isolate_id,location,host,submitting_lab,originating_lab,virus,region,country
0,A/Brown_skua/Bird_Island/128287/2023,2023-10-08,EPI_ISL_18439562,Antarctica / Antarctica / South Georgia and th...,Other avian,Animal and Plant Health Agency (APHA),Animal and Plant Health Agency (APHA),avian_flu,Antarctica,Antarctica
1,A/Brown_skua/Bird_Island/128288/2023,2023-10-08,EPI_ISL_18439563,Antarctica / Antarctica / South Georgia and th...,Other avian,Animal and Plant Health Agency (APHA),Animal and Plant Health Agency (APHA),avian_flu,Antarctica,Antarctica
2,A/Brown_skua/Bird_Island/128289/2023,2023-10-08,EPI_ISL_18439564,Antarctica / Antarctica / South Georgia and th...,Other avian,Animal and Plant Health Agency (APHA),Animal and Plant Health Agency (APHA),avian_flu,Antarctica,Antarctica


### Parse host

In [10]:
# Keep the raw GISAID host value under `host_raw` so that `host` can hold the
# coarse category derived from it later in this cell.
# The guard makes the cell safe to re-run: without it, a second run would also
# rename the derived `host` column, leaving two columns called `host_raw`.
if 'host_raw' not in meta_working.columns:
    meta_working = meta_working.rename(columns={'host': 'host_raw'})

# Empirically collected raw host values, grouped by the category they map to
# (avian and non-human mammal).
hosts_avian = [
    'Chicken', 'Other avian', 'Avian',
    'Wild bird', 'Turkey', 'Duck',
    'Gull', 'Gallus gallus domesticus',
    'Goose', 'Penguin', 'Swan',
    'Anas platyrhynchos var. domesticus',
    'Anas cyanoptera', 'Rynchops niger',
    'Cormorant', 'Calidris alba',
    'Larosterna inca',
]
hosts_mammal = ['Other mammals', 'Feline']


def label_host(val):
    """Map a raw host value to a coarse host category.

    Parameters
    ----------
    val : object
        Raw host value; matching is exact (case- and whitespace-sensitive).

    Returns
    -------
    str or float
        'Avian' if `val` is listed in `hosts_avian`, 'Mammal' if it is listed
        in `hosts_mammal`, 'Human' if it equals 'Human', otherwise `np.nan`.
    """
    # Checked in order; the first category whose members contain `val` wins.
    categories = (
        ('Avian', hosts_avian),
        ('Mammal', hosts_mammal),
        ('Human', ['Human']),
    )
    for label, members in categories:
        if val in members:
            return label
    return np.nan

# Derive the coarse host category from each raw host value.
meta_working['host'] = meta_working['host_raw'].apply(label_host)

In [11]:
# Preview the first rows to check the derived host column.
meta_working.head()

Unnamed: 0,strain,date,isolate_id,location,host_raw,submitting_lab,originating_lab,virus,region,country,host
0,A/Brown_skua/Bird_Island/128287/2023,2023-10-08,EPI_ISL_18439562,Antarctica / Antarctica / South Georgia and th...,Other avian,Animal and Plant Health Agency (APHA),Animal and Plant Health Agency (APHA),avian_flu,Antarctica,Antarctica,Avian
1,A/Brown_skua/Bird_Island/128288/2023,2023-10-08,EPI_ISL_18439563,Antarctica / Antarctica / South Georgia and th...,Other avian,Animal and Plant Health Agency (APHA),Animal and Plant Health Agency (APHA),avian_flu,Antarctica,Antarctica,Avian
2,A/Brown_skua/Bird_Island/128289/2023,2023-10-08,EPI_ISL_18439564,Antarctica / Antarctica / South Georgia and th...,Other avian,Animal and Plant Health Agency (APHA),Animal and Plant Health Agency (APHA),avian_flu,Antarctica,Antarctica,Avian


## Clean dataframe
Clean up values.

### Clean strain names

In [12]:
# Run every strain name through the project's `clean_strain_names` helper.
# The function is passed to `apply` directly; wrapping it in
# `lambda x: clean_strain_names(x)` added a call layer with no effect.
meta_working['strain'] = meta_working['strain'].apply(clean_strain_names)

### Deduplicate strains

In [13]:
# De-duplicate on the `strain` column with the project's `deduplicate` helper
# (imported from data.prep_data.clean_seqs). The helper prints record counts
# before and after; which duplicate row it keeps is decided inside the helper
# and is not visible here — TODO confirm if that matters for your data.
meta_working = deduplicate(meta_working, 'strain')

Originally, there were 3 records.
	There were 3 unique records.
	After cleaning, there were 3 records remaining.


## Export for Augur
Export tab-delimited file (TSV) for input into Augur.

- Replace `NaN` values with a question mark character (`?`), as preferred by Augur.
- Drop raw columns that have been parsed to new, expanded columns.
- Put columns in preferred order. This is not necessary for Augur, but provides a standard order for my own human reading.
- **Drop duplicate strains** (already handled in the *Deduplicate strains* step above).

In [14]:
# Columns to export, in the preferred reading order. Raw columns that were
# parsed into new ones (`location`, `host_raw`) are left out on purpose.
col_order = [
    'strain', 'date', 'virus',
    'region', 'country', 'host',
    'isolate_id', 'submitting_lab', 'originating_lab',
]

# Restrict the working frame to the export columns.
meta_working = meta_working[col_order]

# Missing values are written as `?`, then the table is saved as a TSV
# without the index column.
meta_clean = meta_working.fillna(value='?')
meta_clean.to_csv(export_augur, sep='\t', index=False)