# Clean Metadata
This notebook converts the raw metadata exported from GISAID into the format required by Augur. It also parses additional fields (region, country, and a simplified host category) into their own columns.

## Setup

### Imports

In [1]:
import numpy as np
import pandas as pd

### Files

#### Inputs

In [2]:
# Path of the raw GISAID metadata spreadsheet read by this notebook.
input_gisaid = "data/raw/northamerica_metadata.xls"

#### Outputs



In [3]:
# Path of the cleaned, tab-delimited metadata file written at the end of this notebook.
export_augur = "data/clean/northamerica_metadata_clean.tsv"

### Parameters

## Load raw data from GISAID

In [4]:
# Load the raw GISAID export. read_excel already returns a DataFrame for a
# single sheet, so no extra pd.DataFrame(...) wrapper is needed.
meta_raw = pd.read_excel(input_gisaid)
meta_raw.head()

Unnamed: 0,Isolate_Id,PB2 Segment_Id,PB1 Segment_Id,PA Segment_Id,HA Segment_Id,NP Segment_Id,NA Segment_Id,MP Segment_Id,NS Segment_Id,HE Segment_Id,...,PB2 INSDC_Upload,PB1 INSDC_Upload,PA INSDC_Upload,HA INSDC_Upload,NP INSDC_Upload,NA INSDC_Upload,MP INSDC_Upload,NS INSDC_Upload,HE INSDC_Upload,P3 INSDC_Upload
0,EPI_ISL_9880015,EPI1985927|Seq1 [organism=Influenza A virus](A...,EPI1985928|Seq2 [organism=Influenza A virus](A...,EPI1985926|Seq3 [organism=Influenza A virus](A...,EPI1985930|Seq4 [organism=Influenza A virus](A...,EPI1985923|Seq5 [organism=Influenza A virus](A...,EPI1985929|Seq6 [organism=Influenza A virus](A...,EPI1985925|Seq7 [organism=Influenza A virus](A...,EPI1985924|Seq8 [organism=Influenza A virus](A...,,...,,,,,,,,,,
1,EPI_ISL_9880019,EPI1985935|Seq1_A/mallard/North Carolina/AH018...,EPI1985936|Seq2_A/mallard/North Carolina/AH018...,EPI1985934|Seq3_A/mallard/North Carolina/AH018...,EPI1985938|Seq4_A/mallard/North Carolina/AH018...,EPI1985931|Seq5_A/mallard/North Carolina/AH018...,EPI1985937|Seq6_A/mallard/North Carolina/AH018...,EPI1985933|Seq7_A/mallard/North Carolina/AH018...,EPI1985932|Seq8_A/mallard/North Carolina/AH018...,,...,,,,,,,,,,
2,EPI_ISL_9880021,EPI1985943|Seq1_A/northern pintail/North Carol...,EPI1985944|Seq2_A/northern pintail/North Carol...,EPI1985942|Seq3_A/northern pintail/North Carol...,EPI1985946|Seq4_A/northern pintail/North Carol...,EPI1985939|Seq5_A/northern pintail/North Carol...,EPI1985945|Seq6_A/northern pintail/North Carol...,EPI1985941|Seq7_A/northern pintail/North Carol...,EPI1985940|Seq8_A/northern pintail/North Carol...,,...,,,,,,,,,,
3,EPI_ISL_9880151,EPI1985951|Seq1_A/gadwall/North Carolina/AH018...,EPI1985952|Seq2_A/gadwall/North Carolina/AH018...,EPI1985950|Seq3_A/gadwall/North Carolina/AH018...,EPI1985954|Seq4_A/gadwall/North Carolina/AH018...,EPI1985947|Seq5_A/gadwall/North Carolina/AH018...,EPI1985953|Seq6_A/gadwall/North Carolina/AH018...,EPI1985949|Seq7_A/gadwall/North Carolina/AH018...,EPI1985948|Seq8_A/gadwall/North Carolina/AH018...,,...,,,,,,,,,,
4,EPI_ISL_9880152,EPI1985959|Seq1_A/northern shoveler/North Caro...,EPI1985960|Seq2_A/northern shoveler/North Caro...,EPI1985958|Seq3_A/northern shoveler/North Caro...,EPI1985962|Seq4_A/northern shoveler/North Caro...,EPI1985955|Seq5_A/northern shoveler/North Caro...,EPI1985961|Seq6_A/northern shoveler/North Caro...,EPI1985957|Seq7_A/northern shoveler/North Caro...,EPI1985956|Seq8_A/northern shoveler/North Caro...,,...,,,,,,,,,,


Keep only the columns we are interested in.

In [5]:
# GISAID columns carried forward into the cleaned metadata.
retained_cols = [
    'Isolate_Name',
    'Collection_Date',
    'Isolate_Id',
    'Location',
    'Host',
    'Submitting_Lab',
    'Originating_Lab',
]

# Take an explicit copy: meta_working is edited in place by later cells, and
# the copy keeps those edits independent of meta_raw and avoids pandas'
# SettingWithCopyWarning on the column subset.
meta_working = meta_raw[retained_cols].copy()

Standardize column name formatting:
- all lowercase
- underscore for word separation (already true)

In [6]:
# Lowercase every column label in place.
meta_working.columns = list(map(str.lower, meta_working.columns))

Provide [required columns](https://docs.nextstrain.org/projects/ncov/en/wdl-optionals/analysis/data-prep.html#required-metadata) for Nextstrain.

In [7]:
# Rename the isolate name and collection date columns to 'strain' and 'date'.
meta_working = meta_working.rename(columns={'isolate_name': 'strain', 'collection_date': 'date'})

# Every record gets the same constant virus label.
meta_working['virus'] = 'avian_flu'

## Parse metadata

### Parse country

In [8]:
# Split the location string on ' / ': the first piece is stored as the region
# and the second as the country.
split_columns = meta_working['location'].str.split(' / ', expand=True)

# Guarantee that columns 0 and 1 exist (filled with NaN when absent) so the
# lookups below cannot raise KeyError if no location has a second component.
# Any further components produced by the split are kept untouched.
split_columns = split_columns.reindex(columns=split_columns.columns.union([0, 1]))

meta_working['region'] = split_columns[0]
meta_working['country'] = split_columns[1]

In [9]:
# Show the distinct raw host labels present in the data.
meta_working.loc[:, 'host'].unique()

array(['Avian', 'Turkey', 'Chicken', 'Pheasant', 'Mallard', 'Duck',
       'Goose', 'Gallus gallus domesticus', 'Domestic goose', 'Ostrich',
       'Other mammals', 'Meleagris gallopavo', 'Seal', 'mammals',
       'Haliaeetus leucocephalus', 'Anser caerulescens', 'Sterna hirundo',
       'Somateria mollissima', 'Anser rossii', 'Lophodytes cucullatus',
       'Aythya affinis', 'Buteo jamaicensis', 'Gallus gallus',
       'Guineafowl', 'Wild bird', 'Branta canadensis', 'Other avian',
       'Leucophaeus', 'Crow', 'Larus smithsonianus', 'Calidris alba',
       'Falco peregrinus', 'Larus marinus', 'Gull', 'Cormorant',
       'Buteo lineatus', 'Host', 'Wild birds', 'Eagle', 'Swan',
       'Larus argentatus', 'Falcon', 'Human', 'Felis catus',
       'Wild waterfowl', 'Anseriformes sp.', 'Larus delawarensis',
       'Cairina moschata', 'Larus', 'Mink', 'Animal'], dtype=object)

### Parse host

In [10]:
# Keep the original GISAID host labels under 'host_raw' so that 'host' can
# hold the simplified category.
# Guarding on 'host_raw' keeps this step safe to re-run: once the simplified
# 'host' column exists, an unguarded second rename would produce two
# 'host_raw' columns.
if 'host_raw' not in meta_working.columns:
    meta_working = meta_working.rename(columns={'host': 'host_raw'})

# Raw host labels observed in the data, grouped by the category they map to.
# Labels counted as avian hosts:
hosts_avian = [
    'Avian',
    'Turkey',
    'Chicken',
    'Pheasant',
    'Mallard',
    'Duck',
    'Goose',
    'Gallus gallus domesticus',
    'Domestic goose',
    'Ostrich',
    'Meleagris gallopavo',
    'Haliaeetus leucocephalus',
    'Anser caerulescens',
    'Sterna hirundo',
    'Somateria mollissima',
    'Anser rossii',
    'Lophodytes cucullatus',
    'Aythya affinis',
    'Buteo jamaicensis',
    'Gallus gallus',
    'Guineafowl',
    'Wild bird',
    'Branta canadensis',
    'Other avian',
    'Leucophaeus',
    'Crow',
    'Larus smithsonianus',
    'Calidris alba',
    'Falco peregrinus',
    'Larus marinus',
    'Gull',
    'Cormorant',
    'Buteo lineatus',
    'Wild birds',
    'Eagle',
    'Swan',
    'Larus argentatus',
    'Falcon',
    'Wild waterfowl',
    'Anseriformes sp.',
    'Larus delawarensis',
    'Cairina moschata',
    'Larus',
]

# Labels counted as (non-human) mammal hosts:
hosts_mammal = [
    'Other mammals',
    'Seal',
    'mammals',
    'Felis catus',
    'Mink',
]


def label_host(val):
    """Collapse a raw host label into a broad host category.

    Parameters
    ----------
    val : object
        A raw host label.

    Returns
    -------
    str or float
        'Avian' if ``val`` is listed in ``hosts_avian``, 'Mammal' if it is
        listed in ``hosts_mammal``, 'Human' if it equals 'Human', and
        ``np.nan`` for anything else. Matching is exact and case-sensitive.
    """
    if val in hosts_avian:
        return 'Avian'
    if val in hosts_mammal:
        return 'Mammal'
    return 'Human' if val == 'Human' else np.nan

# Derive the simplified host category from each raw host label.
meta_working['host'] = meta_working['host_raw'].apply(label_host)

## Clean dataframe
Clean up values.

Clean strain names

In [11]:
# Trim surrounding whitespace, then replace every space inside a strain name
# with an underscore. The string accessor (.str.replace) performs substring
# replacement; plain Series.replace would only match cells whose entire
# value is ' ' and would leave internal spaces untouched.
meta_working['strain'] = (
    meta_working['strain']
    .str.strip()
    .str.replace(' ', '_', regex=False)
)

## Export for Augur
Export tab-delimited file (TSV) for input into Augur.

- Replace `NaN` values with a question mark character (`?`), as preferred by Augur.
- Drop raw columns that have been parsed to new, expanded columns.
- Put columns in preferred order. This is not necessary for Augur, but provides a standard order for my own human reading.
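- **Drop duplicate strains.** Every row whose strain name occurs more than once is removed, so no copy of a duplicated strain is kept.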

In [12]:
# Columns written to the export, in the order they should appear. Columns not
# listed here (such as the raw location and host columns) are left out.
col_order = [
    'strain', 'date', 'virus',
    'region', 'country', 'host',
    'isolate_id', 'submitting_lab', 'originating_lab',
]

meta_working = meta_working[col_order]

# Replace missing values with '?', then discard every row whose strain name is
# not unique. keep=False removes all occurrences of a duplicated strain rather
# than keeping the first or last one.
meta_clean = (
    meta_working
    .fillna('?')
    .drop_duplicates(subset='strain', keep=False)
)

# Write a tab-delimited file with a header row and without the index.
meta_clean.to_csv(export_augur, sep='\t', index=False, header=True)