# Get Metadata from GISAID .fasta Entries

In [53]:
import pandas as pd
from Bio import SeqIO

In [54]:
input_fasta_file = "../../data/sequences/antigens/clustered_sequences_43024/HApro97"

## Convert to Pandas dataframe
## The file is opened in a `with` block so the handle is closed as soon as parsing
## finishes. SeqIO.parse is lazy, so the records must be consumed inside the block.
with open(input_fasta_file) as fasta_handle:
    h_sequences = SeqIO.parse(fasta_handle, 'fasta')
    ## One (header, sequence) tuple per FASTA entry; the frame is built once from the
    ## full list instead of being grown row by row with .loc.
    fasta_records = [(record.id, str(record.seq)) for record in h_sequences]

seq_df = pd.DataFrame(fasta_records, columns=['header', 'sequence'])

# seq_df.head()

In [55]:
## Example Header: EPI340399|HA|A/chicken/Bangladesh/11rs1984-44/2011|EPI_ISL_98378|AEQ50048|A_/_H5N1
##
## Pipe-separated header fields, by position:
##   0: gisaid_id | 1: gene | 2: metadata | 3: gisaid_isl_id | 4: alt_id | 5: gene_variant
## Only positions 0 and 2 are kept as columns.

## Split header column into parts
## Each header is split once and fields are picked by position through the .str
## accessor. A header with too few fields yields NaN in the affected column instead
## of raising IndexError part-way through the cell.
header_fields = seq_df['header'].str.split('|')
seq_df['gisaid_id'] = header_fields.str[0]
seq_df['metadata'] = header_fields.str[2]

## Split metadata column into parts
## Slash-separated metadata, by position (from the example header above):
##   0: influenza_type | 1: host | 2: location | 3: isolate name | last: year
## The third field is stored as `country`, but the values are free-form locations
## (e.g. Anhui, Klaten, Palu appear alongside Bangladesh and Egypt).
## NOTE(review): a strain name with a different number of '/' parts would shift
## `host` and `country` -- confirm that every entry follows the layout above.
metadata_fields = seq_df['metadata'].str.split('/')
seq_df['host'] = metadata_fields.str[1]
seq_df['country'] = metadata_fields.str[2]
## The year is taken as the LAST element because the middle of the name is not uniform.
seq_df['date'] = metadata_fields.str[-1]

seq_df.head()

Unnamed: 0,header,sequence,gisaid_id,metadata,host,country,date
0,EPI340399|HA|A/chicken/Bangladesh/11rs1984-44/...,VLLLATINLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILE...,EPI340399,A/chicken/Bangladesh/11rs1984-44/2011,chicken,Bangladesh,2011
1,EPI808932|HA|A/chicken/Egypt/Q10781B/2015|EPI_...,MEKIVLLLAIVSIVNSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,EPI808932,A/chicken/Egypt/Q10781B/2015,chicken,Egypt,2015
2,EPI1158808|HA|A/chicken/Anhui/QD1/2014|EPI_ISL...,MEKIVLLLAVVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,EPI1158808,A/chicken/Anhui/QD1/2014,chicken,Anhui,2014
3,EPI340788|HA|A/chicken/Klaten/BBVW-109-II/2010...,MEKIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,EPI340788,A/chicken/Klaten/BBVW-109-II/2010,chicken,Klaten,2010
4,EPI340796|HA|A/chicken/Palu/BBVM-67/2010|EPI_I...,MHKIVLLLATISLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQ...,EPI340796,A/chicken/Palu/BBVM-67/2010,chicken,Palu,2010


In [56]:
## Read in filtered fasta file
filtered_input_fasta_file = "../../data/sequences/antigens/filtered_HApro97_highquality_HA1range_111-256.fasta"

## Convert to Pandas dataframe
## The file is opened in a `with` block so the handle is closed as soon as parsing
## finishes. SeqIO.parse is lazy, so the records must be consumed inside the block.
with open(filtered_input_fasta_file) as filtered_fasta_handle:
    h_sequences_filtered = SeqIO.parse(filtered_fasta_handle, 'fasta')
    ## One (id, trimmed sequence) tuple per FASTA entry; the frame is built once from
    ## the full list instead of being grown row by row with .loc.
    filtered_records = [(record.id, str(record.seq)) for record in h_sequences_filtered]

filtered_seq_df = pd.DataFrame(filtered_records, columns=['gisaid_id', 'sequence_trimmed'])

filtered_seq_df.head()

Unnamed: 0,gisaid_id,sequence_trimmed
0,EPI3178330,LVSSSGTLEFKNESFNWTGVKQNGTSSACKRGSSSSFFSRLNWLTH...
1,EPI243001,LMSSTNHIEKIRIIPRNSWSEHNASSGVSSSCPYNGRSSFYRNVVW...
2,EPI2144291,LLSSTNHFEKIQIIPRSSWSNHEASAGVSSACPYNGRSSFFRNVVW...
3,EPI893474,LMSSTNHFEKIQIIPRSSWSNHDASSGVSSACPFIGRPSFFRNVVW...
4,EPI1552374,LMSSTNHFEKIQIIPRSSWSNHDASSGVSSACPYNGRSSFFRNVVW...


In [57]:
## Inner join to filter out the sequences that are not in the filtered fasta file,
## then keep only the output columns (in this order) and sort by gisaid_id.
## Written as one chained expression: sorting with inplace=True on a frame that was
## produced by column selection can emit SettingWithCopyWarning, and re-running the
## cell would otherwise mutate an already-displayed frame.
## gisaid_id values are strings, so the sort order is lexicographic, not numeric.
## The original merge index labels are kept (no reset_index).
seq_df_filtered = (
    pd.merge(seq_df, filtered_seq_df, on='gisaid_id', how='inner')
    .loc[:, ['gisaid_id', 'host', 'country', 'date', 'sequence_trimmed']]
    .sort_values(by='gisaid_id')
)

seq_df_filtered.head()

Unnamed: 0,gisaid_id,host,country,date,sequence_trimmed
73,EPI101477,chicken,Hebei,2005,LLSRINHFEKIQIIPKSSWSDHGASSGVSSACSYLGKPSFFRNVVW...
130,EPI101843,duck,Viet_Nam,2005,LLSRINHFEKIQIIPKSSWSNHDASSGVSSACPYLGRSSFFRNVVW...
74,EPI101875,duck,Vietnam,5,LLSRINHFEKIQIIPKSSWSNHDASSGVSSACPYLGRSSFFRNVVW...
80,EPI105698,chicken,Mahachkala,2006,LLSRINHFEKIQIIPKSSWSDHEASSGVSSACPYQGRSSFFRNVVW...
81,EPI107813,chicken,Guiyang,2006,LLSRINHFEKIQIIPKSSWPNHEASLGVSSACPYLGESSFFRNVVW...


In [59]:
## Write the filtered metadata table to disk as CSV, leaving out the index column
seq_df_filtered.to_csv(
    path_or_buf='../../data/sequences/antigens/filtered_HApro97_metadata.csv',
    index=False,
)