In [1]:
!pip install -U scikit-learn --user



In [2]:
import pandas as pd
from Bio import SeqIO
import sklearn
import random

# Directory holding the input files; later cells build file paths by string
# concatenation, so the trailing slash is required.
# NOTE(review): this is an absolute, user-specific path — adjust it before
# running the notebook on another machine.
data_folder = '/home/gs69042/CEIRS-Training-Taiwan-2019/CIERStraining-Asia2017HA/Data/'

In [3]:
# Display the scikit-learn version that is actually imported in this kernel.
sklearn.__version__

'1.0.1'

In [4]:
# Read the FASTA sequences and the CSV metadata into dataframes.
#
# The FASTA records are gathered in a plain list and converted to a DataFrame
# in one step: calling DataFrame.append inside the loop copies the whole frame
# on every iteration (quadratic time) and is deprecated since pandas 1.4
# (removed in pandas 2.0).
with open(data_folder + "Asia_2017_HA.fasta") as fasta_file:  # handle is closed on exit
    seq_records = [
        {'id': seq_record.id, 'seq': str(seq_record.seq)}
        for seq_record in SeqIO.parse(fasta_file, 'fasta')  # SeqIO.parse is a generator
    ]
# Passing `columns` keeps the ['id', 'seq'] layout even when the file is empty.
seq_df = pd.DataFrame(seq_records, columns=['id', 'seq'])

metadata_df = pd.read_csv(data_folder + 'Asia_2017_HA_RawData.csv')

# NOTE: There are 250 duplicates in this dataset. The id used in the fasta file
# is not sufficiently unique; the isolate id is probably needed as well.
print(metadata_df[' Isolate name '].nunique())  # distinct isolate names in the metadata
print(len(metadata_df))                         # total metadata rows
print(seq_df.id.nunique())                      # distinct FASTA ids
print(len(seq_df))                              # total FASTA records
# Number of FASTA ids that map to more than one distinct sequence.
print((seq_df.groupby('id').nunique() > 1).sum())
metadata_df.head()

2006
2256
2006
2256
seq    65
dtype: int64


Unnamed: 0,Isolate ID,Isolate name,Location,Collection date,Segment,Segment number,Type,Lineage,Passage details/history,Submitter,Sample ID by sample provider,Sample ID by submitting lab,Last modified,Originating lab,Submitting lab,Identifier,DNA Accession no.,DNA INSDC
0,EPI_ISL_299761,A/AbuDhabi/001/2017,AbuDhabi,10/15/17,HA,4,A/H3N2,,X,GartenKondorRebecca,A/ABUDHABI/001/2017,3000682775,2/20/18,ShaikhKhalifaMedicalCity,CentersforDiseaseControlandPrevention,3000682775_N8K8Y4RH_v1_4,1179733,
1,EPI_ISL_299762,A/AbuDhabi/0013/2017,AbuDhabi,10/15/17,HA,4,A/H3N2,,Original,GartenKondorRebecca,A/ABUDHABI/0013/2017ORIGINAL,3000682748,2/20/18,ShaikhKhalifaMedicalCity,CentersforDiseaseControlandPrevention,3000682748_N8K8Y4NQ_v1_4,1179741,
2,EPI_ISL_299763,A/AbuDhabi/002/2017,AbuDhabi,10/15/17,HA,4,A/H3N2,,Original,GartenKondorRebecca,A/ABUDHABI/002/2017ORIGINAL,3000682739,2/20/18,ShaikhKhalifaMedicalCity,CentersforDiseaseControlandPrevention,3000682739_N8K8Y4MH_v1_4,1179749,
3,EPI_ISL_299769,A/AbuDhabi/007/2017,AbuDhabi,10/22/17,HA,4,A/H3N2,,Original,GartenKondorRebecca,A/ABUDHABI/007/2017ORIGINAL,3000682744,2/20/18,ShaikhKhalifaMedicalCity,CentersforDiseaseControlandPrevention,3000682744_N8K8Y4N6_v1_4,1179797,
4,EPI_ISL_299770,A/AbuDhabi/008/2017,AbuDhabi,10/22/17,HA,4,A/H3N2,,Original,GartenKondorRebecca,A/ABUDHABI/008/2017ORIGINAL,3000682745,2/20/18,ShaikhKhalifaMedicalCity,CentersforDiseaseControlandPrevention,3000682745_N8K8Y4NB_v1_4,1179805,


In [5]:
# Build a composite key on the metadata side and join it against the FASTA ids.
# The key is "<isolate name>_<location>_<dd/mm/20yy>"; an outer join is used so
# that unmatched rows on either side stay visible as nulls.


def _pad_two(part):
    """Prefix a one-character (or empty) date component with "0"."""
    return part if len(part) > 1 else "0" + part


def _to_day_month_year(raw_date):
    """Rearrange an ``m/d/yy`` string into ``dd/mm/20yy``."""
    parts = raw_date.split("/")
    return f'{_pad_two(parts[1])}/{_pad_two(parts[0])}/20{parts[2]}'


metadata_df['concatid'] = (
    metadata_df[' Isolate name ']
    + "_"
    + metadata_df.Location
    + "_"
    + metadata_df[' Collection date '].apply(_to_day_month_year)
)
# Keep a single row per FASTA id after the join.
df = (
    metadata_df
    .merge(seq_df, left_on='concatid', right_on='id', how='outer')
    .drop_duplicates(subset='id')
)

In [6]:
# Drop the Kansas / Russia / Switzerland rows, then draw a seeded random
# sample covering 30% (rounded down) of the Taiwan ids.
excluded_locations = ['Kansas', 'Russia', 'Switzerland']
df_asia = df[~df.Location.isin(excluded_locations)]
random.seed(12)  # fixed seed so the same subset is drawn on every run
taiwan_ids = list(df_asia[df_asia.Location == 'Taiwan'].id)
taiwan_samples = random.sample(taiwan_ids, int(0.3 * len(taiwan_ids)))
len(taiwan_samples)

46

In [7]:
# 250 minus the total number of Taiwan rows in df_asia.
250 - len(df_asia.loc[df_asia.Location == 'Taiwan', 'id'])

94

In [8]:
# Top the subset up to 250 ids with a random draw from the non-Taiwan rows.
# This continues the random stream seeded in the Taiwan sampling cell, so the
# result depends on that cell having been run immediately before.
non_taiwan_ids = list(df_asia[df_asia.Location != 'Taiwan'].id)
other_samples = random.sample(non_taiwan_ids, 250 - len(taiwan_samples))
len(other_samples)

204

In [9]:
# Restrict the merged table to the sampled ids and build the FASTA header
# "<isolate id>|<isolate name>|<location>|<DNA accession no.>|<collection date>".
#
# `.copy()` makes filtered_df an independent frame: assigning a new column to a
# boolean-mask slice of `df` raises pandas' SettingWithCopyWarning and leaves it
# ambiguous whether the write lands on a view or a copy.
filtered_df = df[df.id.isin(other_samples + taiwan_samples)].copy()
filtered_df['beast_id'] = (
    filtered_df['Isolate ID '] + '|' + filtered_df[' Isolate name '] + '|'
    + filtered_df.Location + '|'
    + filtered_df[' DNA Accession no. '].astype(str) + '|'  # cast to str before concatenating
    + filtered_df[' Collection date ']
)
filtered_df['beast_id'].head().values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


array(['EPI_ISL_299775|A/AbuDhabi/015/2017|AbuDhabi|1179844|10/25/17',
       'EPI_ISL_299805|A/AbuDhabi/076/2017|AbuDhabi|1180084|11/6/17',
       'EPI_ISL_299812|A/AbuDhabi/106/2017|AbuDhabi|1180140|11/7/17',
       'EPI_ISL_307867|A/AbuDhabi/221/2017|AbuDhabi|1221443|12/21/17',
       'EPI_ISL_291733|A/Afghanistan/0438/2017|Afghanistan|1142486|11/9/17'],
      dtype=object)

In [None]:
# Write the sampled sequences as FASTA: one ">beast_id" header line followed by
# the sequence on a single line.
# The `with` block closes the file even if a write raises part-way through,
# which the previous manual open()/close() pair did not guarantee.
with open('Asia2017H3N2_subset.fasta', 'w') as fasta_out:
    for beast_id, seq in zip(filtered_df.beast_id, filtered_df.seq):
        fasta_out.write('>' + beast_id + '\n' + seq + '\n')

In [16]:
# Load the aligned FASTA file into a dataframe and drop a fixed set of ids.
#
# Records are gathered in a list and converted to a DataFrame once:
# DataFrame.append inside the loop copies the frame on every iteration and is
# deprecated since pandas 1.4 (removed in pandas 2.0).
with open("/home/gs69042/CEIRS-Training-Taiwan-2019/CIERStraining-Asia2017HA/RAxML/Asia2017H3N2_subset.mafftout.final.fasta") as fasta_file:  # handle is closed on exit
    aligned_records = [
        {'id': seq_record.id, 'seq': str(seq_record.seq)}
        for seq_record in SeqIO.parse(fasta_file, 'fasta')  # SeqIO.parse is a generator
    ]
# NOTE: this rebinds `seq_df`, replacing the unaligned sequences loaded earlier.
seq_df = pd.DataFrame(aligned_records, columns=['id', 'seq'])

print(seq_df.id.nunique())  # number of distinct ids before filtering

# Ids removed from the alignment.
# NOTE(review): the reason for excluding these ten records is not recorded
# here — document it.
excluded_ids = [
    "EPI_ISL_294313|A/Taiwan/1417/2017|Taiwan|1155205|10/15/17",
    "EPI_ISL_278603|A/Guangdong-Chengqu/1649/2017|China|1062586|7/11/17",
    "EPI_ISL_304337|A/Taiwan/1408/2017|Taiwan|1202435|10/17/17",
    "EPI_ISL_299670|A/Cambodia/B1116403/2017|Cambodia|1179512|11/15/17",
    "EPI_ISL_278650|A/Fujian-Fengze/1491/2017|China|1062727|6/14/17",
    "EPI_ISL_294319|A/Taiwan/1433/2017|Taiwan|1155223|10/18/17",
    "EPI_ISL_300806|A/Cambodia/909/2017|Cambodia|1185494|10/5/17",
    "EPI_ISL_365308|A/India/1738050/2017|India|1493360|9/2/17",
    "EPI_ISL_303909|A/Ankara/1050/2017|Turkey|1201146|12/1/17",
    "EPI_ISL_292542|A/Kanagawa/ZC1617/2017|Japan|1146085|2/23/17",
]
seq_df = seq_df[~seq_df.id.isin(excluded_ids)]

250


In [17]:
# Number of distinct ids in seq_df at this point.
seq_df['id'].nunique()

240

In [20]:
# Write the rows of seq_df as FASTA: one ">id" header line followed by the
# sequence on a single line.
# The `with` block closes the file even if a write raises part-way through,
# which the previous manual open()/close() pair did not guarantee.
with open('raxml/Asia2017H3N2_subset.mafftout.final.outcut.fasta', 'w') as fasta_out:
    for seq_id, seq in zip(seq_df.id, seq_df.seq):
        fasta_out.write('>' + seq_id + '\n' + seq + '\n')