# Outline

1. Environment Setup: Creating a conda environment and importing necessary libraries.
2. Folder Creation: Setting up folders for input and output files.
3. Data Loading: Loading metadata from a CSV file and sequence data from a FASTA file.
4. ID Comparison: Identifying which IDs from the metadata are not present in the FASTA file.
5. Sequence Retrieval: Loading another FASTA file and finding missing sequences for the identified IDs.
6. Output: Saving the newly found sequences with their IDs to a new FASTA file.

  conda create -n ancient_dna_env python=3.8 biopython pandas matplotlib numpy jupyter ipython scipy seaborn -y


In [62]:
# Importing necessary libraries for ancient mtDNA analysis
from Bio import SeqIO
import pandas as pd
import os


## Step 1: Folder Creation for Input and Output Files

In [63]:
# Check and create folder for output files
if not os.path.exists('output'):
    os.makedirs('output')

## Step 2: Data Loading

In [64]:
# Load CSV metadata for ancient mtDNA
meta_amtDB = pd.read_csv('data/amtDB/amtdb_metadata.csv', sep=',', header=0)
# Display the first few rows of the metadata for verification
meta_amtDB.head()

# Extract 'identifier' column from the metadata
identifiers_metadata = meta_amtDB['identifier'].tolist()

# Load FASTA file with mtDNA sequences from amtDB
ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta", "fasta")]


## Step 3: ID Comparison

In [65]:
# Compare IDs from metadata and FASTA file to find missing IDs in FASTA
ids_metadata_not_in_fasta = [i for i in identifiers_metadata if i not in ids_seq_fasta]


## Step 4: Sequence Retrieval

In [66]:
# Load FASTA file with mtDNA sequences from another source (e.g., mtdna_reich)
reich_ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta")]

# Find IDs that are missing in the amtDB FASTA but present in the mtdna_reich FASTA
ids_in_both = [i for i in ids_metadata_not_in_fasta if i in reich_ids_seq_fasta]


In [67]:
# Load FASTA file with mtDNA sequences from another source reich
reich_ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta")]

# Find IDs that are missing in the amtDB FASTA but present in the mtdna_reich FASTA
ids_in_both = [i for i in ids_metadata_not_in_fasta if i in reich_ids_seq_fasta]

# retrieve the sequences from the mtdna_reich FASTA file
reich_seqs = []
for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta"):
    if seq_record.id in ids_in_both:
        reich_seqs.append(seq_record.seq)
        
        

## Step 5: Output New Sequences

In [68]:

# There are 404 new sequences to add to our AmtDB
# Create and save these sequences to a new FASTA file using reich_seqs and their corresponding IDs

with open('output/new_sequences.fasta', 'w') as file:
    for id in ids_in_both:
        # Retrieve sequence for 'id' and write to the file
        file.write(f">{id}\n{reich_seqs[ids_in_both.index(id)]}\n")

In [69]:
# show the number of new sequences in the output
print(f"Number of new sequences: {len(ids_in_both)}")

# show the number of new sequences in the fasta file
print(f"Number of new sequences in the fasta file: {len(reich_seqs)}")

# show the number of new sequences in the fasta file using the fasta file
with open('output/new_sequences.fasta', 'r') as file:
    new_seqs = list(SeqIO.parse(file, 'fasta'))
    print(f"Number of new sequences in the fasta file using the fasta file: {len(new_seqs)}")


Number of new sequences: 404
Number of new sequences in the fasta file: 404
Number of new sequences in the fasta file using the fasta file: 404


## Meatadata retrieval

In [70]:
# using meta_amtDB 
meta_amtDB

Unnamed: 0,identifier,alternative_identifiers,country,continent,region,culture,epoch,group,comment,latitude,...,ychr_snps,avg_coverage,sequence_source,mitopatho_alleles,mitopatho_positions,mitopatho_locus,mitopatho_diseases,mitopatho_statuses,mitopatho_homoplasms,mitopatho_heteroplasms
0,RISE509,1622,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,,,,,,,
1,RISE510,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,12705T,12705,MT-CO1,Possible protective factor for normal tension ...,Reported,,
2,RISE511,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,10398G;12372A;9055A;11467G;12308G,10398;12372;9055;11467;12308,MT-ND3;MT-ND5;MT-ATP6;MT-ND4;MT-TL2,PD protective factor / longevity / altered cel...,Reported / lineage L & M marker / also hg IJK;...,+;+;+;+;+,-;-;-;-;+
3,RISE507,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,51.500000,...,,0.0,bam,4640A;11467G;12372A;150T;12308G,4640;11467;12372;150;12308,MT-ND2;MT-ND4;MT-ND5;MT-CR;MT-TL2,LHON / Epilepsy;Altered brain pH / sCJD patien...,Reported;Reported;Reported;Conflicting reports...,+;+;+;+;+,-;-;-;+;+
4,RISE508,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,51.500000,...,,0.0,bam,12372A;12308G;13637G;11467G,12372;12308;13637;11467,MT-ND5;MT-TL2;MT-ND5;MT-ND4,Altered brain pH / sCJD patients;CPEO / Stroke...,Reported;Reported;Reported;Reported,+;+;+;+,-;+;-;-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2536,SX26,,Switzerland,Europe,central Europe,Switzerland_Neolithic,Neolithic,NESw,Final Neolithic/Early Bronze Age,47.450001,...,,0.0,reconstructed,,,,,,,
2537,SX29,,Switzerland,Europe,central Europe,Grossgartach,Neolithic,NESw,Middle neolithic,48.554459,...,,0.0,reconstructed,,,,,,,
2538,SX30,,Switzerland,Europe,central Europe,Grossgartach,Neolithic,NESw,Middle neolithic,48.554459,...,,0.0,reconstructed,,,,,,,
2539,SX32,,Switzerland,Europe,central Europe,Rubane,Neolithic,NESw,Middle neolithic,48.554459,...,,0.0,reconstructed,,,,,,,


In [71]:
meta_amtDB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2541 entries, 0 to 2540
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   identifier               2541 non-null   object 
 1   alternative_identifiers  2031 non-null   object 
 2   country                  2541 non-null   object 
 3   continent                2541 non-null   object 
 4   region                   2541 non-null   object 
 5   culture                  2541 non-null   object 
 6   epoch                    2541 non-null   object 
 7   group                    2541 non-null   object 
 8   comment                  668 non-null    object 
 9   latitude                 2541 non-null   float64
 10  longitude                2541 non-null   float64
 11  sex                      2541 non-null   object 
 12  site                     2541 non-null   object 
 13  site_detail              293 non-null    object 
 14  mt_hg                   

In [72]:
# load metadata from reich data
meta_reich_ind = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.ind', sep='\t', header=None)

In [73]:
meta_reich_ind

Unnamed: 0,0
0,Ne30_genotyping_noUDG M China_AmurRiver_EarlyN
1,Ne61_genotyping_noUDG U China_AmurRiver_BA
2,Ne35_genotyping_noUDG F China_AmurRiver_EarlyN
3,I17622 M Albania_BA_IA_lc
4,I13833 M Albania_EarlyModern_oCa...
...,...
16384,I19456_v54.1_addback M Bulgaria_EBA
16385,S1944.E1.L3_v54.1_addback F Iran_GanjDareh_N
16386,S1951.E1.L3_v54.1_addback F Iran_GanjDareh_N
16387,S7241.E1.L1_v54.1_addback M Vietnam_N_all


In [74]:
meta_reich_anno_1240 = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t', header=0)

  meta_reich_anno_1240 = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t', header=0)


In [75]:
meta_reich_anno_HO = pd.read_csv('data/mitogenomes_reich/v54.1.p1_HO_public/v54.1.p1_HO_public.anno', sep='\t', header=0)

  meta_reich_anno_HO = pd.read_csv('data/mitogenomes_reich/v54.1.p1_HO_public/v54.1.p1_HO_public.anno', sep='\t', header=0)


In [76]:
meta_reich_anno_1240.head(1)

Unnamed: 0,Genetic ID,Master ID,Skeletal code,Skeletal element,"Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]",Publication,"Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)","Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]","Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]","Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",...,Y haplogroup (manual curation in ISOGG format),mtDNA coverage (merged data),mtDNA haplogroup if >2x or published,mtDNA match to consensus if >2x (merged data),Damage rate in first nucleotide on sequences overlapping 1240k targets (merged data),Sex ratio [Y/(Y+X) counts] (merged data),"Library type (minus=no.damage.correction, half=damage.retained.at.last.position, plus=damage.fully.corrected, ds=double.stranded.library.preparation, ss=single.stranded.library.preparation)",Libraries,ASSESSMENT,"ASSESSMENT WARNINGS (Xcontam interval is listed if lower bound is >0.005, ""QUESTIONABLE"" if lower bound is 0.01-0.02, ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if lower bound is >0.02) (mtcontam confidence interval is listed if coverage >2 and upper bound is <0."
0,Ne30_genotyping_noUDG,NE30,AR9.9K_2d.rel.NE-4_deleted,..,2021,MaoFuCell2021,Direct IntCal20,9896,121,"8175-7750 calBCE (8825±30 BP, BA-152174)",...,C,99,D4m,..,12,..,ss.minus,HRR163270,PASS,..


In [77]:
meta_reich_anno_HO.head(1)

Unnamed: 0,Genetic ID,Master ID,Skeletal code,Skeletal element,"Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]",Publication,"Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)","Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]","Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]","Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",...,Y haplogroup (manual curation in ISOGG format),mtDNA coverage (merged data),mtDNA haplogroup if >2x or published,mtDNA match to consensus if >2x (merged data),Damage rate in first nucleotide on sequences overlapping 1240k targets (merged data),Sex ratio [Y/(Y+X) counts] (merged data),"Library type (minus=no.damage.correction, half=damage.retained.at.last.position, plus=damage.fully.corrected, ds=double.stranded.library.preparation, ss=single.stranded.library.preparation)",Libraries,ASSESSMENT,"ASSESSMENT WARNINGS (Xcontam interval is listed if lower bound is >0.005, ""QUESTIONABLE"" if lower bound is 0.01-0.02, ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if lower bound is >0.02) (mtcontam confidence interval is listed if coverage >2 and upper bound is <0."
0,I001.HO,I001,..,..,2016,BroushakiScience2016,Modern,0,0,present,...,..,..,..,..,..,..,..,..,PASS,..


In [78]:
meta_reich_anno_1240.columns

Index(['Genetic ID', 'Master ID', 'Skeletal code', 'Skeletal element',
       'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]',
       'Publication',
       'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)',
       'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
       'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]',
       'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-3

In [79]:
meta_reich_anno_HO.columns

Index(['Genetic ID', 'Master ID', 'Skeletal code', 'Skeletal element',
       'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]',
       'Publication',
       'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)',
       'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
       'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]',
       'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-3

In [80]:
meta_amtDB.columns

Index(['identifier', 'alternative_identifiers', 'country', 'continent',
       'region', 'culture', 'epoch', 'group', 'comment', 'latitude',
       'longitude', 'sex', 'site', 'site_detail', 'mt_hg', 'ychr_hg',
       'year_from', 'year_to', 'date_detail', 'bp', 'c14_lab_code',
       'reference_name', 'reference_link', 'data_link', 'c14_sample_tag',
       'c14_layer_tag', 'ychr_snps', 'avg_coverage', 'sequence_source',
       'mitopatho_alleles', 'mitopatho_positions', 'mitopatho_locus',
       'mitopatho_diseases', 'mitopatho_statuses', 'mitopatho_homoplasms',
       'mitopatho_heteroplasms'],
      dtype='object')

In [81]:
# create a dictionary of the metadata key-value pairs column names and their corresponding values but just for the first row
meta_reich_anno_dict_1240 = meta_reich_anno_1240.iloc[0].to_dict()
meta_reich_anno_dict_1240

{'Genetic ID': 'Ne30_genotyping_noUDG',
 'Master ID': 'NE30',
 'Skeletal code': 'AR9.9K_2d.rel.NE-4_deleted',
 'Skeletal element': '..',
 'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]': '2021',
 'Publication': 'MaoFuCell2021',
 'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)': 'Direct IntCal20',
 'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]': 9896,
 'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]': 121,
 'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age 

In [82]:
# create a dictionary of the metadata key-value pairs column names and their corresponding values but just for the first row
meta_reich_anno_dict_HO = meta_reich_anno_HO.iloc[0].to_dict()
meta_reich_anno_dict_HO

{'Genetic ID': 'I001.HO',
 'Master ID': 'I001',
 'Skeletal code': '..',
 'Skeletal element': '..',
 'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]': '2016',
 'Publication': 'BroushakiScience2016',
 'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)': 'Modern',
 'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]': 0,
 'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]': 0,
 'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number)

In [83]:
# extract column Master ID from meta_reich_anno and save it to a list
master_id_anno_reich_1240 = meta_reich_anno_1240['Master ID'].tolist()

In [84]:
master_id_anno_reich_HO = meta_reich_anno_HO['Master ID'].tolist()

In [85]:
master_id_anno_reich_1240[:5]

['NE30', 'NE61', 'NE35', 'I17622', 'I13833']

In [91]:
master_id_anno_reich_HO[:5]

['I001', 'I002', 'IREJ-T006', 'IREJ-T009', 'IREJ-T022']

In [87]:
# extract column identifier from meta_amtDB and save it to a list
identifier_meta_amtDB = meta_amtDB['identifier'].tolist()

In [88]:
identifier_meta_amtDB[:5]

['RISE509', 'RISE510', 'RISE511', 'RISE507', 'RISE508']

In [89]:
# check for duplicates in each list
duplicates_amtDB = [i for i in identifier_meta_amtDB if identifier_meta_amtDB.count(i) > 1]
duplicates_reich_1240 = [i for i in master_id_anno_reich_1240 if master_id_anno_reich_1240.count(i) > 1]
duplicates_reich_HO = [i for i in master_id_anno_reich_HO if master_id_anno_reich_HO.count(i) > 1]

In [90]:
print(len(duplicates_amtDB), len(duplicates_reich_1240), len(duplicates_reich_HO))


0 6000 8023


In [93]:
# just unique identifiers
unique_identifier_meta_amtDB = set(identifier_meta_amtDB)
unique_master_id_anno_reich_1240 = set(master_id_anno_reich_1240)
unique_master_id_anno_reich_HO = set(master_id_anno_reich_HO)

print(len(unique_identifier_meta_amtDB), len(unique_master_id_anno_reich_1240), len(unique_master_id_anno_reich_HO))

2541 13357 16290
