conda create -n ancient_dna_env python=3.8 biopython pandas matplotlib numpy jupyter ipython scipy seaborn -y

In [1]:
# import libraries for ancient mtDNA analysis
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import os
import sys




In [None]:
# create folders for input and output files

# create folder for input files
if not os.path.exists('input'):
    os.makedirs('input')
    
# create folder for output files
if not os.path.exists('output'):
    os.makedirs('output')
    
    


In [2]:
# load csv metadata for amtDNA
meta_amtDB = pd.read_csv('data/amtDB/amtdb_metadata.csv', sep=',', header=0)

In [3]:
meta_amtDB.head()

Unnamed: 0,identifier,alternative_identifiers,country,continent,region,culture,epoch,group,comment,latitude,...,ychr_snps,avg_coverage,sequence_source,mitopatho_alleles,mitopatho_positions,mitopatho_locus,mitopatho_diseases,mitopatho_statuses,mitopatho_homoplasms,mitopatho_heteroplasms
0,RISE509,1622.0,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,,,,,,,
1,RISE510,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,12705T,12705,MT-CO1,Possible protective factor for normal tension ...,Reported,,
2,RISE511,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,10398G;12372A;9055A;11467G;12308G,10398;12372;9055;11467;12308,MT-ND3;MT-ND5;MT-ATP6;MT-ND4;MT-TL2,PD protective factor / longevity / altered cel...,Reported / lineage L & M marker / also hg IJK;...,+;+;+;+;+,-;-;-;-;+
3,RISE507,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,51.5,...,,0.0,bam,4640A;11467G;12372A;150T;12308G,4640;11467;12372;150;12308,MT-ND2;MT-ND4;MT-ND5;MT-CR;MT-TL2,LHON / Epilepsy;Altered brain pH / sCJD patien...,Reported;Reported;Reported;Conflicting reports...,+;+;+;+;+,-;-;-;+;+
4,RISE508,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,51.5,...,,0.0,bam,12372A;12308G;13637G;11467G,12372;12308;13637;11467,MT-ND5;MT-TL2;MT-ND5;MT-ND4,Altered brain pH / sCJD patients;CPEO / Stroke...,Reported;Reported;Reported;Reported,+;+;+;+,-;+;-;-


In [4]:
# columns 
meta_amtDB.columns

Index(['identifier', 'alternative_identifiers', 'country', 'continent',
       'region', 'culture', 'epoch', 'group', 'comment', 'latitude',
       'longitude', 'sex', 'site', 'site_detail', 'mt_hg', 'ychr_hg',
       'year_from', 'year_to', 'date_detail', 'bp', 'c14_lab_code',
       'reference_name', 'reference_link', 'data_link', 'c14_sample_tag',
       'c14_layer_tag', 'ychr_snps', 'avg_coverage', 'sequence_source',
       'mitopatho_alleles', 'mitopatho_positions', 'mitopatho_locus',
       'mitopatho_diseases', 'mitopatho_statuses', 'mitopatho_homoplasms',
       'mitopatho_heteroplasms'],
      dtype='object')

In [5]:
# extract column identifier and store in list named identifiers_metadata
identifiers_metadata = meta_amtDB['identifier'].tolist()
identifiers_metadata[0:5]

['RISE509', 'RISE510', 'RISE511', 'RISE507', 'RISE508']

In [6]:
# load fasta file with mtDNA sequences from amtDB

for seq_record in SeqIO.parse("data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))



19159
Seq('GGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTC...CGT')
16537
19727
Seq('GTCTATCATCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCG...ACA')
16553
20036
Seq('GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATA')
16569
20351
Seq('GTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCG...ACA')
16553
20374
Seq('CAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTT...TCA')
16559
20810
Seq('GTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCG...TAA')
16549
20832
Seq('CAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTT...AGC')
16531
20850
Seq('TCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGT...AAA')
16545
21981
Seq('CTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGA...AAA')
16531
30900
Seq('TCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTA...ACG')
16564
AED_106
Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTCTCCATGCATTTGG...NNN')
16569
AED_1108
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...NNN')
16569
AED_1119
Seq('NNNCACAGGTCTATCACCCTA

In [7]:
# extract ids and store them in a list
ids_seq_fasta = []
for seq_record in SeqIO.parse("data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta", "fasta"):
    ids_seq_fasta.append(seq_record.id)

In [8]:
len(ids_seq_fasta)

1621

In [9]:
ids_seq_fasta[0:5]


['19159', '19727', '20036', '20351', '20374']

In [10]:
# compare ids from metadata and fasta file
# create a list with ids from metadata that are not in fasta file
ids_metadata_not_in_fasta = []

for i in identifiers_metadata:
    if i not in ids_seq_fasta:
        ids_metadata_not_in_fasta.append(i)
        

In [11]:
len(ids_metadata_not_in_fasta)


920

In [12]:
# ids_metadata_not_in_fasta are the ids of sequences that are in amtdb metadata but not in fasta file so they do not have sequences
# now compare ids_metadata_not_in_fasta with mtdna_reich.fasta if they occur there

# load fasta file with mtDNA sequences from mtdna_reich
# x = 0
# while x < 5:
for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
        


I13559
Seq('GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I24542_d
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ANN')
16569
I16326
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I8122
Seq('NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I14200
Seq('GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I2446
Seq('NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I19043_old
Seq('NNNNNNNGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...NNN')
16569
I20818_old
Seq('NATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ANN')
16569
I21387_old
Seq('NNNNNNNNGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I21393_old
Seq('NATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I21395_old
Seq('GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I3587
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I7527
Seq

I6760.SG
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I6761.SG
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I6762.SG
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I5950
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I5950.DG
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I5950.SG
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I16791
Seq('NNNNNNAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I16792
Seq('NNNNNNNNGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...NNN')
16569
I16184
Seq('NNNNNNNNNNNNNNNNNNNNNTTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I1381
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I1382
Seq('NNTCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...NNN')
16569
I1390
Seq('NNNCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATN')
16569
I1391_enhanced
Seq('NN

In [13]:
reich_ids_seq_fasta = []
for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta"):
    reich_ids_seq_fasta.append(seq_record.id)

In [14]:
reich_ids_seq_fasta[0:5]

['I13559', 'I24542_d', 'I16326', 'I8122', 'I14200']

In [15]:
# compare ids from ids_metadata_not_in_fasta and reich_ids_seq_fasta and count how many ids are in both lists
ids_in_both = []
for i in ids_metadata_not_in_fasta:
    if i in reich_ids_seq_fasta:
        ids_in_both.append(i)

In [16]:
len(ids_in_both)

404

There is 404 new sequences which I need to add to our AmtDB

In [None]:






# extract sequences using ids_metadata_not_in_fasta from mtdna_reich.fasta and store them in a new fasta file called 'new_sequences_from_mtdna_reich.fasta'

for seq_record in SeqIO.parse("data/mtdna_reich.fasta", "fasta"):
    if seq_record.id in ids_metadata_not_in_fasta:
        print(seq_record.id)
        print(repr(seq_record.seq))
        print(len(seq_record))
        with open("data/mitogenomes_reich/mtdna_reich.fasta", "a") as handle:
            SeqIO.write(seq_record, handle, "fasta")

