In [1]:
# set dir to /home/dodo/projects/aDNA_Comparative_Analysis
import os
import pandas as pd
os.chdir('../../aDNA_Comparative_Analysis')
import re

In [2]:
from src.data_loading import load_data, load_mt_dataset
from src.data_processing import DataProcessor, MetadataMatcher

In [3]:
# metadata_file -> AmtDB csv metadata file
# fasta_amtdb -> AmtDB fasta file
# anno_file -> Reich annotation file = metadata file
# fasta_reich -> Reich fasta file

metadata_file = 'data/amtDB/amtdb_metadata.csv'
fasta_amtdb = 'data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta'
anno_file = 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'
fasta_reich = 'data/mitogenomes_reich/mtdna_reich.fasta'


In [4]:
meta_amtDB, ids_seq_fasta = load_data(metadata_file, fasta_amtdb)
ids_mt_dataset, meta_mt_dataset = load_mt_dataset(fasta_reich, anno_file)


Loading AmtDB metadata from 'data/amtDB/amtdb_metadata.csv' and sequence IDs from 'data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta'...
Loaded AmtDB metadata with 2541 records and 1621 sequences.

Loading 'Reich mt dataset' from 'data/mitogenomes_reich/mtdna_reich.fasta' and metadata from 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'...
Loaded 'Reich mt dataset' with 4122 sequences and metadata with 16388 records.



In [5]:
# Process AmtDB data to find missing sequences
processor_amtDB = DataProcessor(meta_amtDB, fasta_amtdb)
missing_ids_amtDB = processor_amtDB.find_missing_sequences()


Found 920 missing sequences.



In [6]:
# intersection of missing sequences in AmtDB and Master ID from meta_mt_dataset
missing_ids_amtDB_reich = list(set(missing_ids_amtDB) & set(meta_mt_dataset['Master ID']) )
len(missing_ids_amtDB_reich)


529

In [7]:


# Extract missing sequences from the Reich dataset (fasta_reich) and save them
output_fasta = "output/missing_sequences_AmtDB.fasta"
processor_amtDB.extract_and_save_sequences(fasta_reich, missing_ids_amtDB, output_fasta)

Extracting sequences matching the specified IDs from 'data/mitogenomes_reich/mtdna_reich.fasta'...
Saved 404 sequences to 'output/missing_sequences_AmtDB.fasta'.



In [8]:
!haplogrep3 classify --tree phylotree-rcrs@17.0 --in output/missing_sequences_AmtDB.fasta --out output/analysis_result.hsd --extend-report



Haplogrep 3 3.2.1
(c) 2022-2023 Sebastian Schönherr, Hansi Weissensteiner, Lukas Forer
[M::bwa_idx_load_from_disk] read 0 ALT contigs
Written haplogroups to file output/analysis_result.hsd


In [9]:
# load analysis_result.hsd
analysis_result = pd.read_csv('output/analysis_result.hsd', sep='\t')
analysis_result.head(2)


Unnamed: 0,SampleID,Haplogroup,Rank,Quality,Range,Not_Found_Polys,Found_Polys,Remaining_Polys,AAC_In_Remainings,Input_Sample
0,I0070,H13a1a,1,0.9691,1-16569,,263G 750G 1438G 2259T 4745G 4769G 8860G 13680T...,310C (localPrivateMutation) 3107T (globalPriva...,,263G 310C 750G 1438G 2259T 3107T 4745G 4769G 8...
1,I0071,U5a1,1,0.9644,1-16569,,73G 263G 750G 1438G 2706G 3197C 4769G 7028T 88...,310C (localPrivateMutation) 3107T (globalPriva...,8705C [M60T| Codon 2 | ATP6 ],73G 263G 310C 750G 1438G 2706G 3107T 3197C 476...


In [10]:
# columns
analysis_result.columns


Index(['SampleID', 'Haplogroup', 'Rank', 'Quality', 'Range', 'Not_Found_Polys',
       'Found_Polys', 'Remaining_Polys', 'AAC_In_Remainings', 'Input_Sample'],
      dtype='object')

In [11]:

# Rename 'SampleID' column to 'ID'
analysis_result.rename(columns={'SampleID': 'ID'}, inplace=True)


In [12]:
# Use 'Found_Polys' as the 'Polymorphisms' column directly
# Assuming 'Found_Polys' accurately represents the polymorphisms we want to include
# If a combination of columns is needed, you may need to concatenate them appropriately here
analysis_result['Polymorphisms'] = analysis_result['Found_Polys']

In [13]:
!python3 mitopatho/mitopatho.py -i output/processed_haplogroups.hsd -o output/mitopatho_output.txt



###########################################################################
# MitoPathoPy - Tool for annotating pathological mutations in human mtDNA #
###########################################################################
Author: Edvard Ehler, PhD (edvard.ehler@img.cas.cz)
Year: 2020
Source: https://github.com/EdaEhler/MitoPathoPy

This tool will search for the (potential) pathological mutations in you mtDNA samples. To get the right input format, please, use the Haplogrep 2 (https://haplogrep.i-med.ac.at/app/index.html) tool to turn your sequences (fastas) into HSD format file. Haplogrep can be also downloaded and run localy (https://github.com/seppinho/haplogrep-cmd). Pathological mtDNA mutations are taken from data published at https://www.mitomap.org/MITOMAP.

We invite you to check our ancient mtDNA database at: https://amtdb.org/

If you use this tool in you research, please, consider citing our article:
Ehler E, Novotný J, Juras A, Chyleński M, Moravčík O, Pačes J. AmtDB:

In [14]:
# load mitopatho_output.txt
mitopatho_output = pd.read_csv('output/mitopatho_output.txt', sep='\t')


In [15]:
# pprint first two rows of the DataFrame but full data 
id_1 = mitopatho_output[mitopatho_output['ID'] == 'I0071']
# print second column of the first row
id_1.iloc[0, 1]

'11467G,Altered brain pH / sCJD patients,Reported;12308G,CPEO / Stroke / CM / Breast & Renal & Prostate Cancer Risk / Altered brain pH /sCJD,Reported;12372A,Altered brain pH / sCJD patients,Reported;16192T,Melanoma patients,Reported;16270T,Melanoma patients,Reported'

In [16]:
missing_ids_amtDB = processor_amtDB.find_missing_sequences()
ids_mt_dataset, meta_mt_dataset = load_mt_dataset(fasta_reich, anno_file)


Found 920 missing sequences.

Loading 'Reich mt dataset' from 'data/mitogenomes_reich/mtdna_reich.fasta' and metadata from 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'...


Loaded 'Reich mt dataset' with 4122 sequences and metadata with 16388 records.



In [17]:
# how many missing_ids_amtDB in the reich dataset
missing_ids_amtDB_in_reich = [id for id in missing_ids_amtDB if id in meta_mt_dataset['Master ID'].values]
len(missing_ids_amtDB_in_reich)


529

In [18]:
import pandas as pd
import re

def parse_publication_name(reference):
    # Extracting the first name by finding the first capitalized word
    # and the year by finding the four-digit number
    first_name = re.search(r"[A-Z][a-z]+", reference).group()
    year = re.search(r"\d{4}", reference).group()
    
    # Creating the formatted string
    formatted_ref = f"{first_name} et al. {year}"
    
    return formatted_ref

def match_reich_metadata(reich_meta_file, missing_ids, mitopatho_csv):
    print("Reading Reich metadata...")
    reich_meta = pd.read_csv(reich_meta_file, sep='\t', header=0, low_memory=False)
    mitopatho = pd.read_csv(mitopatho_csv)
    print(f"Reich metadata has {len(reich_meta)} rows.")
    print(f"MitoPatho data has {len(mitopatho)} rows.")
    
    # Use set for efficient searching
    missing_ids_set = set(missing_ids)
    
    matched_data = []
    
    for _, row in reich_meta.iterrows():
        master_id = row['Master ID']
        if master_id not in missing_ids_set:
            continue
        
        # Date and C14 code extraction
        date_range_match = re.search(r'(\d{4}\u00b1\d{2})', row['Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE'])
        if date_range_match:
            date_range = date_range_match.group(1)
            bp, error = date_range.split('\u00b1')
            year_from = -int(bp.replace('BP', '').strip())  # Assume the presence of 'BP' indicates Before Present
            year_to = -(int(bp.replace('BP', '').strip()) - int(error) * 2)
        else:
            continue  # Skip this entry if no date range match is found

        date_detail = row['Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE']
        c14_lab_code_match = re.search(r'Ua-(\w+)', date_detail)
        c14_lab_code = c14_lab_code_match.group(1) if c14_lab_code_match else 'nan'
        c14_sample_tag = 1 if year_to > 1950 else 0
            
        # Aggregate data from MitoPatho if available
        mito_filtered = mitopatho[mitopatho['ID'] == master_id]
        mito_agg = {col: ";".join(mito_filtered[col].fillna('NaN').astype(str).unique()) for col in ['Polymorphism', 'Position', 'Locus', 'Diseases', 'Status', 'Homoplasmy', 'Heteroplasmy']}
        
        # Construct the matched data entry with Reich and MitoPatho data
        matched_data.append({
            'identifier':               row['Master ID'], 
            'alternative_identifiers':  row.get('Genetic ID', ""), 
            'country':                  row.get('Political Entity', ""), 
            'continent':                "",
            'region':                   "",
            'culture':                  row.get('Group ID', ""),
            'epoch':                    "",
            'group':                    row.get('Group ID', ""),
            'comment':                  "",
            'latitude':                 row.get('Lat.', ""),
            'longitude':                row.get('Long.', ""), 
            'sex':                      row.get('Molecular Sex', ""), 
            'site':                     row.get('Locality', ""), 
            'site_detail':              "",
            'mt_hg':                    row.get('mtDNA haplogroup if >2x or published', ""),
            'ychr_hg':                  row.get('Y haplogroup (manual curation in ISOGG format)', ""),
            'year_from':                year_from, 
            'year_to':                  year_to, 
            'date_detail':              row.get('Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE', ""),
            'bp':                       bp, 
            'c14_lab_code':             c14_lab_code,
            'reference_name':           parse_publication_name(row.get('Publication', "")),
            'reference_link':           "",
            'data_link':                "",
            'c14_sample_tag':           c14_sample_tag,
            'c14_layer_tag':            0,
            'ychr_snps':                row.get('Y haplogroup (manual curation in terminal mutation format)', ""),
            'avg_coverage':             row.get('mtDNA coverage (merged data)', ""),
            'sequence_source':          'fasta',
            'reference_name': parse_publication_name(row.get('Publication', "")),
            # Fields aggregated from MitoPatho
            'mitopatho_alleles':        mito_agg['Polymorphism'],
            'mitopatho_positions':      mito_agg['Position'],
            'mitopatho_locus':          mito_agg['Locus'],
            'mitopatho_diseases':       mito_agg['Diseases'],
            'mitopatho_statuses':       mito_agg['Status'],
            'mitopatho_homoplasms':     mito_agg['Homoplasmy'],
            'mitopatho_heteroplasms':   mito_agg['Heteroplasmy']
        })

    print(f"Processed {len(matched_data)} out of {len(missing_ids)} missing IDs.")
    matched_df = pd.DataFrame(matched_data)
    print(f"Matched {len(matched_df)} rows with the given missing IDs.")
    return matched_df



In [19]:
# Example usage:
reich_meta_file_path = 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'
mitopatho_csv_path = 'output/AmtDB MitoPathoTool.csv'
missing_ids = missing_ids_amtDB
matched_metadata_df = match_reich_metadata(reich_meta_file_path, missing_ids, mitopatho_csv_path)


Reading Reich metadata...
Reich metadata has 16388 rows.
MitoPatho data has 1709 rows.
Processed 263 out of 920 missing IDs.
Matched 263 rows with the given missing IDs.


In [20]:
import pandas as pd
import re

def parse_publication_name(reference):
    first_name = re.search(r"[A-Z][a-z]+", reference).group()
    year = re.search(r"\d{4}", reference).group()
    formatted_ref = f"{first_name} et al. {year}"
    return formatted_ref

def match_reich_metadata(reich_meta_file, missing_ids):
    print("Reading Reich metadata...")
    reich_meta = pd.read_csv(reich_meta_file, sep='\t', header=0, low_memory=False)
    print(f"Reich metadata has {len(reich_meta)} rows.")
    
    missing_ids_set = set(missing_ids)
    
    matched_data = []
    
    for _, row in reich_meta.iterrows():
        master_id = row['Master ID']
        if master_id not in missing_ids_set:
            continue
        
        date_range_match = re.search(r'(\d{4}\u00b1\d{2})', row['Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE'])
        if date_range_match:
            date_range = date_range_match.group(1)
            bp, error = date_range.split('\u00b1')
            year_from = -int(bp.replace('BP', '').strip())
            year_to = -(int(bp.replace('BP', '').strip()) - int(error) * 2)
        else:
            continue
        
        date_detail = row['Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE']
        c14_lab_code_match = re.search(r'Ua-(\w+)', row['Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE'])
        c14_lab_code = c14_lab_code_match.group(1) if c14_lab_code_match else 'nan'
        c14_sample_tag = 1 if year_to > 1950 else 0
            
        matched_data.append({
            'identifier':               row['Master ID'], 
            'alternative_identifiers':  row.get('Genetic ID', ""), 
            'country':                  row.get('Political Entity', ""), 
            'continent':                "",
            'region':                   "",
            'culture':                  row.get('Group ID', ""),
            'epoch':                    "",
            'group':                    row.get('Group ID', ""),
            'comment':                  "",
            'latitude':                 row.get('Lat.', ""),
            'longitude':                row.get('Long.', ""), 
            'sex':                      row.get('Molecular Sex', ""), 
            'site':                     row.get('Locality', ""), 
            'site_detail':              "",
            'mt_hg':                    row.get('mtDNA haplogroup if >2x or published', ""),
            'ychr_hg':                  row.get('Y haplogroup (manual curation in ISOGG format)', ""),
            'year_from':                year_from, 
            'year_to':                  year_to, 
            'date_detail':              date_detail,
            'bp':                       bp, 
            'c14_lab_code':             c14_lab_code,
            'reference_name':           parse_publication_name(row.get('Publication', "")),
            'c14_sample_tag':           c14_sample_tag,
            'ychr_snps':                row.get('Y haplogroup (manual curation in terminal mutation format)', ""),
            'avg_coverage':             row.get('mtDNA coverage (merged data)', ""),
            'sequence_source':          'fasta',
        })

    print(f"Processed {len(matched_data)} out of {len(missing_ids)} missing IDs.")
    matched_df = pd.DataFrame(matched_data)
    print(f"Matched {len(matched_df)} rows with the given missing IDs.")
    return matched_df

            
        # Construct the matched data entry with Reich data



In [21]:
len(missing_ids_amtDB)
meta_mt_dataset = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t')
master_id = meta_mt_dataset['Master ID']
# intersection of missing sequences in AmtDB and Master ID from meta_mt_dataset
missing_ids_amtDB_reich = list(set(missing_ids_amtDB) & set(master_id) )
len(missing_ids_amtDB_reich)

  meta_mt_dataset = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t')


529

In [22]:
# Example usage:
reich_meta_file_path = 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'
mitopatho_csv_path = 'output/AmtDB MitoPathoTool.csv'
missing_ids = missing_ids_amtDB
matched_metadata_df = match_reich_metadata(reich_meta_file_path, missing_ids)


Reading Reich metadata...
Reich metadata has 16388 rows.
Processed 263 out of 920 missing IDs.
Matched 263 rows with the given missing IDs.


In [29]:
import pandas as pd
import re

def parse_publication_name(reference):
    # Parses the publication name from a reference string.
    match_first_name = re.search(r"[A-Z][a-z]+", reference)
    match_year = re.search(r"\d{4}", reference)
    if match_first_name and match_year:
        first_name = match_first_name.group()
        year = match_year.group()
        formatted_ref = f"{first_name} et al. {year}"
        return formatted_ref
    return "Unknown publication"

def match_reich_metadata(reich_meta_file, missing_ids):
    print("Reading Reich metadata...")
    reich_meta = pd.read_csv(reich_meta_file, sep='\t', header=0, low_memory=False)
    print(f"Reich metadata has {len(reich_meta)} rows.")
    
    missing_ids_set = set(missing_ids)
    
    matched_data = []
    
    for _, row in reich_meta.iterrows():
        master_id = row['Master ID']
        if master_id not in missing_ids_set:
            continue
        
        # Attempt to parse the calibrated radiocarbon age and archaeological context range
        date_detail = row.get('Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE', '')
        date_range_match = re.search(r'(\d{4})\u00b1(\d{2})', date_detail)
        c14_lab_code_match = re.search(r'Ua-(\w+)', date_detail)
        year_from, year_to, c14_lab_code = None, None, 'Unknown'

        if date_range_match:
            bp, error = date_range_match.groups()
            year_from = -int(bp)  # Assume the presence of 'BP' indicates Before Present
            year_to = -(int(bp) - int(error) * 2)
        if c14_lab_code_match:
            c14_lab_code = c14_lab_code_match.group(1)

        c14_sample_tag = 1 if year_to and year_to > 1950 else 0
        
        matched_data.append({
            'identifier':               master_id,
            'alternative_identifiers':  row.get('Genetic ID', ""), 
            'country':                  row.get('Political Entity', ""), 
            'continent':                "",
            'region':                   "",
            'culture':                  row.get('Group ID', ""),
            'epoch':                    "",
            'group':                    row.get('Group ID', ""),
            'comment':                  "",
            'latitude':                 row.get('Lat.', ""),
            'longitude':                row.get('Long.', ""), 
            'sex':                      row.get('Molecular Sex', ""), 
            'site':                     row.get('Locality', ""), 
            'site_detail':              "",
            'mt_hg':                    row.get('mtDNA haplogroup if >2x or published', ""),
            'ychr_hg':                  row.get('Y haplogroup (manual curation in ISOGG format)', ""),
            'year_from':                year_from,
            'year_to':                  year_to,
            'date_detail':              date_detail,
            'bp':                       bp if date_range_match else 'Unknown', 
            'c14_lab_code':             c14_lab_code,
            'reference_name':           parse_publication_name(row.get('Publication', "")),
            'c14_sample_tag':           c14_sample_tag,
            'ychr_snps':                row.get('Y haplogroup (manual curation in terminal mutation format)', ""),
            'avg_coverage':             row.get('mtDNA coverage (merged data)', ""),
            'sequence_source':          'fasta',
        })

    print(f"Processed {len(matched_data)} out of {len(missing_ids)} missing IDs.")
    matched_df = pd.DataFrame(matched_data)
    return matched_df

# Usage of the function remains as previously outlined.


In [30]:
# Load your dataset containing Master IDs
meta_mt_dataset = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t')
master_id = meta_mt_dataset['Master ID'].tolist()

# Assuming you have a predefined list of missing IDs (missing_ids_amtDB)
# Find the intersection with Master IDs from your dataset
missing_ids_amtDB_reich = list(set(missing_ids_amtDB) & set(master_id))

# Match the metadata for the intersected IDs
reich_meta_file_path = 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'
matched_metadata_df = match_reich_metadata(reich_meta_file_path, missing_ids_amtDB_reich)

print(f"Total matched rows: {len(matched_metadata_df)}")

  meta_mt_dataset = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t')


Reading Reich metadata...
Reich metadata has 16388 rows.
Processed 561 out of 529 missing IDs.
Total matched rows: 561


In [31]:
#  save the matched metadata to a file
matched_metadata_df.to_csv('output/matched_metadata.csv', index=False)