In [1]:
# set dir to /home/dodo/projects/aDNA_Comparative_Analysis
import os
import re

import pandas as pd
# NOTE(review): machine-specific absolute path. The relative 'data/...' and
# 'output/...' paths used in later cells resolve against this directory.
os.chdir('/home/dodo/projects/aDNA_Comparative_Analysis')

In [2]:
from src.data_loading import load_data, load_mt_dataset
from src.data_processing import DataProcessor, MetadataMatcher

In [3]:
# Input files, given as paths relative to the current working directory.
#   metadata_file -> AmtDB metadata table (CSV)
#   fasta_amtdb   -> AmtDB sequences (FASTA)
#   anno_file     -> Reich annotation file, used as that dataset's metadata table
#   fasta_reich   -> Reich mtDNA sequences (FASTA)

metadata_file = 'data/amtDB/amtdb_metadata.csv'
fasta_amtdb = 'data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta'
anno_file = 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'
fasta_reich = 'data/mitogenomes_reich/mtdna_reich.fasta'


In [4]:
# Load both datasets through the project loaders.
# NOTE(review): judging by the target names, load_data returns (metadata, sequence IDs)
# while load_mt_dataset returns (sequence IDs, metadata) — confirm against src.data_loading.
meta_amtDB, ids_seq_fasta = load_data(metadata_file, fasta_amtdb)
ids_mt_dataset, meta_mt_dataset = load_mt_dataset(fasta_reich, anno_file)


Loading AmtDB metadata from 'data/amtDB/amtdb_metadata.csv' and sequence IDs from 'data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta'...
Loaded AmtDB metadata with 2541 records and 1621 sequences.

Loading 'Reich mt dataset' from 'data/mitogenomes_reich/mtdna_reich.fasta' and metadata from 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'...
Loaded 'Reich mt dataset' with 4122 sequences and metadata with 16388 records.



In [5]:
# Preview the first two AmtDB metadata rows to see the column layout.
meta_amtDB.head(2)

Unnamed: 0,identifier,alternative_identifiers,country,continent,region,culture,epoch,group,comment,latitude,...,ychr_snps,avg_coverage,sequence_source,mitopatho_alleles,mitopatho_positions,mitopatho_locus,mitopatho_diseases,mitopatho_statuses,mitopatho_homoplasms,mitopatho_heteroplasms
0,RISE509,1622.0,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,,,,,,,
1,RISE510,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,12705T,12705.0,MT-CO1,Possible protective factor for normal tension ...,Reported,,


In [6]:


# Process AmtDB data to find missing sequences
# (presumably metadata identifiers that have no record in the AmtDB FASTA —
# confirm in DataProcessor.find_missing_sequences).
processor_amtDB = DataProcessor(meta_amtDB, fasta_amtdb)
missing_ids_amtDB = processor_amtDB.find_missing_sequences()

Found 920 missing sequences.



In [7]:


# Extract missing sequences from the Reich dataset (fasta_reich) and save them
# to output_fasta. The call is made for its side effect (the written file);
# its return value, if any, is not used.
output_fasta = "output/missing_sequences_AmtDB.fasta"
processor_amtDB.extract_and_save_sequences(fasta_reich, missing_ids_amtDB, output_fasta)

Extracting sequences matching the specified IDs from 'data/mitogenomes_reich/mtdna_reich.fasta'...
Saved 404 sequences to 'output/missing_sequences_AmtDB.fasta'.



In [8]:
# Classify the extracted sequences with Haplogrep 3 against the phylotree-rcrs@17.0 tree
# and write the extended report to output/analysis_result.hsd.
# NOTE(review): the input path repeats the literal from the previous cell instead of using output_fasta.
!haplogrep3 classify --tree phylotree-rcrs@17.0 --in output/missing_sequences_AmtDB.fasta --out output/analysis_result.hsd --extend-report



Haplogrep 3 3.2.1
(c) 2022-2023 Sebastian Schönherr, Hansi Weissensteiner, Lukas Forer
[M::bwa_idx_load_from_disk] read 0 ALT contigs
Written haplogroups to file output/analysis_result.hsd


In [9]:
# load analysis_result.hsd (the tab-separated report written by the Haplogrep cell)
analysis_result = pd.read_csv('output/analysis_result.hsd', sep='\t')
# Show two rows to check the columns that were parsed.
analysis_result.head(2)


Unnamed: 0,SampleID,Haplogroup,Rank,Quality,Range,Not_Found_Polys,Found_Polys,Remaining_Polys,AAC_In_Remainings,Input_Sample
0,I0070,H13a1a,1,0.9691,1-16569,,263G 750G 1438G 2259T 4745G 4769G 8860G 13680T...,310C (localPrivateMutation) 3107T (globalPriva...,,263G 310C 750G 1438G 2259T 3107T 4745G 4769G 8...
1,I0071,U5a1,1,0.9644,1-16569,,73G 263G 750G 1438G 2706G 3197C 4769G 7028T 88...,310C (localPrivateMutation) 3107T (globalPriva...,8705C [M60T| Codon 2 | ATP6 ],73G 263G 310C 750G 1438G 2706G 3107T 3197C 476...


In [10]:
# List the report's column names; the following cells pick the HSD fields from these.
analysis_result.columns


Index(['SampleID', 'Haplogroup', 'Rank', 'Quality', 'Range', 'Not_Found_Polys',
       'Found_Polys', 'Remaining_Polys', 'AAC_In_Remainings', 'Input_Sample'],
      dtype='object')

In [11]:

# Rename 'SampleID' column to 'ID'
# Note: inplace=True mutates analysis_result itself, so the earlier previews of this
# frame (which show 'SampleID') no longer reflect its current columns.
analysis_result.rename(columns={'SampleID': 'ID'}, inplace=True)


In [12]:
# Use 'Found_Polys' as the 'Polymorphisms' column directly
# Assuming 'Found_Polys' accurately represents the polymorphisms we want to include
# If a combination of columns is needed, you may need to concatenate them appropriately here
# NOTE(review): in the report preview, 'Input_Sample' lists variants (e.g. 310C, 3107T)
# that appear under 'Remaining_Polys' but not under 'Found_Polys'. Copying only
# 'Found_Polys' therefore presumably leaves those variants out of the downstream
# annotation — confirm this is intended.
analysis_result['Polymorphisms'] = analysis_result['Found_Polys']

In [13]:
# HSD layout: exactly these four fields, in this order.
hsd_columns = ['ID', 'Range', 'Haplogroup', 'Polymorphisms']
hsd_df = analysis_result.loc[:, hsd_columns]

# Preview the leading rows to confirm the resulting structure.
hsd_df.head()



Unnamed: 0,ID,Range,Haplogroup,Polymorphisms
0,I0070,1-16569,H13a1a,263G 750G 1438G 2259T 4745G 4769G 8860G 13680T...
1,I0071,1-16569,U5a1,73G 263G 750G 1438G 2706G 3197C 4769G 7028T 88...
2,I0073,1-16569,H,263G 750G 1438G 4769G 8860G 15326G
3,I0074,1-16569,H5,263G 456T 750G 1438G 4769G 8860G 15326G 16304C
4,I0108,3-16568 16570-16569 1-16569,H5a3,263G 456T 513A 750G 1438G 4336C 4769G 8860G 15...


In [14]:
# Save the transformed DataFrame as a tab-separated HSD file
# (sep='\t', so this is not a comma-separated CSV; the row index is not written).
hsd_df.to_csv('output/processed_haplogroups.hsd', sep='\t', index=False)

In [18]:
# Run MitoPathoPy on the HSD file saved above; presumably -i is the input HSD and
# -o the annotation output that the next cell reads — confirm with the tool's help.
!python3 mitopatho/mitopatho.py -i output/processed_haplogroups.hsd -o output/mitopatho_output.txt



###########################################################################
# MitoPathoPy - Tool for annotating pathological mutations in human mtDNA #
###########################################################################
Author: Edvard Ehler, PhD (edvard.ehler@img.cas.cz)
Year: 2020
Source: https://github.com/EdaEhler/MitoPathoPy

This tool will search for the (potential) pathological mutations in you mtDNA samples. To get the right input format, please, use the Haplogrep 2 (https://haplogrep.i-med.ac.at/app/index.html) tool to turn your sequences (fastas) into HSD format file. Haplogrep can be also downloaded and run localy (https://github.com/seppinho/haplogrep-cmd). Pathological mtDNA mutations are taken from data published at https://www.mitomap.org/MITOMAP.

We invite you to check our ancient mtDNA database at: https://amtdb.org/

If you use this tool in you research, please, consider citing our article:
Ehler E, Novotný J, Juras A, Chyleński M, Moravčík O, Pačes J. AmtDB:

In [23]:
# load mitopatho_output.txt (read as tab-separated)
# NOTE(review): the preview shows the second column as 'Unnamed: 1', i.e. the file's
# header apparently names only 'ID' — confirm the header before relying on column names.
mitopatho_output = pd.read_csv('output/mitopatho_output.txt', sep='\t')
mitopatho_output.head(5)


Unnamed: 0,ID,Unnamed: 1
0,I0070,
1,I0071,"11467G,Altered brain pH / sCJD patients,Report..."
2,I0073,
3,I0074,
4,I0108,"4336C,ADPD / Hearing Loss & Migraine / autism ..."


In [29]:
# load 'AmtDB MitoPathoTool.csv' (comma-separated) and list its columns
mitopatho_tools_out = pd.read_csv('output/AmtDB MitoPathoTool.csv')
mitopatho_tools_out.columns


Index(['ID', 'Polymorphism', 'Position', 'Status', 'Locus', 'Diseases',
       'Homoplasmy', 'Heteroplasmy'],
      dtype='object')

In [None]:
# Header of the date column in the Reich .anno file. The header text itself documents
# the two formats a cell can take; the patterns below are written against them.
_REICH_DATE_COLUMN = 'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE'

# Placeholder written into every field that cannot be filled from the Reich file.
_MISSING = 'NaN'

# Calendar interval such as "2624-2350 calBCE", "2500-1700 BCE" or
# "50 calBCE - 100 calCE". The era of the first bound is optional and, when
# absent, is taken from the second bound.
_CALENDAR_RANGE_RE = re.compile(
    r'(\d+)\s*(?:cal)?\s*(BCE|CE)?\s*[-\u2013]\s*(\d+)\s*(?:cal)?\s*(BCE|CE)\b'
)

# Conventional radiocarbon age with an optional lab number, e.g. "3990±40 BP, Ua-35016".
_RADIOCARBON_RE = re.compile(
    r'(\d+)\s*\u00b1\s*(\d+)\s*BP(?:\s*,\s*([^\s,;()\[\]]+))?'
)

# Output schema, in the column order of the AmtDB metadata table.
_OUTPUT_COLUMNS = [
    'identifier', 'alternative_identifiers', 'country', 'continent', 'region',
    'culture', 'epoch', 'group', 'comment', 'latitude', 'longitude', 'sex',
    'site', 'site_detail', 'mt_hg', 'ychr_hg', 'year_from', 'year_to',
    'date_detail', 'bp', 'c14_lab_code', 'reference_name', 'reference_link',
    'data_link', 'c14_sample_tag', 'c14_layer_tag', 'ychr_snps', 'avg_coverage',
    'sequence_source', 'mitopatho_alleles', 'mitopatho_positions',
    'mitopatho_locus', 'mitopatho_diseases', 'mitopatho_statuses',
    'mitopatho_homoplasms', 'mitopatho_heteroplasms',
]


def _parse_reich_date(date_text):
    """Split one Reich 'Full Date' cell into the date-related output fields.

    Parameters:
    - date_text: Raw cell value; may be NaN/None when the cell is empty.

    Returns:
    - dict with keys 'year_from', 'year_to', 'date_detail', 'bp',
      'c14_lab_code' and 'c14_sample_tag'. Anything that cannot be parsed is
      left at the 'NaN' placeholder ('c14_sample_tag' at 0).
    """
    fields = {
        'year_from': _MISSING,
        'year_to': _MISSING,
        'date_detail': _MISSING,
        'bp': _MISSING,
        'c14_lab_code': _MISSING,
        'c14_sample_tag': 0,
    }
    if pd.isna(date_text):
        return fields

    text = str(date_text)
    fields['date_detail'] = text

    # Calendar years come from the stated interval (negative = BCE), not from the
    # uncalibrated BP age, which is not a calendar year.
    range_match = _CALENDAR_RANGE_RE.search(text)
    if range_match:
        start, start_era, end, end_era = range_match.groups()
        start_era = start_era or end_era
        fields['year_from'] = -int(start) if start_era == 'BCE' else int(start)
        fields['year_to'] = -int(end) if end_era == 'BCE' else int(end)

    # A "<age>±<error> BP" measurement marks the sample as radiocarbon dated.
    c14_match = _RADIOCARBON_RE.search(text)
    if c14_match:
        bp, _error, lab_code = c14_match.groups()
        fields['bp'] = bp
        fields['c14_sample_tag'] = 1
        if lab_code:
            fields['c14_lab_code'] = lab_code

    return fields


def match_reich_metadata(reich_meta_file, missing_ids):
    """
    Extracts and matches metadata for missing sequences from a specified metadata file.

    Every row of the Reich file whose 'Master ID' is in `missing_ids` yields one
    output row, whether or not its date cell holds a radiocarbon measurement.
    Fields the Reich file does not provide are set to the string 'NaN'.

    Parameters:
    - reich_meta_file (str): Path to the tab-separated Reich metadata (.anno) file.
    - missing_ids (iterable): Sequence IDs for which metadata needs to be matched.

    Returns:
    - DataFrame: One row per matched Reich record, with the AmtDB metadata columns.
      The columns are present even when nothing matched.

    Raises:
    - KeyError: If the Reich file lacks one of the columns read here.
    """
    print("Reading Reich metadata...")
    reich_meta = pd.read_csv(reich_meta_file, sep='\t', header=0, low_memory=False)
    print(f"Reich metadata has {len(reich_meta)} rows.")

    # Filter once with a set instead of scanning the ID list for every row.
    wanted_ids = set(missing_ids)
    matched_rows = reich_meta[reich_meta['Master ID'].isin(wanted_ids)]

    records = []
    for _, row in matched_rows.iterrows():
        date = _parse_reich_date(row[_REICH_DATE_COLUMN])
        records.append({
            'identifier':               row['Master ID'],
            'alternative_identifiers':  row['Genetic ID'],
            'country':                  row['Political Entity'],
            'continent':                _MISSING,
            'region':                   _MISSING,
            'culture':                  row['Group ID'],
            'epoch':                    _MISSING,
            'group':                    row['Group ID'],
            'comment':                  _MISSING,
            'latitude':                 row['Lat.'],
            'longitude':                row['Long.'],
            'sex':                      row['Molecular Sex'],
            'site':                     row['Locality'],
            'site_detail':              _MISSING,
            'mt_hg':                    row['mtDNA haplogroup if >2x or published'],
            'ychr_hg':                  row['Y haplogroup (manual curation in ISOGG format)'],
            'year_from':                date['year_from'],
            'year_to':                  date['year_to'],
            'date_detail':              date['date_detail'],
            'bp':                       date['bp'],
            'c14_lab_code':             date['c14_lab_code'],
            # TODO: convert to the AmtDB reference format (e.g. "Allentoft et al. 2015").
            'reference_name':           row['Publication'],
            # TODO: create a reference table and link it to the metadata table.
            'reference_link':           _MISSING,
            'data_link':                _MISSING,
            'c14_sample_tag':           date['c14_sample_tag'],
            'c14_layer_tag':            0,
            'ychr_snps':                row['Y haplogroup (manual curation in terminal mutation format)'],
            'avg_coverage':             row['mtDNA coverage (merged data)'],
            'sequence_source':          'fasta',
            # TODO: fill the mitopatho_* fields from 'output/AmtDB MitoPathoTool.csv'
            # (Polymorphism, Position, Locus, Diseases, Status, Homoplasmy, Heteroplasmy).
            'mitopatho_alleles':        _MISSING,
            'mitopatho_positions':      _MISSING,
            'mitopatho_locus':          _MISSING,
            'mitopatho_diseases':       _MISSING,
            'mitopatho_statuses':       _MISSING,
            'mitopatho_homoplasms':     _MISSING,
            'mitopatho_heteroplasms':   _MISSING,
        })

    print(f"Matched {len(records)} metadata entries.")
    print("Matching metadata complete.")
    # Passing the column list keeps the schema stable when nothing matched.
    return pd.DataFrame(records, columns=_OUTPUT_COLUMNS)