In [1]:
# Notebook setup: imports first (stdlib, then third-party), then switch the
# working directory to the project root so that the relative 'data/...',
# 'output/...' and 'mitopatho/...' paths used by the cells below resolve.
import os
import re

import pandas as pd

# The chdir target is relative, so running it a second time from inside the
# project root would resolve to a different (probably non-existent) directory.
# Skip it once the current directory is already the project directory.
# NOTE(review): assumes the notebook's own start directory is not itself named
# 'aDNA_Comparative_Analysis' — confirm against the repository layout.
if os.path.basename(os.getcwd()) != 'aDNA_Comparative_Analysis':
    os.chdir('../../aDNA_Comparative_Analysis')

In [2]:
from src.data_loading import load_data, load_mt_dataset
from src.data_processing import DataProcessor
from src.parsing import match_reich_metadata

In [3]:
# Input files, all relative to the current working directory:
#   metadata_file -> AmtDB metadata table (CSV)
#   fasta_amtdb   -> AmtDB sequences (FASTA)
#   anno_file     -> Reich annotation file, used as the Reich metadata
#   fasta_reich   -> Reich sequences (FASTA)
_amtdb_dir = 'data/amtDB'
_reich_dir = 'data/mitogenomes_reich'
_reich_release = 'v54.1.p1_1240K_public'

metadata_file = f'{_amtdb_dir}/amtdb_metadata.csv'
fasta_amtdb = f'{_amtdb_dir}/amtdb_1621-samples_7f_a0pkh.fasta'
anno_file = f'{_reich_dir}/{_reich_release}/{_reich_release}.anno'
fasta_reich = f'{_reich_dir}/mtdna_reich.fasta'


In [4]:
# Load both datasets. Each helper takes two file paths and returns a pair that
# is unpacked here. Note the differing order between the two calls: judging by
# the variable names, load_data takes (metadata, fasta) and yields
# (metadata, ids), while load_mt_dataset takes (fasta, annotation) and yields
# (ids, metadata) — TODO confirm against src.data_loading.
meta_amtDB, ids_seq_fasta = load_data(metadata_file, fasta_amtdb)
ids_mt_dataset, meta_mt_dataset = load_mt_dataset(fasta_reich, anno_file)


Loading AmtDB metadata from 'data/amtDB/amtdb_metadata.csv' and sequence IDs from 'data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta'...
Loaded AmtDB metadata with 2541 records and 1621 sequences.

Loading 'Reich mt dataset' from 'data/mitogenomes_reich/mtdna_reich.fasta' and metadata from 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'...
Loaded 'Reich mt dataset' with 4122 sequences and metadata with 16388 records.



In [5]:
# Process AmtDB data to find missing sequences.
# The processor is built from the AmtDB metadata table and the AmtDB FASTA path;
# presumably "missing" means metadata records that have no sequence in that
# FASTA — TODO confirm against src.data_processing.
processor_amtDB = DataProcessor(meta_amtDB, fasta_amtdb)
missing_ids_amtDB = processor_amtDB.find_missing_sequences()


Found 920 missing sequences.



In [6]:
# IDs that are missing from AmtDB and also occur as a 'Master ID' in the Reich
# metadata. This is a set intersection, so duplicates collapse and the order of
# the resulting list is arbitrary.
missing_ids_amtDB_reich = list(
    set(missing_ids_amtDB).intersection(set(meta_mt_dataset['Master ID']))
)
len(missing_ids_amtDB_reich)


529

In [7]:


# Extract missing sequences from the Reich dataset (fasta_reich) and save them.
# NOTE(review): the full missing_ids_amtDB list is passed here, not the
# missing_ids_amtDB_reich intersection computed in the previous cell —
# confirm this is intended.
output_fasta = "output/missing_sequences_AmtDB.fasta"
processor_amtDB.extract_and_save_sequences(fasta_reich, missing_ids_amtDB, output_fasta)

Extracting sequences matching the specified IDs from 'data/mitogenomes_reich/mtdna_reich.fasta'...
Saved 404 sequences to 'output/missing_sequences_AmtDB.fasta'.



In [8]:
!haplogrep3 classify --tree phylotree-rcrs@17.0 --in output/missing_sequences_AmtDB.fasta --out output/analysis_result.hsd --extend-report



Haplogrep 3 3.2.1
(c) 2022-2023 Sebastian Schönherr, Hansi Weissensteiner, Lukas Forer
[M::bwa_idx_load_from_disk] read 0 ALT contigs
Written haplogroups to file output/analysis_result.hsd


In [9]:
# Read the tab-separated Haplogrep report and preview its first two rows.
analysis_result = pd.read_table('output/analysis_result.hsd')
analysis_result.head(2)


Unnamed: 0,SampleID,Haplogroup,Rank,Quality,Range,Not_Found_Polys,Found_Polys,Remaining_Polys,AAC_In_Remainings,Input_Sample
0,I0070,H13a1a,1,0.9691,1-16569,,263G 750G 1438G 2259T 4745G 4769G 8860G 13680T...,310C (localPrivateMutation) 3107T (globalPriva...,,263G 310C 750G 1438G 2259T 3107T 4745G 4769G 8...
1,I0071,U5a1,1,0.9644,1-16569,,73G 263G 750G 1438G 2706G 3197C 4769G 7028T 88...,310C (localPrivateMutation) 3107T (globalPriva...,8705C [M60T| Codon 2 | ATP6 ],73G 263G 310C 750G 1438G 2706G 3107T 3197C 476...


In [10]:
# Show the column names of the Haplogrep report.
analysis_result.columns


Index(['SampleID', 'Haplogroup', 'Rank', 'Quality', 'Range', 'Not_Found_Polys',
       'Found_Polys', 'Remaining_Polys', 'AAC_In_Remainings', 'Input_Sample'],
      dtype='object')

In [11]:

# Rename the 'SampleID' column to 'ID', rebinding the name to the renamed frame.
analysis_result = analysis_result.rename(columns={'SampleID': 'ID'})


In [12]:
# Use 'Found_Polys' as the 'Polymorphisms' column directly.
# This assumes 'Found_Polys' accurately represents the polymorphisms we want to
# include; if a combination of columns is needed, concatenate them here.
# NOTE(review): no later visible cell reads this column or writes
# analysis_result to disk — confirm how it feeds the MitoPatho input file.
analysis_result['Polymorphisms'] = analysis_result['Found_Polys']

In [13]:
# Run the MitoPatho script on an HSD file and write its report as text.
# NOTE(review): no visible cell writes output/processed_haplogroups.hsd — it
# presumably comes from an earlier run or a removed cell, so a fresh
# Restart & Run All needs the file to exist already. TODO confirm and add the
# cell that produces it.
!python3 mitopatho/mitopatho.py -i output/processed_haplogroups.hsd -o output/mitopatho_output.txt



###########################################################################
# MitoPathoPy - Tool for annotating pathological mutations in human mtDNA #
###########################################################################
Author: Edvard Ehler, PhD (edvard.ehler@img.cas.cz)
Year: 2020
Source: https://github.com/EdaEhler/MitoPathoPy

This tool will search for the (potential) pathological mutations in you mtDNA samples. To get the right input format, please, use the Haplogrep 2 (https://haplogrep.i-med.ac.at/app/index.html) tool to turn your sequences (fastas) into HSD format file. Haplogrep can be also downloaded and run localy (https://github.com/seppinho/haplogrep-cmd). Pathological mtDNA mutations are taken from data published at https://www.mitomap.org/MITOMAP.

We invite you to check our ancient mtDNA database at: https://amtdb.org/

If you use this tool in you research, please, consider citing our article:
Ehler E, Novotný J, Juras A, Chyleński M, Moravčík O, Pačes J. AmtDB:

In [14]:
# Read the tab-separated MitoPatho results table.
mitopatho_output = pd.read_table('output/mitopatho_output.txt')


In [15]:
# Select the MitoPatho rows whose 'ID' is 'I0071', then show the full
# (untruncated) value in the second column of the first matching row.
id_1 = mitopatho_output.loc[mitopatho_output['ID'].eq('I0071')]
id_1.iloc[0, 1]

'11467G,Altered brain pH / sCJD patients,Reported;12308G,CPEO / Stroke / CM / Breast & Renal & Prostate Cancer Risk / Altered brain pH /sCJD,Reported;12372A,Altered brain pH / sCJD patients,Reported;16192T,Melanoma patients,Reported;16270T,Melanoma patients,Reported'

In [16]:
# Recompute the AmtDB missing IDs and reload the Reich dataset.
# NOTE(review): these repeat, with the same arguments, the calls already made
# in In [5] and In [4]; presumably kept to refresh state before the matching
# step — TODO confirm whether this cell can be dropped.
missing_ids_amtDB = processor_amtDB.find_missing_sequences()
ids_mt_dataset, meta_mt_dataset = load_mt_dataset(fasta_reich, anno_file)


Found 920 missing sequences.

Loading 'Reich mt dataset' from 'data/mitogenomes_reich/mtdna_reich.fasta' and metadata from 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'...
Loaded 'Reich mt dataset' with 4122 sequences and metadata with 16388 records.



In [17]:
# How many of the AmtDB missing IDs occur as a 'Master ID' in the Reich metadata.
# Membership is tested against a set built once, instead of scanning the whole
# 'Master ID' array for every ID, and the loop variable no longer reuses the
# name of the built-in `id`. Input order and duplicates are preserved.
reich_master_ids = set(meta_mt_dataset['Master ID'])
missing_ids_amtDB_in_reich = [
    sample_id for sample_id in missing_ids_amtDB if sample_id in reich_master_ids
]
len(missing_ids_amtDB_in_reich)


529

In [18]:
# Match the missing AmtDB IDs against the Reich metadata, including MitoPatho data.
# The Reich annotation path is taken from `anno_file` (set in the configuration
# cell) rather than repeated as a second literal, so the two cannot diverge.
reich_meta_file_path = anno_file
# NOTE(review): this CSV is not written by any visible cell (the MitoPatho run
# above writes output/mitopatho_output.txt) — confirm where it comes from.
mitopatho_csv_path = 'output/AmtDB MitoPathoTool.csv'
missing_ids = missing_ids_amtDB

# Build the matched table and report how many rows it contains.
matched_metadata_df = match_reich_metadata(reich_meta_file_path, missing_ids, mitopatho_csv_path)
print(f"Total matched rows: {len(matched_metadata_df)}")


Reading Reich metadata...
Reich metadata has 16388 rows.
MitoPatho data has 1709 rows.
Processed 561 out of 920 missing IDs.
Total matched rows: 561


In [19]:
# Save the matched metadata to a CSV file, without writing the DataFrame index.
matched_metadata_df.to_csv('output/matched_metadata.csv', index=False)