In [1]:
# Run the notebook from the project root so that the relative 'data/...' and
# 'output/...' paths used in the cells below resolve.
import os
import pandas as pd

# The root defaults to the original author's checkout. Set the
# ADNA_PROJECT_ROOT environment variable to run the notebook from another
# location without editing this cell.
project_root = os.environ.get(
    'ADNA_PROJECT_ROOT',
    '/home/dodo/projects/aDNA_Comparative_Analysis',
)
os.chdir(project_root)

In [2]:
from src.data_loading import load_data, load_mt_dataset
from src.data_processing import DataProcessor, MetadataMatcher

In [3]:
# Input files for the two datasets used in this notebook (relative to the
# working directory):
#   metadata_file : AmtDB metadata (CSV)
#   fasta_amtdb   : AmtDB sequences (FASTA)
#   anno_file     : Reich annotation file, which serves as that dataset's metadata
#   fasta_reich   : Reich sequences (FASTA)
amtdb_dir = 'data/amtDB'
reich_dir = 'data/mitogenomes_reich'
reich_release = 'v54.1.p1_1240K_public'

metadata_file = f'{amtdb_dir}/amtdb_metadata.csv'
fasta_amtdb = f'{amtdb_dir}/amtdb_1621-samples_7f_a0pkh.fasta'
anno_file = f'{reich_dir}/{reich_release}/{reich_release}.anno'
fasta_reich = f'{reich_dir}/mtdna_reich.fasta'


In [4]:
# Load the AmtDB metadata table together with the sequence IDs read from the
# AmtDB FASTA file.
meta_amtDB, ids_seq_fasta = load_data(metadata_file, fasta_amtdb)
# Load the Reich mitogenome dataset: the FASTA file plus its annotation metadata.
# NOTE(review): the exact types returned by both loaders are defined in
# src.data_loading, which is not visible here — confirm there.
ids_mt_dataset, meta_mt_dataset = load_mt_dataset(fasta_reich, anno_file)


Loading AmtDB metadata from 'data/amtDB/amtdb_metadata.csv' and sequence IDs from 'data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta'...
Loaded AmtDB metadata with 2541 records and 1621 sequences.

Loading 'Reich mt dataset' from 'data/mitogenomes_reich/mtdna_reich.fasta' and metadata from 'data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno'...
Loaded 'Reich mt dataset' with 4122 sequences and metadata with 16388 records.



In [5]:
# Preview the first two AmtDB metadata records to see which columns are available.
meta_amtDB.head(2)

Unnamed: 0,identifier,alternative_identifiers,country,continent,region,culture,epoch,group,comment,latitude,...,ychr_snps,avg_coverage,sequence_source,mitopatho_alleles,mitopatho_positions,mitopatho_locus,mitopatho_diseases,mitopatho_statuses,mitopatho_homoplasms,mitopatho_heteroplasms
0,RISE509,1622.0,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,,,,,,,
1,RISE510,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,12705T,12705.0,MT-CO1,Possible protective factor for normal tension ...,Reported,,


In [6]:


# Process AmtDB data to find missing sequences.
# NOTE(review): "missing" presumably means metadata records without a sequence
# in the AmtDB FASTA (the loader reported 2541 records vs. 1621 sequences) —
# confirm against DataProcessor.find_missing_sequences in src.data_processing.
processor_amtDB = DataProcessor(meta_amtDB, fasta_amtdb)
missing_ids_amtDB = processor_amtDB.find_missing_sequences()

Found 920 missing sequences.



In [7]:


# Extract the sequences for the missing AmtDB IDs from the Reich dataset
# (fasta_reich) and save them to a separate FASTA file.
output_fasta = "output/missing_sequences_AmtDB.fasta"

# Create the output directory up front so a fresh checkout does not fail on the
# write below; exist_ok=True makes this a no-op when it is already there.
os.makedirs(os.path.dirname(output_fasta), exist_ok=True)

# The number of saved sequences can be smaller than len(missing_ids_amtDB):
# the recorded run saved 404 sequences for 920 missing IDs.
processor_amtDB.extract_and_save_sequences(fasta_reich, missing_ids_amtDB, output_fasta)

Extracting sequences matching the specified IDs from 'data/mitogenomes_reich/mtdna_reich.fasta'...
Saved 404 sequences to 'output/missing_sequences_AmtDB.fasta'.



In [8]:
# Classify the extracted sequences with Haplogrep 3 against the
# phylotree-rcrs@17.0 tree; the report is written to output/analysis_result.hsd.
# NOTE(review): --extend-report presumably adds the extra polymorphism columns
# (Found_Polys, Remaining_Polys, ...) seen in the report — confirm in the
# Haplogrep documentation.
!haplogrep3 classify --tree phylotree-rcrs@17.0 --in output/missing_sequences_AmtDB.fasta --out output/analysis_result.hsd --extend-report



Haplogrep 3 3.2.1
(c) 2022-2023 Sebastian Schönherr, Hansi Weissensteiner, Lukas Forer
[M::bwa_idx_load_from_disk] read 0 ALT contigs


In [None]:
# Read the tab-separated Haplogrep report into a DataFrame and preview its
# first two rows.
analysis_result = pd.read_table('output/analysis_result.hsd')
analysis_result.head(2)


Unnamed: 0,SampleID,Haplogroup,Rank,Quality,Range,Not_Found_Polys,Found_Polys,Remaining_Polys,AAC_In_Remainings,Input_Sample
0,I0070,H13a1a,1,0.9691,1-16569,,263G 750G 1438G 2259T 4745G 4769G 8860G 13680T...,310C (localPrivateMutation) 3107T (globalPriva...,,263G 310C 750G 1438G 2259T 3107T 4745G 4769G 8...
1,I0071,U5a1,1,0.9644,1-16569,,73G 263G 750G 1438G 2706G 3197C 4769G 7028T 88...,310C (localPrivateMutation) 3107T (globalPriva...,8705C [M60T| Codon 2 | ATP6 ],73G 263G 310C 750G 1438G 2706G 3107T 3197C 476...


In [None]:
# List the column names of the Haplogrep report.
analysis_result.columns


Index(['SampleID', 'Haplogroup', 'Rank', 'Quality', 'Range', 'Not_Found_Polys',
       'Found_Polys', 'Remaining_Polys', 'AAC_In_Remainings', 'Input_Sample'],
      dtype='object')

In [None]:

# Rename the 'SampleID' column to 'ID'.
# The renamed frame is rebound rather than mutated with inplace=True: in-place
# mutation is discouraged in pandas and prevents method chaining.
analysis_result = analysis_result.rename(columns={'SampleID': 'ID'})


In [None]:
# Expose the 'Found_Polys' column under the name 'Polymorphisms'.
# This assumes 'Found_Polys' alone holds the polymorphisms we want to include;
# if several columns must contribute, combine them here instead.
analysis_result = analysis_result.assign(Polymorphisms=analysis_result['Found_Polys'])

In [None]:
# Keep only the four columns needed for the HSD table, in this fixed order.
hsd_columns = ['ID', 'Range', 'Haplogroup', 'Polymorphisms']
hsd_df = analysis_result.loc[:, hsd_columns]

# Show the leading rows as a quick check of the resulting structure.
hsd_df.head()



Unnamed: 0,ID,Range,Haplogroup,Polymorphisms
0,I0070,1-16569,H13a1a,263G 750G 1438G 2259T 4745G 4769G 8860G 13680T...
1,I0071,1-16569,U5a1,73G 263G 750G 1438G 2706G 3197C 4769G 7028T 88...
2,I0073,1-16569,H,263G 750G 1438G 4769G 8860G 15326G
3,I0074,1-16569,H5,263G 456T 750G 1438G 4769G 8860G 15326G 16304C
4,I0108,3-16568 16570-16569 1-16569,H5a3,263G 456T 513A 750G 1438G 4336C 4769G 8860G 15...


In [None]:
# Write the table as a tab-separated file with a header row and without the
# DataFrame index.
# NOTE(review): 'Range' and 'Polymorphisms' are written as the space-separated
# strings held in the DataFrame — confirm this is what the downstream HSD
# consumer expects.
hsd_output_path = 'output/processed_haplogroups.hsd'
hsd_df.to_csv(hsd_output_path, sep='\t', index=False)