# Outline

1. Environment Setup: Creating a conda environment and importing necessary libraries.
2. Folder Creation: Setting up folders for input and output files.
3. Data Loading: Loading metadata from a CSV file and sequence data from a FASTA file.
4. ID Comparison: Identifying which IDs from the metadata are not present in the FASTA file.
5. Sequence Retrieval: Loading another FASTA file and finding missing sequences for the identified IDs.
6. Output: Saving the newly found sequences with their IDs to a new FASTA file.

  conda create -n ancient_dna_env python=3.8 biopython pandas matplotlib numpy jupyter ipython scipy seaborn -y


In [11]:
## V6

# Importing necessary libraries
from Bio import SeqIO
import pandas as pd
import re
import os

# Function to create necessary directories
def create_directories():
    if not os.path.exists('output'):
        os.makedirs('output')

# Function to load metadata and sequence IDs
def load_data(metadata_file, fasta_file):
    """
    Loads metadata from a CSV file and sequence IDs from a FASTA file.

    Parameters:
    - metadata_file (str): Path to the CSV file containing metadata.
    - fasta_file (str): Path to the FASTA file from which sequence IDs will be extracted.

    Returns:
    - meta_amtDB (DataFrame): DataFrame containing loaded metadata.
    - ids_seq_fasta (list): List of sequence IDs extracted from the FASTA file.
    """
    meta_amtDB = pd.read_csv(metadata_file, sep=',', header=0)
    ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse(fasta_file, "fasta")]
    return meta_amtDB, ids_seq_fasta

# Function to find missing sequences in AmtDB
def find_missing_sequences(meta_amtDB, ids_seq_fasta):
    """
    Identifies sequence IDs that are present in the metadata DataFrame but missing from the list of sequence IDs.

    Parameters:
    - meta_amtDB (DataFrame): DataFrame containing metadata with sequence identifiers.
    - ids_seq_fasta (list): List of sequence IDs extracted from the FASTA file.

    Returns:
    - list: List of sequence IDs that are missing in the FASTA file but present in the metadata.
    """
    amtDB_ids = set(meta_amtDB['identifier'])
    fasta_ids = set(ids_seq_fasta)
    return list(amtDB_ids.difference(fasta_ids))

def extract_sequences_from_aadr(aadr_fasta, missing_ids):
    """
    Extracts sequences from a FASTA file that match the missing sequence IDs.

    Parameters:
    - aadr_fasta (str): Path to the FASTA file from which sequences are to be extracted.
    - missing_ids (list): List of sequence IDs that are missing and need to be extracted.

    Returns:
    - new_sequences (list): List of SeqRecord objects for the extracted sequences.
    """
    new_sequences = []
    for seq_record in SeqIO.parse(aadr_fasta, "fasta"):
        if seq_record.id in missing_ids:
            new_sequences.append(seq_record)
    return new_sequences

def save_new_sequences(new_sequences, output_file):
    """
    Saves the newly extracted sequences to a FASTA file.

    Parameters:
    - new_sequences (list): List of SeqRecord objects representing the new sequences.
    - output_file (str): Path to the output FASTA file where sequences will be saved.

    Returns:
    None
    """
    with open(output_file, 'w') as file:
        for seq in new_sequences:
            SeqIO.write(seq, file, "fasta")
            
# Function to extract and match metadata from Reich mitogenomes
def match_reich_metadata(reich_meta_file, missing_ids):
    """
    Extracts and matches metadata for missing sequences from a specified metadata file.

    Parameters:
    - reich_meta_file (str): Path to the metadata file containing information on sequences.
    - missing_ids (list): List of sequence IDs for which metadata needs to be matched.

    Returns:
    - DataFrame: DataFrame containing matched metadata entries for the missing sequences.
    """
    print("Reading Reich metadata...")
    reich_meta = pd.read_csv(reich_meta_file, sep='\t', header=0, low_memory=False)
    print(f"Reich metadata has {len(reich_meta)} rows.")
    matched_data = []
    for index, row in reich_meta.iterrows():
        if row['Master ID'] in missing_ids:
            date_range_match = re.search(r'(\d{4}\u00b1\d{2})', row['Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE'])
            if date_range_match:
                date_range = date_range_match.group(1)
                bp, error = date_range.split('\u00b1')
                year_from = -int(bp.replace('BP', ''))
                year_to = -(int(bp.replace('BP', '')) - int(error) * 2)
                date_detail = row['Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990\u00b140 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE']
                c14_lab_code_match = re.search(r'Ua-(\w+)', date_detail)
                if c14_lab_code_match:
                    c14_lab_code = c14_lab_code_match.group(1)
                else:
                    c14_lab_code = 'nan'  # or any other default value you prefer

                matched_data.append({
                    'identifier':               row['Master ID'], 
                    'alternative_identifiers':  row['Genetic ID'], 
                    'country':                  row['Political Entity'], 
                    # 'continent':                ,
                    # 'region':                   , 
                    # 'culture':                  , 
                    # 'epoch':                    , 
                    # 'group':                    , 
                    # 'comment':                  , 
                    'latitude':                 row['Lat.'],
                    'longitude':                row['Long.'], 
                    'sex':                      row['Molecular Sex'], 
                    'site':                     row['Locality'], 
                    # 'site_detail':              , 
                    # 'mt_hg':                    , 
                    # 'ychr_hg':                  ,
                    'year_from':                year_from, 
                    'year_to':                  year_to, 
                    'date_detail':              date_detail, 
                    'bp':                       bp, 
                    'c14_lab_code':             c14_lab_code,
                    'reference_name':           row['Publication'], 
                    # 'reference_link':           , 
                    # 'data_link':                , 
                    # 'c14_sample_tag':           ,
                    # 'c14_layer_tag':            , 
                    # 'ychr_snps':                , 
                    # 'avg_coverage':             , 
                    # 'sequence_source':          ,
                    # 'mitopatho_alleles':        , 
                    # 'mitopatho_positions':      , 
                    # 'mitopatho_locus':          ,
                    # 'mitopatho_diseases':       , 
                    # 'mitopatho_statuses':       , 
                    # 'mitopatho_homoplasms':     ,
                    # 'mitopatho_heteroplasms':   
                })
    # print(f"Matched {len(matched_data)} metadata entries." if len(matched_data) > 0 else "No metadata entries matched.")
    print("Matching metadata complete.")
    return pd.DataFrame(matched_data)


# Main pipeline execution
def main():
    """
    Main function to execute the pipeline for processing sequence data and metadata.
    It orchestrates creating necessary directories, loading data, finding and extracting missing sequences,
    saving new sequences, and matching and saving metadata for these sequences.

    """
    create_directories()
    print("Output directory created or already exists.\n")

    meta_amtDB, ids_seq_fasta = load_data('data/amtDB/amtdb_metadata.csv', "data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta")
    print(f"Metadata loaded with {meta_amtDB.shape[0]} records.")
    print(f"Sequence IDs loaded with {len(ids_seq_fasta)} entries.\n")

    missing_ids = find_missing_sequences(meta_amtDB, ids_seq_fasta)
    print(f"Found {len(missing_ids)} missing sequences in amtDB.\n")

    # Extract new sequences from mtdna_reich.fasta
    new_sequences = extract_sequences_from_aadr("data/mitogenomes_reich/mtdna_reich.fasta", missing_ids)
    print(f"Extracted and ready to save {len(new_sequences)} new sequences from mtdna_reich.fasta.")

    # Save new sequences to a FASTA file
    save_new_sequences(new_sequences, 'output/new_sequences.fasta')
    print("New sequences successfully saved to 'output/new_sequences.fasta'.\n")

    # Extract and match metadata from Reich mitogenomes for all missing IDs
    all_metadata = match_reich_metadata("data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno", missing_ids)
    print(f"Matched and processed metadata for {len(all_metadata)} missing sequences.\n")

    # Filter metadata for only new sequences
    new_sequences_ids = {seq.id for seq in new_sequences}
    new_sequences_metadata = all_metadata[all_metadata['identifier'].isin(new_sequences_ids)]
    
    # Define the desired column order
    column_order = ['identifier', 'alternative_identifiers', 'country', 'continent',
                    'region', 'culture', 'epoch', 'group', 'comment', 'latitude',
                    'longitude', 'sex', 'site', 'site_detail', 'mt_hg', 'ychr_hg',
                    'year_from', 'year_to', 'date_detail', 'bp', 'c14_lab_code',
                    'reference_name', 'reference_link', 'data_link', 'c14_sample_tag',
                    'c14_layer_tag', 'ychr_snps', 'avg_coverage', 'sequence_source',
                    'mitopatho_alleles', 'mitopatho_positions', 'mitopatho_locus',
                    'mitopatho_diseases', 'mitopatho_statuses', 'mitopatho_homoplasms',
                    'mitopatho_heteroplasms']

    # Filter metadata for only new sequences and create a copy to avoid SettingWithCopyWarning
    new_sequences_metadata = all_metadata[all_metadata['identifier'].isin(new_sequences_ids)].copy()

    # Complete missing columns and fill with empty string
    for column in column_order:
        if column not in new_sequences_metadata.columns:
            new_sequences_metadata[column] = ""

    # Reorder and save metadata for new sequences
    new_sequences_metadata = new_sequences_metadata[column_order]
    new_sequences_metadata.to_csv('output/new_sequences_metadata.csv', sep=',', index=False)
    print(f"Metadata for {len(new_sequences_metadata)} new sequences saved to 'output/new_sequences_metadata.csv'.\n")

    # Filter metadata for missing IDs with available metadata and create a copy
    missing_ids_metadata = all_metadata[all_metadata['identifier'].isin(missing_ids)].copy()

    # Complete missing columns and fill with empty string
    for column in column_order:
        if column not in missing_ids_metadata.columns:
            missing_ids_metadata[column] = ""

    # Reorder and save metadata for missing IDs
    missing_ids_metadata = missing_ids_metadata[column_order]
    missing_ids_metadata.to_csv('output/new_missing_ids_metadata.csv', sep=',', index=False)
    print(f"Metadata for {len(missing_ids_metadata)} missing IDs saved to 'output/new_missing_ids_metadata.csv'.\n")


    # show the number of new sequences in the fasta file
    print("Pipeline execution complete. Check the 'output' directory for results.")

if __name__ == "__main__":
    main()



Output directory created or already exists.

Metadata loaded with 2541 records.
Sequence IDs loaded with 1621 entries.

Found 920 missing sequences in amtDB.

Extracted and ready to save 404 new sequences from mtdna_reich.fasta.
New sequences successfully saved to 'output/new_sequences.fasta'.

Reading Reich metadata...
Reich metadata has 16388 rows.
Matching metadata complete.
Matched and processed metadata for 263 missing sequences.



UnboundLocalError: local variable 'missing_ids_metadata' referenced before assignment

In [None]:
# I will perform a check to see if the metadata file contains entries for all the identified missing sequences.
meta_amtDB, ids_seq_fasta = load_data('data/amtDB/amtdb_metadata.csv', "data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta")
missing_ids = find_missing_sequences(meta_amtDB, ids_seq_fasta)
# Load the metadata file
reich_meta = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t', header=0, low_memory=False)

# Extract the list of Genetic IDs from the metadata file
master_ids_in_meta = set(reich_meta['Master ID'].unique())

# Check if each missing ID from amtDB is present in the reich_meta
missing_ids_presence = {id: id in master_ids_in_meta for id in missing_ids}

# Count the number of missing IDs that have corresponding metadata
metadata_available_count = sum(missing_ids_presence.values())

print(f'Number of missing IDs that have corresponding metadata: {metadata_available_count} out of {len(missing_ids)}')


Number of missing IDs that have corresponding metadata: 529 out of 920


In [6]:
meta_amtDB, ids_seq_fasta = load_data('data/amtDB/amtdb_metadata.csv', "data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta")
# order by identifier
meta_amtDB = meta_amtDB.sort_values('identifier')
ids_seq_fasta.sort()
ids_seq_fasta[:5], len(meta_amtDB), len(ids_seq_fasta)

(['19159', '19727', '20036', '20351', '20374'], 2541, 1621)

In [7]:
print(f"amtDB metadata has {len(meta_amtDB)} rows = {len(meta_amtDB['identifier'].unique())} unique identifiers.")
print(f"amtDB fasta has {len(ids_seq_fasta)} sequences.")
print(f"Missing sequences in amtDB: {len(find_missing_sequences(meta_amtDB, ids_seq_fasta))}")
ids_of_missing_seq_in_amtDB = find_missing_sequences(meta_amtDB, ids_seq_fasta)
print(len(ids_of_missing_seq_in_amtDB))
ids_of_missing_seq_in_amtDB[:5]

amtDB metadata has 2541 rows = 2541 unique identifiers.
amtDB fasta has 1621 sequences.
Missing sequences in amtDB: 920
920


['LIFART60-30A', 'MX310', 'I5428', 'EHU002', 'JK2880']

In [73]:
# column_mapping = {
#     'identifier': 'Master ID',  # Direct match ('RISE510': 'RISE510')
#     'alternative_identifiers': 'Genetic ID',  # Likely match, but may not always correlate (nan: 'RISE510_noUDG.SG')
#     'country': 'Political Entity',  # Direct match ('Russia': 'Russia')
#     # 'continent': 'Not directly correlated',  # No direct correlation (: )
#     'region': 'Locality',  # Close match, though might not be exact ('Altai': )
#     'culture': 'Group ID',  # Close match, but interpretation might be required (: )
#     # 'epoch': 'Not directly correlated',  # No direct correlation (: )
#     # 'group': 'Group ID',  # Close match, but interpretation might be required (: )
#     # 'comment': 'Not directly correlated',  # No direct correlation (: )
#     'latitude': 'Lat.',  # Direct match (: )
#     'longitude': 'Long.',  # Direct match (: )
#     'sex': 'Molecular Sex',  # Direct match (: )
#     'site': 'Locality',  # Close match, though might not be exact (: )
#     # 'site_detail': 'Skeletal element',  # Possible match, but interpretation might be required (: )
#     'mt_hg': 'mtDNA haplogroup if >2x or published',  # Direct match (: )
#     # 'ychr_hg': 'Y haplogroup (manual curation in terminal mutation format)',  # Close match, but interpretation might be required (: )
#     # 'year_from': 'Date mean in BP in years before 1950 CE',  # Close match, requires calculation (: )
#     # 'year_to': 'Not directly correlated',  # No direct correlation (: )
#     # 'date_detail': 'Full Date One of two formats',  # Close match, but interpretation might be required (: )
#     # 'bp': 'Not directly correlated',  # No direct correlation (: )
#     # 'c14_lab_code': 'Not directly correlated',  # No direct correlation (: )
#     'reference_name': 'Publication',  # Direct match (: )
#     # 'reference_link': 'Not directly correlated',  # No direct correlation (: )
#     # 'data_link': 'Not directly correlated',  # No direct correlation (: )
#     # 'c14_sample_tag': 'Not directly correlated',  # No direct correlation (: )
#     # 'c14_layer_tag': 'Not directly correlated',  # No direct correlation (: )
#     # 'ychr_snps': 'Not directly correlated',  # No direct correlation (: )
#     # 'avg_coverage': 'Not directly correlated',  # No direct correlation (: )
#     # 'sequence_source': 'Data source',  # Possible match, but interpretation might be required (: )
#     # 'mitopatho_alleles': 'Not directly correlated',  # No direct correlation (: )
#     # 'mitopatho_positions': 'Not directly correlated',  # No direct correlation (: )
#     # 'mitopatho_locus': 'Not directly correlated',  # No direct correlation (: )
#     # 'mitopatho_diseases': 'Not directly correlated',  # No direct correlation (: )
#     # 'mitopatho_statuses': 'Not directly correlated',  # No direct correlation (: )
#     # 'mitopatho_homoplasms': 'Not directly correlated',  # No direct correlation (: )
#     # 'mitopatho_heteroplasms': 'Not directly correlated',  # No direct correlation (: )
# }


In [74]:
meta_amtDB, ids_seq_fasta = load_data('data/amtDB/amtdb_metadata.csv', "data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta")
missing_ids = find_missing_sequences(meta_amtDB, ids_seq_fasta)


In [75]:
matched_metadata = match_metadata(meta_amtDB, "data/mitogenomes_reich/mtdna_reich_metadata.csv", missing_ids)

## Step 1: Folder Creation for Input and Output Files

In [76]:
# Check and create folder for output files
if not os.path.exists('output'):
    os.makedirs('output')

## Step 2: Data Loading

In [77]:
# Load CSV metadata for ancient mtDNA
meta_amtDB = pd.read_csv('data/amtDB/amtdb_metadata.csv', sep=',', header=0)
# Display the first few rows of the metadata for verification
meta_amtDB.head()

# Extract 'identifier' column from the metadata
identifiers_metadata = meta_amtDB['identifier'].tolist()

# Load FASTA file with mtDNA sequences from amtDB
ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta", "fasta")]


## Step 3: ID Comparison

In [78]:
# Compare IDs from metadata and FASTA file to find missing IDs in FASTA
ids_metadata_not_in_fasta = [i for i in identifiers_metadata if i not in ids_seq_fasta]
len(ids_metadata_not_in_fasta)

920

## Step 4: Sequence Retrieval

In [79]:
# Load FASTA file with mtDNA sequences from another source (e.g., mtdna_reich)
reich_ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta")]

# Find IDs that are missing in the amtDB FASTA but present in the mtdna_reich FASTA
ids_in_both = [i for i in ids_metadata_not_in_fasta if i in reich_ids_seq_fasta]


In [80]:
# Load FASTA file with mtDNA sequences from another source reich
reich_ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta")]

# Find IDs that are missing in the amtDB FASTA but present in the mtdna_reich FASTA
ids_in_both = [i for i in ids_metadata_not_in_fasta if i in reich_ids_seq_fasta]

# retrieve the sequences from the mtdna_reich FASTA file
reich_seqs = []
for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta"):
    if seq_record.id in ids_in_both:
        reich_seqs.append(seq_record.seq)
        
        

## Step 5: Output New Sequences

In [81]:

# There are 404 new sequences to add to our AmtDB
# Create and save these sequences to a new FASTA file using reich_seqs and their corresponding IDs

with open('output/new_sequences.fasta', 'w') as file:
    for id in ids_in_both:
        # Retrieve sequence for 'id' and write to the file
        file.write(f">{id}\n{reich_seqs[ids_in_both.index(id)]}\n")

In [82]:
# show the number of new sequences in the output
print(f"Number of new sequences: {len(ids_in_both)}")

# show the number of new sequences in the fasta file
print(f"Number of new sequences in the fasta file: {len(reich_seqs)}")

# show the number of new sequences in the fasta file using the fasta file
with open('output/new_sequences.fasta', 'r') as file:
    new_seqs = list(SeqIO.parse(file, 'fasta'))
    print(f"Number of new sequences in the fasta file using the fasta file: {len(new_seqs)}")


Number of new sequences: 404
Number of new sequences in the fasta file: 404
Number of new sequences in the fasta file using the fasta file: 404


## Meatadata retrieval

In [83]:
# using meta_amtDB 
meta_amtDB

Unnamed: 0,identifier,alternative_identifiers,country,continent,region,culture,epoch,group,comment,latitude,...,ychr_snps,avg_coverage,sequence_source,mitopatho_alleles,mitopatho_positions,mitopatho_locus,mitopatho_diseases,mitopatho_statuses,mitopatho_homoplasms,mitopatho_heteroplasms
0,RISE509,1622,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,,,,,,,
1,RISE510,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,12705T,12705,MT-CO1,Possible protective factor for normal tension ...,Reported,,
2,RISE511,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,54.580002,...,,0.0,bam,10398G;12372A;9055A;11467G;12308G,10398;12372;9055;11467;12308,MT-ND3;MT-ND5;MT-ATP6;MT-ND4;MT-TL2,PD protective factor / longevity / altered cel...,Reported / lineage L & M marker / also hg IJK;...,+;+;+;+;+,-;-;-;-;+
3,RISE507,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,51.500000,...,,0.0,bam,4640A;11467G;12372A;150T;12308G,4640;11467;12372;150;12308,MT-ND2;MT-ND4;MT-ND5;MT-CR;MT-TL2,LHON / Epilepsy;Altered brain pH / sCJD patien...,Reported;Reported;Reported;Conflicting reports...,+;+;+;+;+,-;-;-;+;+
4,RISE508,,Russia,Asia,Altai,Afanasievo,Bronze Age,BARu,,51.500000,...,,0.0,bam,12372A;12308G;13637G;11467G,12372;12308;13637;11467,MT-ND5;MT-TL2;MT-ND5;MT-ND4,Altered brain pH / sCJD patients;CPEO / Stroke...,Reported;Reported;Reported;Reported,+;+;+;+,-;+;-;-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2536,SX26,,Switzerland,Europe,central Europe,Switzerland_Neolithic,Neolithic,NESw,Final Neolithic/Early Bronze Age,47.450001,...,,0.0,reconstructed,,,,,,,
2537,SX29,,Switzerland,Europe,central Europe,Grossgartach,Neolithic,NESw,Middle neolithic,48.554459,...,,0.0,reconstructed,,,,,,,
2538,SX30,,Switzerland,Europe,central Europe,Grossgartach,Neolithic,NESw,Middle neolithic,48.554459,...,,0.0,reconstructed,,,,,,,
2539,SX32,,Switzerland,Europe,central Europe,Rubane,Neolithic,NESw,Middle neolithic,48.554459,...,,0.0,reconstructed,,,,,,,


In [84]:
meta_amtDB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2541 entries, 0 to 2540
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   identifier               2541 non-null   object 
 1   alternative_identifiers  2031 non-null   object 
 2   country                  2541 non-null   object 
 3   continent                2541 non-null   object 
 4   region                   2541 non-null   object 
 5   culture                  2541 non-null   object 
 6   epoch                    2541 non-null   object 
 7   group                    2541 non-null   object 
 8   comment                  668 non-null    object 
 9   latitude                 2541 non-null   float64
 10  longitude                2541 non-null   float64
 11  sex                      2541 non-null   object 
 12  site                     2541 non-null   object 
 13  site_detail              293 non-null    object 
 14  mt_hg                   

In [85]:
# load metadata from reich data
meta_reich_ind = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.ind', sep='\t', header=None)

In [86]:
meta_reich_ind

Unnamed: 0,0
0,Ne30_genotyping_noUDG M China_AmurRiver_EarlyN
1,Ne61_genotyping_noUDG U China_AmurRiver_BA
2,Ne35_genotyping_noUDG F China_AmurRiver_EarlyN
3,I17622 M Albania_BA_IA_lc
4,I13833 M Albania_EarlyModern_oCa...
...,...
16384,I19456_v54.1_addback M Bulgaria_EBA
16385,S1944.E1.L3_v54.1_addback F Iran_GanjDareh_N
16386,S1951.E1.L3_v54.1_addback F Iran_GanjDareh_N
16387,S7241.E1.L1_v54.1_addback M Vietnam_N_all


In [87]:
meta_reich_anno_1240 = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t', header=0)

  meta_reich_anno_1240 = pd.read_csv('data/mitogenomes_reich/v54.1.p1_1240K_public/v54.1.p1_1240K_public.anno', sep='\t', header=0)


In [88]:
meta_reich_anno_HO = pd.read_csv('data/mitogenomes_reich/v54.1.p1_HO_public/v54.1.p1_HO_public.anno', sep='\t', header=0)

  meta_reich_anno_HO = pd.read_csv('data/mitogenomes_reich/v54.1.p1_HO_public/v54.1.p1_HO_public.anno', sep='\t', header=0)


In [89]:
meta_reich_anno_1240.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16388 entries, 0 to 16387
Data columns (total 35 columns):
 #   Column                                                                                                                                                                                                                                                          Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                                                          --------------  ----- 
 0   Genetic ID                                                                                                                                                                                                                                                      16388 non-null  object
 1   Master ID                         

In [90]:
meta_reich_anno_HO.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20502 entries, 0 to 20501
Data columns (total 35 columns):
 #   Column                                                                                                                                                                                                                                                          Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                                                          --------------  ----- 
 0   Genetic ID                                                                                                                                                                                                                                                      20502 non-null  object
 1   Master ID                         

In [91]:
meta_reich_anno_1240.head(1)

Unnamed: 0,Genetic ID,Master ID,Skeletal code,Skeletal element,"Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]",Publication,"Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)","Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]","Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]","Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",...,Y haplogroup (manual curation in ISOGG format),mtDNA coverage (merged data),mtDNA haplogroup if >2x or published,mtDNA match to consensus if >2x (merged data),Damage rate in first nucleotide on sequences overlapping 1240k targets (merged data),Sex ratio [Y/(Y+X) counts] (merged data),"Library type (minus=no.damage.correction, half=damage.retained.at.last.position, plus=damage.fully.corrected, ds=double.stranded.library.preparation, ss=single.stranded.library.preparation)",Libraries,ASSESSMENT,"ASSESSMENT WARNINGS (Xcontam interval is listed if lower bound is >0.005, ""QUESTIONABLE"" if lower bound is 0.01-0.02, ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if lower bound is >0.02) (mtcontam confidence interval is listed if coverage >2 and upper bound is <0."
0,Ne30_genotyping_noUDG,NE30,AR9.9K_2d.rel.NE-4_deleted,..,2021,MaoFuCell2021,Direct IntCal20,9896,121,"8175-7750 calBCE (8825±30 BP, BA-152174)",...,C,99,D4m,..,12,..,ss.minus,HRR163270,PASS,..


In [92]:
meta_reich_anno_HO.head(1)

Unnamed: 0,Genetic ID,Master ID,Skeletal code,Skeletal element,"Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]",Publication,"Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)","Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]","Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]","Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",...,Y haplogroup (manual curation in ISOGG format),mtDNA coverage (merged data),mtDNA haplogroup if >2x or published,mtDNA match to consensus if >2x (merged data),Damage rate in first nucleotide on sequences overlapping 1240k targets (merged data),Sex ratio [Y/(Y+X) counts] (merged data),"Library type (minus=no.damage.correction, half=damage.retained.at.last.position, plus=damage.fully.corrected, ds=double.stranded.library.preparation, ss=single.stranded.library.preparation)",Libraries,ASSESSMENT,"ASSESSMENT WARNINGS (Xcontam interval is listed if lower bound is >0.005, ""QUESTIONABLE"" if lower bound is 0.01-0.02, ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if lower bound is >0.02) (mtcontam confidence interval is listed if coverage >2 and upper bound is <0."
0,I001.HO,I001,..,..,2016,BroushakiScience2016,Modern,0,0,present,...,..,..,..,..,..,..,..,..,PASS,..


In [93]:
meta_reich_anno_1240.columns

Index(['Genetic ID', 'Master ID', 'Skeletal code', 'Skeletal element',
       'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]',
       'Publication',
       'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)',
       'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
       'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]',
       'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-3

In [94]:
meta_reich_anno_HO.columns

Index(['Genetic ID', 'Master ID', 'Skeletal code', 'Skeletal element',
       'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]',
       'Publication',
       'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)',
       'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
       'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]',
       'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-3

In [95]:
meta_amtDB.columns

Index(['identifier', 'alternative_identifiers', 'country', 'continent',
       'region', 'culture', 'epoch', 'group', 'comment', 'latitude',
       'longitude', 'sex', 'site', 'site_detail', 'mt_hg', 'ychr_hg',
       'year_from', 'year_to', 'date_detail', 'bp', 'c14_lab_code',
       'reference_name', 'reference_link', 'data_link', 'c14_sample_tag',
       'c14_layer_tag', 'ychr_snps', 'avg_coverage', 'sequence_source',
       'mitopatho_alleles', 'mitopatho_positions', 'mitopatho_locus',
       'mitopatho_diseases', 'mitopatho_statuses', 'mitopatho_homoplasms',
       'mitopatho_heteroplasms'],
      dtype='object')

In [96]:
# create a dictionary of the metadata key-value pairs column names and their corresponding values but just for the first row
# meta_amtDB_dict = meta_amtDB.iloc[0].to_dict()
# meta_amtDB_dict

# the entry which have RISE510 in the Master ID column
meta_amtDB[meta_amtDB['identifier'] == 'RISE510'].to_dict()

{'identifier': {1: 'RISE510'},
 'alternative_identifiers': {1: nan},
 'country': {1: 'Russia'},
 'continent': {1: 'Asia'},
 'region': {1: 'Altai'},
 'culture': {1: 'Afanasievo'},
 'epoch': {1: 'Bronze Age'},
 'group': {1: 'BARu'},
 'comment': {1: nan},
 'latitude': {1: 54.58000183105469},
 'longitude': {1: 90.77999877929688},
 'sex': {1: 'F'},
 'site': {1: 'Bateni'},
 'site_detail': {1: nan},
 'mt_hg': {1: 'J2a2a'},
 'ychr_hg': {1: nan},
 'year_from': {1: -2851},
 'year_to': {1: -2468},
 'date_detail': {1: '2851-2468 calBCE (4040±45 BP, OxA-31222)'},
 'bp': {1: '4040±45'},
 'c14_lab_code': {1: 'OxA-31222'},
 'reference_name': {1: 'Allentoft et al. 2015'},
 'reference_link': {1: 'https://dx.doi.org/10.1038/nature14507'},
 'data_link': {1: 'https://www.ebi.ac.uk/ena/data/view/PRJEB9021'},
 'c14_sample_tag': {1: 1},
 'c14_layer_tag': {1: 0},
 'ychr_snps': {1: nan},
 'avg_coverage': {1: 0.0},
 'sequence_source': {1: 'bam'},
 'mitopatho_alleles': {1: '12705T'},
 'mitopatho_positions': {1: '

In [97]:
# create a dictionary of the metadata key-value pairs column names and their corresponding values but just for the first row
# meta_reich_anno_dict_1240 = meta_reich_anno_1240.iloc[0].to_dict()
# meta_reich_anno_dict_1240

# the entry which have RISE509 in the Master ID column
meta_reich_anno_1240[meta_reich_anno_1240['Master ID'] == 'RISE510'].to_dict()



{'Genetic ID': {9464: 'RISE510_noUDG.SG'},
 'Master ID': {9464: 'RISE510'},
 'Skeletal code': {9464: '6136-9'},
 'Skeletal element': {9464: 'tooth'},
 'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]': {9464: '2015'},
 'Publication': {9464: 'AllentoftNature2015'},
 'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)': {9464: 'Direct: IntCal20'},
 'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]': {9464: 4531},
 'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]': {9464: 92},
 'Full Date One of t

In [98]:
# create a dictionary of the metadata key-value pairs column names and their corresponding values but just for the first row
# meta_reich_anno_dict_HO = meta_reich_anno_HO.iloc[0].to_dict()
# meta_reich_anno_dict_HO

meta_reich_anno_HO[meta_reich_anno_HO['Master ID'] == 'RISE510'].to_dict()


{'Genetic ID': {13578: 'RISE510_noUDG.SG'},
 'Master ID': {13578: 'RISE510'},
 'Skeletal code': {13578: '6136-9'},
 'Skeletal element': {13578: 'tooth'},
 'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]': {13578: '2015'},
 'Publication': {13578: 'AllentoftNature2015'},
 'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)': {13578: 'Direct: IntCal20'},
 'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]': {13578: 4531},
 'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]': {13578: 92},
 'Full Date

In [99]:
# extract column Master ID from meta_reich_anno and save it to a list
master_id_anno_reich_1240 = meta_reich_anno_1240['Master ID'].tolist()

In [100]:
master_id_anno_reich_HO = meta_reich_anno_HO['Master ID'].tolist()

In [101]:
master_id_anno_reich_1240[:5]

['NE30', 'NE61', 'NE35', 'I17622', 'I13833']

In [102]:
master_id_anno_reich_HO[:5]

['I001', 'I002', 'IREJ-T006', 'IREJ-T009', 'IREJ-T022']

In [103]:
# extract column identifier from meta_amtDB and save it to a list
identifier_meta_amtDB = meta_amtDB['identifier'].tolist()

In [104]:
identifier_meta_amtDB[:5]

['RISE509', 'RISE510', 'RISE511', 'RISE507', 'RISE508']

In [105]:
# check for duplicates in each list
duplicates_amtDB = [i for i in identifier_meta_amtDB if identifier_meta_amtDB.count(i) > 1]
duplicates_reich_1240 = [i for i in master_id_anno_reich_1240 if master_id_anno_reich_1240.count(i) > 1]
duplicates_reich_HO = [i for i in master_id_anno_reich_HO if master_id_anno_reich_HO.count(i) > 1]

In [106]:
print(len(duplicates_amtDB), len(duplicates_reich_1240), len(duplicates_reich_HO))


0 6000 8023


In [107]:
# just unique identifiers
unique_identifier_meta_amtDB = set(identifier_meta_amtDB)
unique_master_id_anno_reich_1240 = set(master_id_anno_reich_1240)
unique_master_id_anno_reich_HO = set(master_id_anno_reich_HO)

print(len(unique_identifier_meta_amtDB), len(unique_master_id_anno_reich_1240), len(unique_master_id_anno_reich_HO))

2541 13357 16290
