# Outline

Prerequisite — Environment Setup: Creating a conda environment and importing the necessary libraries.

1. Folder Creation: Setting up the folder for output files (input files are read from `data/`).
2. Data Loading: Loading metadata from a CSV file and sequence IDs from a FASTA file.
3. ID Comparison: Identifying which IDs from the metadata are not present in the FASTA file.
4. Sequence Retrieval: Loading another FASTA file and retrieving the sequences for the IDs that were missing.
5. Output: Saving the newly found sequences with their IDs to a new FASTA file.

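Create and activate the environment from a terminal (run once, outside the notebook):

```bash
conda create -n ancient_dna_env python=3.8 biopython pandas matplotlib numpy jupyter ipython scipy seaborn -y
conda activate ancient_dna_env
```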


In [1]:
# Importing necessary libraries for ancient mtDNA analysis
from Bio import SeqIO
import pandas as pd
import os




## Step 1: Folder Creation for Input and Output Files

In [2]:
# Create the folder for output files. exist_ok=True makes this safe to re-run and
# avoids the gap between checking for the folder and creating it.
os.makedirs('output', exist_ok=True)

## Step 2: Data Loading

In [3]:
# Load CSV metadata for ancient mtDNA (comma-separated, first row is the header)
meta_amtDB = pd.read_csv('data/amtDB/amtdb_metadata.csv', sep=',', header=0)

# Extract the 'identifier' column from the metadata as a plain Python list
identifiers_metadata = meta_amtDB['identifier'].tolist()

# Collect the record IDs from the amtDB FASTA file
ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/amtDB/amtdb_1621-samples_7f_a0pkh.fasta", "fasta")]

# Display the first few rows of the metadata for verification.
# This has to be the last expression in the cell: Jupyter only renders the value
# of a cell's final expression, so a mid-cell .head() is silently discarded.
meta_amtDB.head()


## Step 3: ID Comparison

In [4]:
# Compare IDs from metadata and FASTA file to find metadata IDs missing from the FASTA.
# The FASTA IDs are copied into a set so each membership test is a hash lookup
# instead of a scan over the whole list; the result keeps the metadata order.
fasta_id_set = set(ids_seq_fasta)
ids_metadata_not_in_fasta = [i for i in identifiers_metadata if i not in fasta_id_set]


## Step 4: Sequence Retrieval

In [9]:
# Load the record IDs from a second FASTA source (mtdna_reich)
# NOTE(review): the next cell recomputes reich_ids_seq_fasta and ids_in_both, so this
# cell looks redundant — confirm before removing it.
reich_ids_seq_fasta = [seq_record.id for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta")]

# Set copy of the Reich IDs so each membership test below is a hash lookup, not a list scan
reich_id_set = set(reich_ids_seq_fasta)

# Find IDs that are missing in the amtDB FASTA but present in the mtdna_reich FASTA
ids_in_both = [i for i in ids_metadata_not_in_fasta if i in reich_id_set]


In [16]:
# Load the mtdna_reich FASTA in a single pass, recording every record ID and the
# first sequence seen for each ID.
reich_ids_seq_fasta = []
reich_seq_by_id = {}
for seq_record in SeqIO.parse("data/mitogenomes_reich/mtdna_reich.fasta", "fasta"):
    reich_ids_seq_fasta.append(seq_record.id)
    reich_seq_by_id.setdefault(seq_record.id, seq_record.seq)

# Find IDs that are missing in the amtDB FASTA but present in the mtdna_reich FASTA
# (kept in metadata order)
ids_in_both = [i for i in ids_metadata_not_in_fasta if i in reich_seq_by_id]

# Retrieve the sequences in the SAME order as ids_in_both, so that reich_seqs[k] is
# the sequence belonging to ids_in_both[k]. Appending in FASTA file order only lines
# up with ids_in_both when both files happen to list the IDs in the same order; code
# that pairs the two lists by position would otherwise attach the wrong sequence to an ID.
reich_seqs = [reich_seq_by_id[i] for i in ids_in_both]
        
        

## Step 5: Output New Sequences

In [23]:

# There are 404 new sequences to add to our AmtDB (count from the run recorded in this notebook)
# Create and save these sequences to a new FASTA file using reich_seqs and their corresponding IDs

# Position of each ID's first occurrence in ids_in_both, computed once instead of
# calling ids_in_both.index() for every record (which rescans the list each time).
first_position = {}
for position, seq_id in enumerate(ids_in_both):
    first_position.setdefault(seq_id, position)

# The loop variable is named seq_id rather than id so the builtin id() is not
# shadowed in the kernel after this cell runs.
# NOTE(review): sequences are looked up in reich_seqs by position in ids_in_both, so
# this assumes reich_seqs[k] is the sequence for ids_in_both[k] — confirm upstream.
with open('output/new_sequences.fasta', 'w') as handle:
    for seq_id in ids_in_both:
        # Write one FASTA record: header line with the ID, then the sequence
        handle.write(f">{seq_id}\n{reich_seqs[first_position[seq_id]]}\n")

In [24]:
# Report how many IDs were recovered from the mtdna_reich FASTA
new_sequence_count = len(ids_in_both)
print(new_sequence_count)

404
