# Genomic Sequence Files Explanation

## 1. `GCF_001018655.2_ASM101865v2_genomic.fna`
- Contains the **complete genomic sequences**, including both **chromosome(s) and plasmids**.
- You can locate specific sequences using **SeqID**, such as:
  - `NZ_CP026962.1` (chromosome)
  - `NZ_CP026963.1` (plasmid)

## 2. `cds_from_genomic.fna`
- Contains **coding sequences (CDS)** extracted from the genomic data.
- The whole genome sequence is **fragmented into individual coding sequences (genes)**.
- **SeqID** can help identify whether a CDS originates from the **chromosome or plasmids**.

## 3. `protein.faa`
- Contains **protein sequences** translated from the CDS in `cds_from_genomic.fna`.
- There is a **one-to-one correspondence** between:
  - A CDS in `cds_from_genomic.fna`
  - Its respective protein in `protein.faa`
- Protein names can be **polished or corrected** by cross-referencing with the CDS file.


In [None]:
import os
# extract all the folder's name under current working directory
current_dir = os.getcwd()
folders = [f for f in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, f))]
print(folders)
print(current_dir, '\n', len(folders))


['GCF_001018655.2', 'GCF_003325115.1', 'GCF_023299525.1', 'GCF_016889925.1', 'GCF_022494645.1', 'GCF_001692655.1', 'GCF_003071285.1', 'GCF_000017425.1', 'GCF_030020705.1', 'GCF_013745515.1', 'GCF_005082585.1', 'GCF_026210615.1', 'GCF_003343155.1', 'GCF_034356035.1', 'GCF_030644305.1', 'GCF_019551855.1', 'GCF_005885935.1', 'GCF_025854055.1', 'GCF_004124015.1', 'GCF_009857075.1', 'GCF_016889645.1', 'GCF_028473645.1', 'GCF_029229485.1', 'GCF_009912735.1', 'GCF_016756155.1', 'GCF_022655645.1', 'GCF_028622715.1', 'GCF_030020595.1', 'GCF_015161405.1', 'GCF_008124065.1', 'GCF_028622775.1', 'GCF_003324795.1', 'GCF_003019655.1', 'GCF_005153985.1', 'GCF_004768605.1', 'GCF_003073635.1', 'GCF_022655705.1', 'GCF_008369765.1', 'GCF_013391125.1', 'GCF_024917615.1', 'GCF_030295765.1', 'GCF_025984565.1', 'GCF_946415285.1', 'GCF_021459905.1', 'GCF_012980785.1', 'GCF_001874625.1', 'GCF_008694005.1', 'GCF_003606345.3', 'GCF_026725495.1', 'GCF_000430085.2', 'GCF_016454465.1', 'GCF_009730355.1', 'GCF_016766

# data mining from NCBI

In [5]:
import pandas as pd
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load CSV file
host_plasmid_original_data = pd.read_csv('Supplementary_2.csv')
df_no_duplicates = host_plasmid_original_data['AnnotationAccession'].drop_duplicates()
# Set up Selenium WebDriver
driver = webdriver.Chrome()

# Iterate through each row in the CSV file
for i in range(df_no_duplicates.shape[0]):
    # Extract and format AnnotationAccession
    AnnotationAccession = df_no_duplicates.iloc[i].split('.')[0]+ '.' +  df_no_duplicates.iloc[i].split('.')[1][:1]
    if AnnotationAccession not in folders:
        time.sleep(0.5)
        # Construct the URL
        url = f'https://www.ncbi.nlm.nih.gov/datasets/genome/{AnnotationAccession}/'
        driver.get(url)
        print(f"Processing: {url}")

        # Find and click the download button
        try:
            # Wait until the button is clickable
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="page_content"]/div/div[1]/section[1]/div[2]/button[3]'))
            )
            button.click()
            print("Button clicked successfully!")

            # Wait for the textarea to appear and extract the download URL
            time.sleep(1)
            content = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="copy-txt"]'))
            )
            download_url = content.get_attribute("value")
            while not download_url:
                download_url = content.get_attribute("value")
                time.sleep(1)   

            print(f"Extracted Download URL: {download_url}")

        except Exception as e:
            print(f"Error: {e}")
            print(f"Failed to extract download URL for AnnotationAccession: {AnnotationAccession}")
            continue  # Skip to the next entry if extraction fails

        # Define filename dynamically
        filename = f"{AnnotationAccession}.zip"

        # Download the file
        try:
            response = requests.get(download_url, stream=True)
            response.raise_for_status()  # Check if the request was successful

            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    file.write(chunk)

            print(f"File downloaded successfully: {filename}")

        except Exception as e:
            print(f"Download Error: {e}")
            
        import zipfile
        import os     
        import shutil   
        # Define the file name and target extraction directory
        zip_file = f"{AnnotationAccession}.zip"
        extract_dir = f"{AnnotationAccession}"  # Change if you want a different folder name

        # Create the directory if it does not exist
        os.makedirs(extract_dir, exist_ok=True)

        # Extract and move only .fna and .faa files to the extract_dir
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            for file in zip_ref.namelist():
                if file.endswith(".fna") or file.endswith(".faa"):  # Filter for .fna and .faa files
                    # Extract file to a temporary location
                    temp_path = zip_ref.extract(file, extract_dir)
                    # Move the file to the extract_dir (in case it's nested in subfolders)
                    final_path = os.path.join(extract_dir, os.path.basename(file))
                    shutil.move(temp_path, final_path)
                    print(f"Extracted: {final_path}")
        print(f"Extraction complete. Only .fna and .faa files saved in: {extract_dir}")
        os.remove(zip_file)
        print(f"Deleted zip file: {zip_file}")

# Close the WebDriver after all downloads
driver.quit()
print("Processing completed.")


Processing completed.


# Extract genomics sequences

In [None]:
import os
# extract all the folder's name under current working directory
current_dir = os.getcwd()
folders = [f for f in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, f))]
print(folders)
print(len(folders))


['GCF_001018655.2', 'GCF_003325115.1', 'GCF_023299525.1', 'GCF_016889925.1', 'GCF_022494645.1', 'GCF_001692655.1', 'GCF_003071285.1', 'GCF_000017425.1', 'GCF_030020705.1', 'GCF_013745515.1', 'GCF_005082585.1', 'GCF_026210615.1', 'GCF_003343155.1', 'GCF_034356035.1', 'GCF_030644305.1', 'GCF_019551855.1', 'GCF_005885935.1', 'GCF_025854055.1', 'GCF_004124015.1', 'GCF_009857075.1', 'GCF_016889645.1', 'GCF_028473645.1', 'GCF_029229485.1', 'GCF_009912735.1', 'GCF_016756155.1', 'GCF_022655645.1', 'GCF_028622715.1', 'GCF_030020595.1', 'GCF_015161405.1', 'GCF_008124065.1', 'GCF_028622775.1', 'GCF_003324795.1', 'GCF_003019655.1', 'GCF_005153985.1', 'GCF_004768605.1', 'GCF_003073635.1', 'GCF_022655705.1', 'GCF_008369765.1', 'GCF_013391125.1', 'GCF_024917615.1', 'GCF_030295765.1', 'GCF_025984565.1', 'GCF_946415285.1', 'GCF_021459905.1', 'GCF_012980785.1', 'GCF_001874625.1', 'GCF_008694005.1', 'GCF_003606345.3', 'GCF_026725495.1', 'GCF_000430085.2', 'GCF_016454465.1', 'GCF_009730355.1', 'GCF_016766

In [None]:
# extract all protein information
from Bio import SeqIO
current_dir = os.getcwd()

df_all_cds = [] # compile all the list together
df_all_complete = [] # compile all the list together
sequences = {}
single_item = []
for folder_name in folders[:1000]:
    folder_path = os.path.join(current_dir, folder_name) # find the correct file path
    # find the fna file names: one is complete genomic sequence and another is coding sequences
    faa_files = [f for f in os.listdir(folder_path) if f.endswith('.fna')] 
    print(f'extracing the information from {faa_files} ')
    for faa_file in faa_files:
        if faa_file[0] == 'c':
            file_path = os.path.join(folder_path, faa_file) # this is the cds file 
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_cds.append(single_item)
                single_item = []
        if faa_file[0] == 'G':
            file_path = os.path.join(folder_path, faa_file) # this is the complete genomic sequences
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_complete.append(single_item)
                single_item = []
                
# export data
import csv
# Define header
header = ["AnnotationAccession", "SeqID&protein_id", "protein_sequence"]
# Write to CSV
with open("cds_sequences.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_cds)   # Write data rows
with open("complete_genomic_sequences.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_complete)   # Write data rows

In [None]:
# extract all protein information
from Bio import SeqIO
current_dir = os.getcwd()

df_all_cds = [] # compile all the list together
df_all_complete = [] # compile all the list together
sequences = {}
single_item = []
for folder_name in folders[1000:2000]:
    folder_path = os.path.join(current_dir, folder_name) # find the correct file path
    # find the fna file names: one is complete genomic sequence and another is coding sequences
    faa_files = [f for f in os.listdir(folder_path) if f.endswith('.fna')] 
    print(f'extracing the information from {faa_files} ')
    for faa_file in faa_files:
        if faa_file[0] == 'c':
            file_path = os.path.join(folder_path, faa_file) # this is the cds file 
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_cds.append(single_item)
                single_item = []
        if faa_file[0] == 'G':
            file_path = os.path.join(folder_path, faa_file) # this is the complete genomic sequences
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_complete.append(single_item)
                single_item = []
                
# export data
import csv
# Define header
header = ["AnnotationAccession", "SeqID&protein_id", "protein_sequence"]
# Write to CSV
with open("cds_sequences_2.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_cds)   # Write data rows
with open("complete_genomic_sequences_2.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_complete)   # Write data rows

In [None]:
# extract all protein information
from Bio import SeqIO
current_dir = os.getcwd()

df_all_cds = [] # compile all the list together
df_all_complete = [] # compile all the list together
sequences = {}
single_item = []
for folder_name in folders[2000:3000]:
    folder_path = os.path.join(current_dir, folder_name) # find the correct file path
    # find the fna file names: one is complete genomic sequence and another is coding sequences
    faa_files = [f for f in os.listdir(folder_path) if f.endswith('.fna')] 
    print(f'extracing the information from {faa_files} ')
    for faa_file in faa_files:
        if faa_file[0] == 'c':
            file_path = os.path.join(folder_path, faa_file) # this is the cds file 
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_cds.append(single_item)
                single_item = []
        if faa_file[0] == 'G':
            file_path = os.path.join(folder_path, faa_file) # this is the complete genomic sequences
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_complete.append(single_item)
                single_item = []
                
# export data
import csv
# Define header
header = ["AnnotationAccession", "SeqID&protein_id", "protein_sequence"]
# Write to CSV
with open("cds_sequences_3.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_cds)   # Write data rows
with open("complete_genomic_sequences_3.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_complete)   # Write data rows

In [None]:
# extract all protein information
from Bio import SeqIO
current_dir = os.getcwd()

df_all_cds = [] # compile all the list together
df_all_complete = [] # compile all the list together
sequences = {}
single_item = []
for folder_name in folders[3000:]:
    folder_path = os.path.join(current_dir, folder_name) # find the correct file path
    # find the fna file names: one is complete genomic sequence and another is coding sequences
    faa_files = [f for f in os.listdir(folder_path) if f.endswith('.fna')] 
    print(f'extracing the information from {faa_files} ')
    for faa_file in faa_files:
        if faa_file[0] == 'c':
            file_path = os.path.join(folder_path, faa_file) # this is the cds file 
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_cds.append(single_item)
                single_item = []
        if faa_file[0] == 'G':
            file_path = os.path.join(folder_path, faa_file) # this is the complete genomic sequences
            for record in SeqIO.parse(file_path, "fasta"):
                single_item.append(str(folder_name)) # AccessionNumber
                single_item.append(str(record.id)) # protein_id
                single_item.append(str(record.seq)) # sequence
                df_all_complete.append(single_item)
                single_item = []
                
# export data
import csv
# Define header
header = ["AnnotationAccession", "SeqID&protein_id", "protein_sequence"]
# Write to CSV
with open("cds_sequences_4.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_cds)   # Write data rows
with open("complete_genomic_sequences_4.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all_complete)   # Write data rows

In [1]:
import pandas as pd
df_all = pd.read_csv('complete_genomic_sequences.csv')
df_all['protein_sequence'].drop_duplicates().shape

(1977511,)

In [2]:
df_all.drop_duplicates().shape

(3666292, 3)

# protein sequence 

In [None]:
# extract all protein information
from Bio import SeqIO
current_dir = os.getcwd()

df_all = [] # compile all the list together
sequences = {}
single_item = []
for folder_name in folders:
    if folder_name == "GCF_000237975.1":
        continue
    else: 
        folder_path = os.path.join(current_dir, folder_name) # find the correct file path
        file_path = os.path.join(folder_path, 'protein.faa')
        print(f'extracing the information from {file_path}')
        for record in SeqIO.parse(file_path, "fasta"):
            single_item.append(str(folder_name)) # AccessionNumber
            single_item.append(str(record.id)) # protein_id
            single_item.append(str(record.seq)) # sequence
            df_all.append(single_item)
            single_item = []

# export data
import csv
# Define header
header = ["accession_number", "protein_id", "protein_sequence"]
# Write to CSV
with open("protein_sequences.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(header)  # Write header
    writer.writerows(df_all)   # Write data rows

In [1]:
import pandas as pd
df_all = pd.read_csv('protein_sequences.csv')

In [2]:
df_all.head()

Unnamed: 0,accession_number,protein_id,protein_sequence
0,GCF_001018655.2,WP_000002068.1,MANLQKYIEYSREVQQARENNQPIVALESTIISHGMPYPQNVEMAT...
1,GCF_001018655.2,WP_000002683.1,MANNHNQNGQDSTQQVINFLKVFKWRIVGFLAFLLIAILFLTLGFW...
2,GCF_001018655.2,WP_000002973.1,MANPIKIGIGGPVGAGKTQLIEKVVKRLSKEMSIGVITNDIYTKED...
3,GCF_001018655.2,WP_000003402.1,MSEQNTFVASDETVGRNRKPNRKAPKQISFRVSESEYLKLQQSAET...
4,GCF_001018655.2,WP_000003559.1,MSEQQTMSELKQQALVDINEANDERALQEVKVKYLGKKGSVSGLMK...


In [6]:
df_all.shape

(19279219, 3)

In [4]:

df_all['protein_id'].drop_duplicates().shape

(5903237,)

In [5]:
df_all['protein_sequence'].drop_duplicates().shape

(5903237,)

# combined the sequence datasets into a signle dataset

In [2]:
import os
print(os.getcwd())
os.chdir('/hpc/group/youlab/zd75')

/hpc/home/zd75/workplace/data


In [None]:
import pandas as pd

# Load the datasets
df_all1 = pd.read_csv('complete_genomic_sequences.csv')
df_all2 = pd.read_csv('complete_genomic_sequences_2.csv')
df_all13 = pd.read_csv('complete_genomic_sequences_3.csv')
df_all14 = pd.read_csv('complete_genomic_sequences_4.csv')

# Combine the datasets
df_combined = pd.concat([df_all1, df_all2, df_all13, df_all14], ignore_index=True)

# Save the combined dataframe to a new CSV file
df_combined.to_csv('combined_genomic_sequences.csv', index=False)

# Display the first few rows
print(df_combined.head())
import pandas as pd

# Load the datasets
df_all1 = pd.read_csv('cds_sequences.csv')
df_all2 = pd.read_csv('cds_sequences_2.csv')
df_all13 = pd.read_csv('cds_sequences_3.csv')
df_all14 = pd.read_csv('cds_sequences_4.csv')

# Combine the datasets
df_combined = pd.concat([df_all1, df_all2, df_all13, df_all14], ignore_index=True)

# Save the combined dataframe to a new CSV file
df_combined.to_csv('combined_cds_sequences.csv', index=False)

# Display the first few rows
print(df_combined.head())