In [13]:
# This code will create a table for each chromosome showing the
# scaled median coverage for each position in which coding regions occur

# Step 1: Check working directory
import os
os.chdir('/Users/evaedwards/Final-Year-Project/Datasets/TXT')
print(os.getcwd())

# Step 2: Make txt. file for the chromosome i.e CP034456
import pandas as pd
import glob

# Define input and output file paths
input_files = glob.glob("TXT/*.txt")  # Adjust to match your directory
output_folder = "CP034462_txt/"  # Folder to store filtered files
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Process each file
for file in input_files:
    strain_name = os.path.basename(file).replace(".txt", "")  # Extract strain name
    
    # Load the coverage data (assuming tab-separated format, no headers)
    df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])
    
    # Filter only for the chromosome i.e CP034456
    df_filtered = df[(df["Chromosome"] == "CP034462")]
    
    # Save the filtered data - change name for the chromosome i.e CP034456
    output_file = os.path.join(output_folder, f"{strain_name.upper()}_CH62.txt")
    df_filtered.to_csv(output_file, sep="\t", index=False, header=False)
    
    print(f"Saved filtered data for {strain_name} to {output_file}")

# Step 3: Make the appropriate gbk file for the chromosome of interest i.e CP034456
# so we can then emalgamate

from Bio import SeqIO
import pandas as pd

# File paths
genbank_file = "GCA_004217705.1_ASM421770v1_genomic-1.gbk"
record_id = "CP034462.1" # Change this to the appropriate chromosome

# Initialize an empty list to store the gene information
gene_data = []

# Parse the GenBank file and find the specific record by ID
from Bio import SeqIO

with open(genbank_file, "r") as handle:
    for record in SeqIO.parse(handle, "genbank"):
        if record.id == record_id:
            for feature in record.features:
                if feature.type == "CDS":
                    gene_name = feature.qualifiers.get("gene", ["unknown"])[0]
                    product = feature.qualifiers.get("product", ["unknown"])[0]
                    strand = "+" if feature.strand == 1 else "-"
                    
                    gene_info = {
                        "Gene": gene_name,
                        "Product": product,
                        "Start Position": int(feature.location.start),
                        "End Position": int(feature.location.end),
                        "Strand": strand
                    }
                    gene_data.append(gene_info)

# Print or store results
print(gene_data)

# Create a DataFrame from the filtered gene data
df = pd.DataFrame(gene_data)

# Display the table
print(df)

# Save the filtered table to a CSV file - change to appropraite chromosome
df.to_csv("AnnotatedGenome_CP62.csv", index=False)

# Step 4: 

import os
import glob
import pandas as pd
import numpy as np

# Define paths
base_dir = "/Users/evaedwards/Final-Year-Project/Datasets/TXT"
annotated_file = os.path.join(base_dir, "AnnotatedGenome_CP62.csv")
coverage_files = glob.glob(os.path.join(base_dir, "CP034462_txt", "*.txt"))
wg_median_file = os.path.join(base_dir, "WG_Median_Coverage_CAPITALIZED.csv")
apc_median_file = os.path.join(base_dir, "WG_Median_Coverage_APC.csv")  # Added APC median coverage file

# Load annotated genome data
genes_df = pd.read_csv(annotated_file)

# Load whole-genome median coverage data
wg_median_df = pd.read_csv(wg_median_file)
wg_median_dict = dict(zip(wg_median_df["Strain"], wg_median_df["Median Coverage"]))  # {Strain: MedianCoverage}

# Load APC median coverage data
apc_median_df = pd.read_csv(apc_median_file)
wg_median_dict.update(dict(zip(apc_median_df["Strain"], apc_median_df["Median Coverage"])))  # Merge APC data

# Store processed data
final_data = []

# Process each strain's coverage file
for file in coverage_files:
    strain_name = os.path.basename(file).split("_")[0].upper()  # Extract strain name
    
    # Load coverage data
    coverage_df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])
    coverage_dict = dict(zip(coverage_df["Position"], coverage_df["Coverage"]))  # {Position: Coverage}
    
    # Get the whole-genome median for this strain (default to 1 if missing to avoid division errors)
    wg_median = wg_median_dict.get(strain_name, 1)
    
    # Process each gene
    for _, gene in genes_df.iterrows():
        gene_function = gene["Product"]
        gene_id = gene["Gene"]
        start_pos, end_pos = gene["Start Position"], gene["End Position"]
        length = end_pos - start_pos  

        # Extract coverage values within the gene range
        coverage_values = [coverage_dict[pos] for pos in range(start_pos, end_pos + 1) if pos in coverage_dict]

        # Calculate median coverage for the gene
        median_coverage = round(np.median(coverage_values)) if coverage_values else None

        # Calculate scaled coverage
        scaled_coverage = round(median_coverage / wg_median, 3) if median_coverage is not None else None

        # Store result
        final_data.append([gene_id, gene_function, start_pos, end_pos, length, strain_name, scaled_coverage])

# Convert to DataFrame
final_df = pd.DataFrame(final_data, columns=["Gene ID", "Gene Function", "Start Position", "End Position", "Length", "Strain", "Scaled Coverage"])

# Pivot table to make strains as columns
final_pivot = final_df.pivot(index=["Gene ID", "Gene Function", "Start Position", "End Position", "Length"], columns="Strain", values="Scaled Coverage")

# Reset column names for clarity
final_pivot.columns = [f"{strain}" for strain in final_pivot.columns]
final_pivot.reset_index(inplace=True)

# Save the final table
output_file = os.path.join(base_dir, "Table_CRC_CH62.csv")
final_pivot.to_csv(output_file, index=False)

# Display result
print(final_pivot.head())

/Users/evaedwards/Final-Year-Project/Datasets/TXT
Saved filtered data for matc2 to CP034462_txt/MATC2_CH62.txt
Saved filtered data for WS4 to CP034462_txt/WS4_CH62.txt
Saved filtered data for jog19 to CP034462_txt/JOG19_CH62.txt
Saved filtered data for WS5 to CP034462_txt/WS5_CH62.txt
Saved filtered data for matc3 to CP034462_txt/MATC3_CH62.txt
Saved filtered data for matc1 to CP034462_txt/MATC1_CH62.txt
Saved filtered data for WS7 to CP034462_txt/WS7_CH62.txt
Saved filtered data for WS6 to CP034462_txt/WS6_CH62.txt
Saved filtered data for matc4 to CP034462_txt/MATC4_CH62.txt
Saved filtered data for WS2 to CP034462_txt/WS2_CH62.txt
Saved filtered data for WS3 to CP034462_txt/WS3_CH62.txt
Saved filtered data for WS1 to CP034462_txt/WS1_CH62.txt
Saved filtered data for jog20 to CP034462_txt/JOG20_CH62.txt
Saved filtered data for jog21 to CP034462_txt/JOG21_CH62.txt
Saved filtered data for mat22 to CP034462_txt/MAT22_CH62.txt
Saved filtered data for PP16 to CP034462_txt/PP16_CH62.txt
Save



[{'Gene': 'MPUL0G00100', 'Product': 'Transcriptional activator of glycolytic enzymes', 'Start Position': 6215, 'End Position': 8981, 'Strand': '+'}, {'Gene': 'MPUL0G00110', 'Product': 'Transcriptional activator of glycolytic enzymes', 'Start Position': 10919, 'End Position': 13685, 'Strand': '+'}, {'Gene': 'MPUL0G00120', 'Product': 'Protein of unknown function DUF775', 'Start Position': 29471, 'End Position': 30095, 'Strand': '-'}, {'Gene': 'MPUL0G00130', 'Product': 'protein ATS1', 'Start Position': 30775, 'End Position': 31837, 'Strand': '-'}, {'Gene': 'MPUL0G00140', 'Product': 'SWI/SNF-related', 'Start Position': 33270, 'End Position': 36405, 'Strand': '+'}, {'Gene': 'MPUL0G00150', 'Product': 'Fe-S cluster biogenesis protein NfuA, 4Fe-4S-binding domain', 'Start Position': 37347, 'End Position': 38112, 'Strand': '+'}, {'Gene': 'unknown', 'Product': 'hypothetical protein', 'Start Position': 38163, 'End Position': 40050, 'Strand': '-'}, {'Gene': 'unknown', 'Product': 'hypothetical prote

In [15]:
# Step 3: Make the appropriate gbk file for the chromosome of interest i.e CP034456
# so we can then emalgamate

from Bio import SeqIO
import pandas as pd

# File paths
genbank_file = "GCA_004217705.1_ASM421770v1_genomic-1.gbk"
record_id = "CP034462.1" # Change this to the appropriate chromosome

# Initialize an empty list to store the gene information
gene_data = []

# Parse the GenBank file and find the specific record by ID
from Bio import SeqIO

with open(genbank_file, "r") as handle:
    for record in SeqIO.parse(handle, "genbank"):
        if record.id == record_id:
            for feature in record.features:
                if feature.type == "CDS":
                    gene_name = feature.qualifiers.get("gene", ["unknown"])[0]
                    product = feature.qualifiers.get("product", ["unknown"])[0]
                    strand = "+" if feature.strand == 1 else "-"
                    
                    gene_info = {
                        "Gene": gene_name,
                        "Product": product,
                        "Start Position": int(feature.location.start),
                        "End Position": int(feature.location.end),
                        "Strand": strand
                    }
                    gene_data.append(gene_info)

# Print or store results
print(gene_data)

# Create a DataFrame from the filtered gene data
df = pd.DataFrame(gene_data)

# Display the table
print(df)

# Save the filtered table to a CSV file - change to appropraite chromosome
df.to_csv("AnnotatedGenome_CP62.csv", index=False)

# Step 4: 

import os
import glob
import pandas as pd
import numpy as np

# Define paths
base_dir = "/Users/evaedwards/Final-Year-Project/Datasets/TXT"
annotated_file = os.path.join(base_dir, "AnnotatedGenome_CP62.csv")
coverage_files = glob.glob(os.path.join(base_dir, "CP034462_txt", "*.txt"))
wg_median_file = os.path.join(base_dir, "WG_Median_Coverage_CAPITALIZED.csv")
apc_median_file = os.path.join(base_dir, "WG_Median_Coverage_APC.csv")  # Added APC median coverage file

# Load annotated genome data
genes_df = pd.read_csv(annotated_file)

# Load whole-genome median coverage data
wg_median_df = pd.read_csv(wg_median_file)
wg_median_dict = dict(zip(wg_median_df["Strain"], wg_median_df["Median Coverage"]))  # {Strain: MedianCoverage}

# Load APC median coverage data
apc_median_df = pd.read_csv(apc_median_file)
wg_median_dict.update(dict(zip(apc_median_df["Strain"], apc_median_df["Median Coverage"])))  # Merge APC data

# Store processed data
final_data = []

# Process each strain's coverage file
for file in coverage_files:
    strain_name = os.path.basename(file).split("_")[0].upper()  # Extract strain name
    
    # Load coverage data
    coverage_df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])
    coverage_dict = dict(zip(coverage_df["Position"], coverage_df["Coverage"]))  # {Position: Coverage}
    
    # Get the whole-genome median for this strain (default to 1 if missing to avoid division errors)
    wg_median = wg_median_dict.get(strain_name, 1)
    
    # Process each gene
    for _, gene in genes_df.iterrows():
        gene_function = gene["Product"]
        gene_id = gene["Gene"]
        start_pos, end_pos = gene["Start Position"], gene["End Position"]
        length = end_pos - start_pos  

        # Extract coverage values within the gene range
        coverage_values = [coverage_dict[pos] for pos in range(start_pos, end_pos + 1) if pos in coverage_dict]

        # Calculate median coverage for the gene
        median_coverage = round(np.median(coverage_values)) if coverage_values else None

        # Calculate scaled coverage
        scaled_coverage = round(median_coverage / wg_median, 3) if median_coverage is not None else None

        # Store result
        final_data.append([gene_id, gene_function, start_pos, end_pos, length, strain_name, scaled_coverage])

# Convert to DataFrame
final_df = pd.DataFrame(final_data, columns=["Gene ID", "Gene Function", "Start Position", "End Position", "Length", "Strain", "Scaled Coverage"])

# Pivot table to make strains as columns
final_pivot = final_df.pivot(index=["Gene ID", "Gene Function", "Start Position", "End Position", "Length"], columns="Strain", values="Scaled Coverage")

# Reset column names for clarity
final_pivot.columns = [f"{strain}" for strain in final_pivot.columns]
final_pivot.reset_index(inplace=True)

# Save the final table
output_file = os.path.join(base_dir, "Table_CRC_CH62.csv")
final_pivot.to_csv(output_file, index=False)

# Display result
print(final_pivot.head())



[{'Gene': 'MPUL0G00100', 'Product': 'Transcriptional activator of glycolytic enzymes', 'Start Position': 6215, 'End Position': 8981, 'Strand': '+'}, {'Gene': 'MPUL0G00110', 'Product': 'Transcriptional activator of glycolytic enzymes', 'Start Position': 10919, 'End Position': 13685, 'Strand': '+'}, {'Gene': 'MPUL0G00120', 'Product': 'Protein of unknown function DUF775', 'Start Position': 29471, 'End Position': 30095, 'Strand': '-'}, {'Gene': 'MPUL0G00130', 'Product': 'protein ATS1', 'Start Position': 30775, 'End Position': 31837, 'Strand': '-'}, {'Gene': 'MPUL0G00140', 'Product': 'SWI/SNF-related', 'Start Position': 33270, 'End Position': 36405, 'Strand': '+'}, {'Gene': 'MPUL0G00150', 'Product': 'Fe-S cluster biogenesis protein NfuA, 4Fe-4S-binding domain', 'Start Position': 37347, 'End Position': 38112, 'Strand': '+'}, {'Gene': 'unknown', 'Product': 'hypothetical protein', 'Start Position': 38163, 'End Position': 40050, 'Strand': '-'}, {'Gene': 'unknown', 'Product': 'hypothetical prote