In [2]:
# Redrafting CH58 analysis - Table_CRC_Ch58
# Purpose here is to make a table with the following variables: Gene_ID, Gene_Function, 
# Start position, End position, Length, Mean coverage of the gene per strain

# Step 1: Switch into the project's TXT data directory and print the
# resulting working directory so the change can be confirmed by eye.
# Relative paths such as "CH58/*.txt" used further down resolve against it.
import os

project_txt_dir = '/Users/evaedwards/Final-Year-Project/Datasets/TXT'
os.chdir(project_txt_dir)
print(os.getcwd())

/Users/evaedwards/Final-Year-Project/Datasets/TXT


In [6]:
# Step 2: Write a trimmed coverage file per strain that contains only the
# Ch58 region of interest. Every strain is processed (not only the ones
# already known to be interesting) so strains can be compared with each other.

import pandas as pd
import glob

# Raw per-strain coverage files, and the folder that receives the trimmed copies
input_files = glob.glob("CH58/*.txt")  # Adjust to match your directory
output_folder = "filtered_data/"
os.makedirs(output_folder, exist_ok=True)  # no-op when the folder already exists

for file in input_files:
    # The strain name is the file name with ".txt" removed
    file_name = os.path.basename(file)
    strain_name = file_name.replace(".txt", "")

    # Coverage files are read as tab-separated text without a header row
    df = pd.read_csv(
        file,
        sep="\t",
        header=None,
        names=["Chromosome", "Position", "Coverage"],
    )

    # Keep rows on CP034458 whose position lies within 0-1,150,000 (inclusive)
    on_chromosome = df["Chromosome"] == "CP034458"
    in_region = df["Position"].between(0, 1150000)
    df_filtered = df.loc[on_chromosome & in_region]

    # Write the trimmed rows back out in the same headerless, tab-separated form
    output_file = os.path.join(output_folder, f"{strain_name}_CH58short.txt")
    df_filtered.to_csv(output_file, sep="\t", index=False, header=False)

    print(f"Saved filtered data for {strain_name} to {output_file}")

Saved filtered data for matc2 to filtered_data/matc2_CH58short.txt
Saved filtered data for WS4 to filtered_data/WS4_CH58short.txt
Saved filtered data for jog19 to filtered_data/jog19_CH58short.txt
Saved filtered data for WS5 to filtered_data/WS5_CH58short.txt
Saved filtered data for matc3 to filtered_data/matc3_CH58short.txt
Saved filtered data for matc1 to filtered_data/matc1_CH58short.txt
Saved filtered data for WS7 to filtered_data/WS7_CH58short.txt
Saved filtered data for WS6 to filtered_data/WS6_CH58short.txt
Saved filtered data for matc4 to filtered_data/matc4_CH58short.txt
Saved filtered data for WS2 to filtered_data/WS2_CH58short.txt
Saved filtered data for WS3 to filtered_data/WS3_CH58short.txt
Saved filtered data for WS1 to filtered_data/WS1_CH58short.txt
Saved filtered data for jog20 to filtered_data/jog20_CH58short.txt
Saved filtered data for jog21 to filtered_data/jog21_CH58short.txt
Saved filtered data for mat22 to filtered_data/mat22_CH58short.txt
Saved filtered data for

In [13]:
import pandas as pd
import glob
import os
import numpy as np

def rounded_median_coverage(coverage):
    """Return the median of ``coverage`` rounded to the nearest integer.

    Parameters
    ----------
    coverage : pandas.Series or sequence of numbers
        Coverage values; NaN entries are ignored.

    Returns
    -------
    int or None
        The rounded median, or ``None`` when no non-NaN values are present.
        (``np.median`` of an empty array is NaN, and ``round(NaN)`` raises
        ``ValueError``, so the empty case is reported explicitly instead.)
    """
    values = pd.Series(coverage, dtype="float64").dropna()
    if values.empty:
        return None
    return round(float(np.median(values)))


# Define input and output file paths
input_files = glob.glob("CH58/*.txt")  # Adjust to match your directory
output_folder = "filtered_data/"  # Folder to store filtered files
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Dictionary to store median coverages for each strain: {strain name: rounded median}
median_coverages = {}

# Process each file
for file in input_files:
    strain_name = os.path.basename(file).replace(".txt", "")  # Extract strain name

    # Load the coverage data (read as tab-separated text, no header row)
    df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])

    # Keep every position on CP034458. No position filter is applied here, so
    # the median describes the whole chromosome rather than the 0-1,150,000 region.
    df_filtered = df[df["Chromosome"] == "CP034458"]

    # Calculate the rounded median coverage for this strain
    median_coverage = rounded_median_coverage(df_filtered["Coverage"])
    if median_coverage is None:
        # Nothing to summarise; skip rather than crash on round(NaN)
        print(f"Skipping {strain_name}: no CP034458 coverage values found in {file}")
        continue
    median_coverages[strain_name] = median_coverage

    # Add the median as a constant column. assign() returns a new frame, which
    # avoids the SettingWithCopyWarning raised by writing into a filtered slice.
    df_filtered = df_filtered.assign(**{"Median Coverage": median_coverage})

    # Save the filtered data with median coverage.
    # NOTE: these four-column files land in the same folder as the
    # three-column *_CH58short.txt files, so any glob over this folder should
    # match on the file suffix rather than on "*.txt".
    output_file = os.path.join(output_folder, f"{strain_name}_CH58.txt")
    df_filtered.to_csv(output_file, sep="\t", index=False, header=False)

    print(f"Saved filtered data for {strain_name} to {output_file}")

# Create a separate CSV file with just the median coverages (one row per strain)
median_df = pd.DataFrame(list(median_coverages.items()), columns=["Strain", "Median Coverage"])
median_output_file = os.path.join(output_folder, "Median_Coverage_CP58.csv")
median_df.to_csv(median_output_file, index=False)

print(f"Saved median coverages to {median_output_file}")


Saved median coverages to filtered_data/Median_Coverage_CP58.csv


In [48]:
# Sanity check: reload one trimmed file (strain WS4) and inspect its first rows
coverage_columns = ["Chromosome", "Position", "Coverage"]
df = pd.read_csv(
    "filtered_data/WS4_CH58short.txt",
    sep="\t",
    header=None,
    names=coverage_columns,
)

# Show the first 5 rows
print(df.head())

  Chromosome  Position  Coverage
0   CP034458         1       152
1   CP034458         2       213
2   CP034458         3       216
3   CP034458         4       227
4   CP034458         5       243


In [49]:
# Step 3: Build the gene annotation table for chromosome ..58 from the GenBank
# file, restricted to the region of interest, so it can then be amalgamated
# with the per-strain coverage data

from Bio import SeqIO
import pandas as pd

# File paths
genbank_file = "GCA_004217705.1_ASM421770v1_genomic-1.gbk"
record_id = "CP034458.1"

# One dict per CDS feature that passes the position filter
gene_data = []
record_found = False

# Parse the GenBank file and find the specific record by ID
with open(genbank_file, "r") as handle:
    for record in SeqIO.parse(handle, "genbank"):
        if record.id != record_id:
            continue
        record_found = True

        # Extract the CDS (coding region) annotations
        for feature in record.features:
            if feature.type != "CDS":
                continue

            # Biopython locations are 0-based with an exclusive end, so
            # end_position - start_position is the feature length in bases.
            start_position = int(feature.location.start)
            end_position = int(feature.location.end)

            # Only include genes that start at <= 1,150,000
            if start_position > 1150000:
                continue

            gene_name = feature.qualifiers.get("gene", ["unknown"])[0]  # fallback to "unknown"
            product = feature.qualifiers.get("product", ["unknown"])[0]  # fallback to "unknown"
            # Read the strand from the location; the feature-level `strand`
            # attribute is deprecated in newer Biopython releases.
            # Anything other than +1 is reported as "-".
            strand = "+" if feature.location.strand == 1 else "-"

            # Store gene information
            gene_data.append({
                "Gene": gene_name,
                "Product": product,
                "Start Position": start_position,
                "End Position": end_position,
                "Strand": strand,
            })

        # The target record has been handled; no need to parse the remaining records
        break

if not record_found:
    print(f"Warning: record {record_id} was not found in {genbank_file}")

# Create a DataFrame from the filtered gene data. The explicit column list keeps
# the CSV header present even when no genes were collected.
df = pd.DataFrame(
    gene_data,
    columns=["Gene", "Product", "Start Position", "End Position", "Strand"],
)

# Display the table
print(df)

# Save the filtered table to a CSV file
df.to_csv("AnnotatedGenome_CP58_short.csv", index=False)




            Gene                                            Product  \
0    MPUL0C00100  Hyphally regulated cell wall GPI-anchored prot...   
1    MPUL0C00110    Hyphally regulated cell wall protein N-terminal   
2    MPUL0C00120         small oligopeptide transporter, OPT family   
3    MPUL0C00130         Pimeloyl-ACP methyl ester carboxylesterase   
4        unknown                               hypothetical protein   
..           ...                                                ...   
390  MPUL0C04050                           DNA repair protein RAD50   
391  MPUL0C04060                              C2H2-type zinc finger   
392      unknown                               hypothetical protein   
393  MPUL0C04080  DNA-directed RNA polymerase I and III subunit ...   
394  MPUL0C04090               ATP-dependent RNA helicase DDX5/DBP2   

     Start Position  End Position Strand  
0             27681         38832      -  
1             46148         50885      -  
2             5859

In [50]:
# Step 4: Combine the gene annotation table with every strain's coverage file
# into one wide table: one row per gene, one mean-coverage column per strain.

import pandas as pd
import glob
import os

# Define paths
base_dir = "/Users/evaedwards/Final-Year-Project/Datasets/TXT"
annotated_file = os.path.join(base_dir, "AnnotatedGenome_CP58_short.csv")
# Match only the three-column region files. A plain "*.txt" would also pick up
# any other text files in this folder (e.g. "<strain>_CH58.txt" files carrying
# an extra median column), which would be misread with the three column names
# below and would give the same strain twice, making the pivot fail.
coverage_files = sorted(glob.glob(os.path.join(base_dir, "filtered_data", "*_CH58short.txt")))

# Load annotated genome data
genes_df = pd.read_csv(annotated_file)

# One row per (gene, strain) pair, pivoted to wide form further down
final_data = []

# Process each strain's coverage file
for file in coverage_files:
    # Strain name is the part of the file name before the first "_", upper-cased
    strain_name = os.path.basename(file).split("_")[0].upper()

    # Load coverage data into a dictionary for fast lookup
    coverage_df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])
    coverage_dict = dict(zip(coverage_df["Position"], coverage_df["Coverage"]))  # {Position: Coverage}

    # Process each gene
    for _, gene in genes_df.iterrows():
        gene_function = gene["Product"]  # Product is reported as "Gene Function"
        gene_id = gene["Gene"]
        start_pos, end_pos = gene["Start Position"], gene["End Position"]
        # Start is 0-based and End is exclusive (Biopython convention), so the
        # difference is the gene length in bases.
        length = end_pos - start_pos

        # Coverage positions appear to be 1-based (the trimmed files start at
        # position 1), so the gene occupies positions start_pos + 1 .. end_pos
        # inclusive. Starting the range at start_pos would add one upstream base.
        coverage_values = [
            coverage_dict[pos]
            for pos in range(start_pos + 1, end_pos + 1)
            if pos in coverage_dict
        ]

        # Mean coverage rounded to a whole number for display; None when the
        # strain has no coverage rows inside the gene
        mean_coverage = round(sum(coverage_values) / len(coverage_values)) if coverage_values else None

        # Store result
        final_data.append([gene_id, gene_function, start_pos, end_pos, length, strain_name, mean_coverage])

# Convert to DataFrame
final_df = pd.DataFrame(
    final_data,
    columns=["Gene ID", "Gene Function", "Start Position", "End Position", "Length", "Strain", "Mean Coverage"],
)

# Pivot table to make strains as columns
final_pivot = final_df.pivot(
    index=["Gene ID", "Gene Function", "Start Position", "End Position", "Length"],
    columns="Strain",
    values="Mean Coverage",
)

# Flatten the column index to plain strain names and turn the gene fields back
# into ordinary columns
final_pivot.columns = [f"{strain}" for strain in final_pivot.columns]
final_pivot = final_pivot.reset_index()

# Save the final table
output_file = os.path.join(base_dir, "Table_CRC_CH58_ALL.csv")
final_pivot.to_csv(output_file, index=False)

# Display result
print(final_pivot.head())


       Gene ID                                      Gene Function  \
0  MPUL0C00100  Hyphally regulated cell wall GPI-anchored prot...   
1  MPUL0C00110    Hyphally regulated cell wall protein N-terminal   
2  MPUL0C00120         small oligopeptide transporter, OPT family   
3  MPUL0C00130         Pimeloyl-ACP methyl ester carboxylesterase   
4  MPUL0C00150                                          mitofilin   

   Start Position  End Position  Length  JOG1  JOG10  JOG11  JOG12  JOG13  \
0           27681         38832   11151  1059    604    715    393    599   
1           46148         50885    4737   600    339    372    206    324   
2           58599         61146    2547   334    205    217    127    195   
3           61437         62331     894   324    194    204    112    186   
4           66379         68122    1743   336    199    215    124    192   

   ...  WS11  WS12  WS2  WS3  WS4   WS5   WS6  WS7  WS8  WS9  
0  ...   450   622  584  424  690  1002  1082  557  453  56

In [51]:
# Work out the median coverage per strain on CP034458 and save it to a CSV,
# so that coverage can later be standardized against it

import pandas as pd
import glob
import os
import numpy as np

os.chdir('/Users/evaedwards/Final-Year-Project/Datasets/TXT')

# Define input files and output file
input_files = glob.glob("CH58/*.txt")  # Adjust to match your directory
output_file = "filtered_data/Median_Coverage_CP58.csv"  # CSV to store results
print(input_files)

# Dictionary to store median coverages: {strain name: rounded median}
median_coverages = {}

# Process each file
for file in input_files:
    strain_name = os.path.basename(file).replace(".txt", "")  # Extract strain name

    # Load the coverage data (read as tab-separated text, no header row)
    df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])

    # Filter only for CP034458
    df_filtered = df[df["Chromosome"] == "CP034458"]

    # Drop NaN values before taking the median
    coverage_values = df_filtered["Coverage"].dropna()
    if coverage_values.empty:
        # np.median of nothing is NaN and round(NaN) raises ValueError, so a
        # file without CP034458 coverage is skipped with a message instead.
        print(f"Skipping {strain_name}: no CP034458 coverage values found in {file}")
        continue

    # Calculate median coverage for CP034458 and store it rounded
    median_coverage = np.median(coverage_values)
    median_coverages[strain_name] = round(median_coverage)

# Convert dictionary to DataFrame
median_df = pd.DataFrame(list(median_coverages.items()), columns=["Strain", "Median Coverage"])

# Save to CSV; make sure the target folder exists so this cell also works when
# it is run on its own
os.makedirs(os.path.dirname(output_file), exist_ok=True)
median_df.to_csv(output_file, index=False)

print(f"Saved median coverages to {output_file}")


['CH58/matc2.txt', 'CH58/WS4.txt', 'CH58/jog19.txt', 'CH58/WS5.txt', 'CH58/matc3.txt', 'CH58/matc1.txt', 'CH58/WS7.txt', 'CH58/WS6.txt', 'CH58/matc4.txt', 'CH58/WS2.txt', 'CH58/WS3.txt', 'CH58/WS1.txt', 'CH58/jog20.txt', 'CH58/jog21.txt', 'CH58/mat22.txt', 'CH58/PP16.txt', 'CH58/jog1.txt', 'CH58/jogc5.txt', 'CH58/jogc4.txt', 'CH58/PP17.txt', 'CH58/mat23.txt', 'CH58/mat21.txt', 'CH58/PP15.txt', 'CH58/jog2.txt', 'CH58/NW13.txt', 'CH58/jog3.txt', 'CH58/PP14.txt', 'CH58/mat20.txt', 'CH58/mat24.txt', 'CH58/mat18.txt', 'CH58/PP10.txt', 'CH58/jog7.txt', 'CH58/NW16.txt', 'CH58/NW17.txt', 'CH58/jogc2.txt', 'CH58/jog6.txt', 'CH58/PP11.txt', 'CH58/mat19.txt', 'CH58/PP8.txt', 'CH58/PP13.txt', 'CH58/NW15.txt', 'CH58/NW29.txt', 'CH58/NW28.txt', 'CH58/jogc1.txt', 'CH58/NW14.txt', 'CH58/jog5.txt', 'CH58/PP12.txt', 'CH58/PP9.txt', 'CH58/mat17.txt', 'CH58/PP4.txt', 'CH58/PP23.txt', 'CH58/jog8.txt', 'CH58/NW19.txt', 'CH58/NW31.txt', 'CH58/NW25.txt', 'CH58/NW24.txt', 'CH58/NW30.txt', 'CH58/NW18.txt', 'CH5