In [6]:
import pandas as pd
import glob
import os
import numpy as np

os.chdir('/Users/evaedwards/Final-Year-Project/Datasets/TXT')
print(os.getcwd())

# Define input and output file paths
input_files = glob.glob("CH58/*.txt")  # This contains full txt files but just called this for now
output_folder = "wg_median/"  # Folder to store filtered files
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Dictionary to store median coverages for each strain
median_coverages = {}

# Process each file
for file in input_files:
    strain_name = os.path.basename(file).replace(".txt", "")  # Extract strain name
    
    # Load the coverage data (assuming tab-separated format, no headers)
    df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])
    
    # Calculate median coverage for this strain
    median_coverage = np.median(df["Coverage"].dropna())  # Drop NaN values
    median_coverages[strain_name] = round(median_coverage)  # Store rounded median

    # Add a new column with median coverage (same for all rows in this strain's file)
    df["Median Coverage"] = round(median_coverage)  # Adds the median to each row
    
    # Save the filtered data with median coverage
    output_file = os.path.join(output_folder, f"{strain_name}_MCWG.txt")
    df.to_csv(output_file, sep="\t", index=False, header=False)

    print(f"Saved filtered data for {strain_name} to {output_file}")

# Step 2: Create a separate CSV file with just the median coverages
median_df = pd.DataFrame(list(median_coverages.items()), columns=["Strain", "Median Coverage"])
median_output_file = os.path.join(output_folder, "WG_Median_Coverage.csv")
median_df.to_csv(median_output_file, index=False)

print(f"Saved median coverages to {median_output_file}")

/Users/evaedwards/Final-Year-Project/Datasets/TXT
Saved filtered data for matc2 to wg_median/matc2_MCWG.txt
Saved filtered data for WS4 to wg_median/WS4_MCWG.txt
Saved filtered data for jog19 to wg_median/jog19_MCWG.txt
Saved filtered data for WS5 to wg_median/WS5_MCWG.txt
Saved filtered data for matc3 to wg_median/matc3_MCWG.txt
Saved filtered data for matc1 to wg_median/matc1_MCWG.txt
Saved filtered data for WS7 to wg_median/WS7_MCWG.txt
Saved filtered data for WS6 to wg_median/WS6_MCWG.txt
Saved filtered data for matc4 to wg_median/matc4_MCWG.txt
Saved filtered data for WS2 to wg_median/WS2_MCWG.txt
Saved filtered data for WS3 to wg_median/WS3_MCWG.txt
Saved filtered data for WS1 to wg_median/WS1_MCWG.txt
Saved filtered data for jog20 to wg_median/jog20_MCWG.txt
Saved filtered data for jog21 to wg_median/jog21_MCWG.txt
Saved filtered data for mat22 to wg_median/mat22_MCWG.txt
Saved filtered data for PP16 to wg_median/PP16_MCWG.txt
Saved filtered data for jog1 to wg_median/jog1_MCWG.

In [7]:
import pandas as pd
import os

os.chdir('/Users/evaedwards/Final-Year-Project/Datasets/TXT')
print(os.getcwd())

# Load the CSV file
file_path = "WG_median_coverage.csv"  # Adjust path if needed
df = pd.read_csv(file_path)

# Ensure strain names are capitalized (assuming the column is named "Strain")
df["Strain"] = df["Strain"].str.upper()

# Save the modified data to a new CSV file
output_path = "WG_median_coverage_CAPITALIZED.csv"  # New file name
df.to_csv(output_path, index=False)

print(f"Saved updated CSV as {output_path}")


/Users/evaedwards/Final-Year-Project/Datasets/TXT
Saved updated CSV as WG_median_coverage_CAPITALIZED.csv


In [10]:
import pandas as pd
import glob
import os
import numpy as np

os.chdir('/Users/evaedwards/Final-Year-Project/Datasets/TXT')
print(os.getcwd())

# Define input and output file paths
input_files = glob.glob("APC/*.txt")  # This contains full txt files but just called this for now
output_folder = "wg_median/"  # Folder to store filtered files
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Dictionary to store median coverages for each strain
median_coverages = {}

# Process each file
for file in input_files:
    strain_name = os.path.basename(file).replace(".txt", "")  # Extract strain name
    
    # Load the coverage data (assuming tab-separated format, no headers)
    df = pd.read_csv(file, sep="\t", header=None, names=["Chromosome", "Position", "Coverage"])
    
    # Calculate median coverage for this strain
    median_coverage = np.median(df["Coverage"].dropna())  # Drop NaN values
    median_coverages[strain_name] = round(median_coverage)  # Store rounded median

    # Add a new column with median coverage (same for all rows in this strain's file)
    df["Median Coverage"] = round(median_coverage)  # Adds the median to each row
    
    # Save the filtered data with median coverage
    output_file = os.path.join(output_folder, f"{strain_name}_MCWG.txt")
    df.to_csv(output_file, sep="\t", index=False, header=False)

    print(f"Saved filtered data for {strain_name} to {output_file}")

# Step 2: Create a separate CSV file with just the median coverages
median_df = pd.DataFrame(list(median_coverages.items()), columns=["Strain", "Median Coverage"])
median_output_file = os.path.join(output_folder, "WG_Median_Coverage_APC.csv")
median_df.to_csv(median_output_file, index=False)

print(f"Saved median coverages to {median_output_file}")

/Users/evaedwards/Final-Year-Project/Datasets/TXT
Saved filtered data for APC12 to wg_median/APC12_MCWG.txt
Saved median coverages to wg_median/WG_Median_Coverage_APC.csv


In [2]:
!git --version

git version 2.39.2 (Apple Git-143)


In [4]:
import os
os.getcwd()

'/Users/evaedwards/Final-Year-Project/Code'

In [6]:
!git add WG_mediancoverage.ipynb

In [8]:
!git commit -m "Added WG_mediancoverage.ipynb"

On branch main
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.DS_Store[m
	[31m.ipynb_checkpoints/[m
	[31m97CRP 2.R[m
	[31m97CRP.R[m
	[31mFigure1.R[m
	[31mFigure2-draft2.R[m
	[31mFigure2A.R[m
	[31mFigure2B.R[m
	[31mFinal_Year_Project/[m
	[31mHCcoverage.R[m
	[31mIC_code.R[m
	[31mIP_draft2.R[m
	[31mIPshiny_adv.R[m
	[31mInteractivePlot_Shiny.R[m
	[31mInteractive_Coverage.R[m
	[31mInteractive_Coverage_All.R[m
	[31mInteractive_Plot.R[m
	[31mQ1final.R[m
	[31mQ1lmer.R[m
	[31mQ1q1new.R[m
	[31mQ2.R[m
	[31mQ2d2.R[m
	[31mTable_Alldata_2.R[m
	[31mallcoveragwe.R[m
	[31mchart.R[m
	[31minteractiveplot_updated.R[m
	[31msignif.R[m

nothing added to commit but untracked files present (use "git add" to track)
