In [None]:
import gzip
import pandas as pd

# Specify the compressed VCF file path (tumor sample)
vcf_file = "C:/Users/dharm/Downloads/Pupil_bio/filtered/PA220KH-lib09-P19-Tumor_S2_L001.sorted_filtered.vcf.gz"

# Scan the header block: remember the column names from the '#CHROM' line and
# count how many lines precede the first data record.
header = None
n_header_lines = 0
with gzip.open(vcf_file, 'rt', encoding='utf-8') as file:
    for line in file:
        n_header_lines += 1
        if line.startswith("#CHROM"):
            print(line)
            header = line.strip().lstrip("#").split("\t")
            break

# Without column names pandas would silently promote the first data row to
# the header, so fail loudly instead.
if header is None:
    raise ValueError(f"No '#CHROM' header line found in {vcf_file}")

# Load the data records. The header is skipped by line count rather than with
# comment='#', because `comment` also truncates any data line that happens to
# contain a '#' character. CHROM is read as text so contig names such as
# '1' and 'X' share one dtype.
df = pd.read_csv(
    vcf_file,
    compression='gzip',
    sep='\t',
    skiprows=n_header_lines,
    header=None,
    names=header,
    dtype={"CHROM": str},
    encoding='utf-8',
)

# Display the first few rows
print(df.head())


In [None]:
# Compressed VCF file path (normal sample)
vcf_file = "C:/Users/dharm/Downloads/Pupil_bio/filtered/PA221MH-lib09-P19-Norm_S1_L001.sorted_filtered.vcf.gz"

# Scan the header block: remember the column names from the '#CHROM' line and
# count how many lines precede the first data record.
header = None
n_header_lines = 0
with gzip.open(vcf_file, 'rt', encoding='utf-8') as file:
    for line in file:
        n_header_lines += 1
        if line.startswith("#CHROM"):
            print(line)
            header = line.strip().lstrip("#").split("\t")
            break

# Without column names pandas would silently promote the first data row to
# the header, so fail loudly instead.
if header is None:
    raise ValueError(f"No '#CHROM' header line found in {vcf_file}")

# Load the data records. The header is skipped by line count rather than with
# comment='#', because `comment` also truncates any data line that happens to
# contain a '#' character. CHROM is read as text so contig names such as
# '1' and 'X' share one dtype.
df2 = pd.read_csv(
    vcf_file,
    compression='gzip',
    sep='\t',
    skiprows=n_header_lines,
    header=None,
    names=header,
    dtype={"CHROM": str},
    encoding='utf-8',
)

# Display the first few rows
print(df2.head())

In [None]:
# Show the normal-sample variant table read from the Norm VCF file.
df2

In [None]:
import pandas as pd

# Path to the reference index annotation (.ann) file
file_path = r'C:/Users/dharm/Downloads/Pupil_bio/human_genome/GCA_000001405.29_GRCh38.p14_genomic.fna.ann'

# Rows of [Index, Contig, Description, Start, Length, Unknown]
data = []

# Layout handled here (the one written by `bwa index`):
#   line 1:            <total_length> <n_sequences> <seed>
#   then per sequence: <gi> <name> <free-text description ...>
#                      <offset> <length> <n_ambiguous_bases>
# NOTE(review): the layout is inferred from the '.ann' extension — confirm the
# file was produced by `bwa index`.
# The previous parser read each sequence from a single line, so the last three
# words of the description ended up in Start/Length/Unknown and the real
# numeric line was dropped.
pending = None  # [gi, name, description] waiting for its numeric line
with open(file_path, 'r') as file:
    for line in file:
        columns = line.split()
        if not columns:
            continue

        is_numeric_line = len(columns) == 3 and all(token.isdigit() for token in columns)
        if is_numeric_line:
            if pending is not None:
                data.append(pending + columns)
                pending = None
            # A numeric line with no pending name line is the file's summary line.
            continue

        # A new name line arrived before the previous one got its numbers:
        # keep the earlier record, leaving its numeric fields empty.
        if pending is not None:
            data.append(pending + [None, None, None])
            pending = None

        if len(columns) < 2:
            # Not enough fields for an index and a contig name; ignore the line.
            continue

        # The description is free text, so keep every remaining word.
        pending = [columns[0], columns[1], " ".join(columns[2:])]

# Flush a trailing name line that never received its numeric line.
if pending is not None:
    data.append(pending + [None, None, None])

# Create a DataFrame from the list of data. Start/Length/Unknown hold the
# three values of the numeric line, kept as strings.
ref = pd.DataFrame(data, columns=["Index", "Contig", "Description", "Start", "Length", "Unknown"])

# Display the first few rows to check
print(ref.head())


In [None]:
# Show the reference annotation table built from the .ann file.
ref

In [None]:
# Select the reference rows whose description mentions human chromosome X or Y.
sex_chromosome_pattern = r'Homo sapiens chromosome [XY]'
is_sex_chromosome = ref['Description'].str.contains(sex_chromosome_pattern, regex=True)
ref_germline = ref[is_sex_chromosome]


In [None]:
# Show the reference rows whose description matched chromosome X or Y.
ref_germline

In [None]:
# Distinct POS values seen in each sample.
# NOTE(review): these sets hold positions only; the chromosome is not part of
# the key, so equal coordinates on different chromosomes collapse together.
tumor_chrm_id = set(df["POS"])
normal_chrm_id = set(df2["POS"])

In [None]:
# Show the size and a short sorted preview rather than dumping the whole set,
# which can hold one entry per variant position.
print(f"{len(normal_chrm_id)} distinct positions in the normal sample")
sorted(normal_chrm_id)[:10]

In [None]:
# Positions present in the tumor sample but absent from the normal sample.
tumor_only_variant = list(tumor_chrm_id.difference(normal_chrm_id))

In [None]:
# Show the size and the first few entries rather than dumping the whole list.
print(f"{len(tumor_only_variant)} tumor-only positions")
tumor_only_variant[:10]

In [None]:
# Keep the tumor records whose site does not occur in the normal sample.
# A site is identified by (CHROM, POS). POS restarts on every chromosome, so
# filtering on POS alone (as the `tumor_only_variant` list does) would discard
# tumor variants that merely share a coordinate with a normal variant on a
# different chromosome.
# CHROM is compared as text so both samples use the same key type.
tumor_sites = pd.MultiIndex.from_arrays([df["CHROM"].astype(str), df["POS"]])
normal_sites = pd.MultiIndex.from_arrays([df2["CHROM"].astype(str), df2["POS"]])
df_tumor = df[~tumor_sites.isin(normal_sites)]

In [None]:
def has_duplicates(lst):
    """Return True when at least one value occurs more than once in ``lst``.

    ``lst`` must be a sized collection of hashable items; the check compares
    its length with the number of distinct items it contains.
    """
    total_count = len(lst)
    distinct_count = len(set(lst))
    return distinct_count != total_count

In [None]:
# `normal_chrm_id` is a set, which can never hold duplicates, so testing it
# always printed False. Check the raw POS column of the normal sample instead.
print(has_duplicates(df2["POS"].tolist()))

In [None]:
# Drop the tumor records whose CHROM is one of the contigs in `ref_germline`.
on_excluded_contig = df_tumor["CHROM"].isin(ref_germline["Contig"])
somatic_df = df_tumor[~on_excluded_contig]

In [None]:
# Keep only the records whose ALT column is not the "." placeholder.
has_alt_allele = somatic_df['ALT'] != "."
VCF_final = somatic_df[has_alt_allele]

In [None]:
# Write the filtered tumor table (row index included) to a CSV file in the
# current working directory.
VCF_final.to_csv(path_or_buf="Somatic_Variant_Tumor_only.csv")

In [None]:
def calculate_mutation_frequency(info, debug=True):
    """Compute the alternate-allele frequency from the DP4 tag of a VCF INFO value.

    DP4 is read as four comma-separated integer counts, in the order
    forward-ref, reverse-ref, forward-alt, reverse-alt. The frequency is
    ``(forward_alt + reverse_alt) / (sum of all four counts)``.

    Parameters
    ----------
    info : str
        INFO field of one record, with ``;``-separated entries. Any other
        type (for example a missing value read as NaN) is treated as having
        no DP4 tag.
    debug : bool, default True
        When True, print the intermediate values. Pass False to keep the
        output quiet when applying the function to a whole column.

    Returns
    -------
    float or None
        The alternate-allele frequency, or None when DP4 is absent, is not
        exactly four non-negative integers, or all four counts are zero.
    """
    def _log(message):
        if debug:
            print(message)

    _log(f"INFO field: {info}")

    # Missing INFO values are not strings and cannot be split.
    if not isinstance(info, str):
        _log("DP4 field is missing or malformed.")
        return None

    # Extract the DP4 entry from the INFO field
    dp4_field = next((field for field in info.split(";") if field.startswith("DP4=")), None)
    _log(f"Extracted DP4 field: {dp4_field}")

    if dp4_field is None:
        _log("DP4 field is missing or malformed.")
        return None

    # Parse the counts; anything other than four non-negative integers is malformed.
    try:
        counts = [int(value) for value in dp4_field.split("=", 1)[1].split(",")]
    except ValueError:
        counts = []
    if len(counts) != 4 or any(count < 0 for count in counts):
        _log("DP4 field is missing or malformed.")
        return None

    forward_ref, reverse_ref, forward_alt, reverse_alt = counts
    _log(f"Counts: Forward Ref={forward_ref}, Reverse Ref={reverse_ref}, Forward Alt={forward_alt}, Reverse Alt={reverse_alt}")

    # With no supporting reads the frequency is undefined; avoid dividing by zero.
    total_reads = sum(counts)
    if total_reads == 0:
        _log("DP4 field is missing or malformed.")
        return None

    alt_allele_freq = (forward_alt + reverse_alt) / total_reads
    _log(f"Calculated Alt Allele Frequency: {alt_allele_freq}")

    return alt_allele_freq


In [None]:
# Compute one frequency per normal-sample record from its INFO value (each
# call prints its debug trace), then store the result as a new column.
mutation_frequencies = df2["INFO"].apply(calculate_mutation_frequency)
df2["Mutation_Frequency"] = mutation_frequencies

In [None]:
# Write the normal-sample table (row index included) to a CSV file in the
# current working directory.
df2.to_csv(path_or_buf="Normal_mutation_frequency.csv")