In [None]:
# By: Connor S. Murray
# Started: 12.14.2024
# This script analyzes eGenes consequences from a VCF

# Libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Work from the project directory so the relative data/ paths below resolve
os.chdir("/standard/vol185/cphg_Manichaikul/users/csm6hg")

# Load the consequence / SNP-count table (tab-separated; compression is
# inferred by pandas from the .gz suffix)
meta = pd.read_csv("data/gene_consequence_snp_counts.txt.gz", sep="\t")

# Preview the first rows as a quick check of the table structure
print("Raw data:", meta.head(), sep="\n")
 
# Simplifying the consequence annotations
# Ordered (substring, label) rules. The first substring found in the
# annotation decides the label, so earlier rules take precedence.
# NOTE(review): because "5_prime" is tested before "start", a term that
# contains both (e.g. a 5' UTR premature start codon gain) is labelled
# "5'UTR", not "Start Codon" -- confirm this precedence is intended.
_CONSEQUENCE_RULES = (
    ("5_prime", "5'UTR"),
    ("3_prime", "3'UTR"),
    # "start" also covers "premature_start_codon_gain_variant".
    ("start", "Start Codon"),
    ("conservative_inframe", "Inframe INDEL"),
    ("frameshift", "Frameshift"),
    ("fusion", "Gene Fusion"),
    ("disruptive", "Disruptive"),
    # "stop" also covers "stop_lost".
    ("stop", "Stop Codon"),
    ("exon", "Exonic"),
    ("intron", "Intronic"),
    ("TFBS", "Regulatory"),
    ("splice", "Splicing"),
    ("structural", "Structural"),
)


def simplify_consequence(consequence):
    """Collapse a raw consequence annotation into a broad display category.

    The annotation is matched by case-sensitive substring against
    ``_CONSEQUENCE_RULES`` in order; the first hit wins.

    Args:
        consequence: Raw annotation string. Non-string values (for example
            the NaN pandas uses for an empty cell, or ``None``) are accepted.

    Returns:
        The label of the first matching rule; the input with only its first
        character upper-cased when no rule matches; or ``"Unknown"`` when the
        input is not a string.
    """
    # Missing cells arrive as float NaN / None; substring tests on them would
    # raise TypeError, so give them an explicit category instead.
    if not isinstance(consequence, str):
        return "Unknown"

    for needle, label in _CONSEQUENCE_RULES:
        if needle in consequence:
            return label

    # Unclassified annotations are passed through, capitalized
    return consequence.capitalize()

# Tag every row with its simplified consequence category
meta["simp_ann"] = meta["consequence"].apply(simplify_consequence)

# Grand total of SNPs across all rows
tot = meta["snp_count"].sum()

# Each row's share of the grand total, expressed as a percentage
meta["prop"] = meta["snp_count"].div(tot).mul(100)

# Collapse to one row per simplified category, summing counts and percentages
grouped_meta = meta.groupby("simp_ann", as_index=False).agg(
    total_snps=pd.NamedAgg(column="snp_count", aggfunc="sum"),
    prop=pd.NamedAgg(column="prop", aggfunc="sum"),
)

# Order alphabetically by category name for the printed table
grouped_meta = grouped_meta.sort_values("simp_ann")

# Show the aggregated table
print("Aggregate data:", grouped_meta, sep="\n")

# Re-order from largest to smallest percentage for plotting
grouped_meta = grouped_meta.sort_values(by="prop", ascending=False)

# log10 of the percentage (kept as an extra column); the 1e-6 offset keeps
# the logarithm finite when a percentage is exactly 0
grouped_meta["log_prop"] = np.log10(grouped_meta["prop"].add(1e-6))

# Horizontal bar chart: one bar per consequence category, length = percentage
sns.set(style="whitegrid")
plt.figure(figsize=(6, 6))
ax = sns.barplot(data=grouped_meta, x="prop", y="simp_ann", palette="viridis")

# Log-scaled x-axis with fixed tick positions labelled as percentages
ax.set_xscale("log")
ticks = [0.01, 0.1, 1, 10, 100]  # Example tick values
ax.set_xticks(ticks)
ax.set_xticklabels([f"{tick}%" for tick in ticks], fontsize=10)
ax.set_xlabel("TOPCHeF SNPs (%)", fontsize=14, fontweight='bold')
ax.set_ylabel("Consequence", fontsize=14, fontweight='bold')

# Annotate each bar with its SNP count; bars and rows are paired by position
for patch, n_snps in zip(ax.patches, grouped_meta["total_snps"]):
    x_end = patch.get_width()  # Right-hand end of the bar
    y_mid = patch.get_y() + patch.get_height() / 2  # Vertical centre of the bar
    ax.text(
        x_end,
        y_mid,
        f"{int(n_snps)}",  # Count shown as a whole number
        ha="left",
        va="center",
        fontsize=10,
        color="black",
    )

# Saving to PDF is currently disabled; the figure is only displayed
#plt.savefig("plots/snp_consequence_proportions_with_counts.pdf", format="pdf", bbox_inches="tight")
plt.show()

# Destination of the summary table (note: tab-separated despite the .csv suffix)
output_file = "data/topchef_vcf_summarized_metadata.csv"

# Write the aggregated table, omitting the DataFrame index
grouped_meta.to_csv(path_or_buf=output_file, sep="\t", index=False)