## Manual variant pathogenicity predictor

In [None]:
import pandas as pd
import numpy as np

### Part 1 - Common Mutations Between Datasets

In [None]:

# Read the three per-dataset confidence tables and tag every row with the
# 1-based number of the dataset it was read from.
datasets = []
for i in (1, 2, 3):
    dataset_frame = pd.read_csv(f"/work/project/ext_016/RNA-Seq-Variant-Calling_1/dataset_{i}_positions_confidence.csv")
    dataset_frame['Dataset'] = i
    datasets.append(dataset_frame)

# Stack all rows into a single table (original row indices are kept as-is)
combined_df = pd.concat(datasets)

# Collapse the spelling variants of the disease labels onto two canonical values
disease_label_map = {
    'Healthy': 'healthy',
    'systemic lupus erythematosus (SLE)': 'SLE',
    'systemic lupus erythematosus': 'SLE',
}
combined_df['Disease'] = combined_df['Disease'].replace(disease_label_map)


In [None]:
# One row per POS: quality/confidence metrics are averaged over all rows at
# that position, while descriptive columns keep the first value seen.
# NOTE(review): taking 'first' assumes REF/ALT/FILTER do not vary within a
# position - confirm against the input tables.
aggregation_spec = {'Run': 'first', 'REF': 'first', 'ALT': 'first'}
aggregation_spec.update(dict.fromkeys(
    ['GQ', 'QUAL', 'DP', 'gq_conf', 'qual_conf', 'dp_conf', 'confidence'],
    'mean',
))
aggregation_spec['FILTER'] = 'first'

per_position = combined_df.groupby('POS').agg(aggregation_spec)
aggregated_df = per_position.reset_index()

In [None]:

# Combine identical variants and their quantitative data: occurrence count,
# names of the datasets, number of healthy rows, number of SLE rows.

# Number of rows in combined_df recorded at each position
pos_occurrence_counts = combined_df['POS'].value_counts().reset_index()
pos_occurrence_counts.columns = ['POS', 'Occurrence']

# For each position, the sorted, comma-separated list of datasets it appears in
pos_grouped = combined_df.groupby('POS').agg({
    'Dataset': lambda x: ','.join(map(str, sorted(x.unique()))),  # Convert integers to strings before joining
}).rename(columns={'Dataset': 'Datasets'})

# Count distinct healthy and SLE runs (one row kept per 'Run' value)
total_healthy = len(combined_df[combined_df['Disease'] == 'healthy'].drop_duplicates('Run'))
print(total_healthy)
total_sle = len(combined_df[combined_df['Disease'] == 'SLE'].drop_duplicates('Run'))
print(total_sle)

# Count healthy and SLE rows at a position (raw counts, not percentages)
def calculate_percentages(pos, df=None):
    """
    Count the healthy and SLE rows recorded at one position.

    Despite the name, the returned values are raw row counts; no division by
    the cohort totals is performed.

    Parameters:
    pos: value of the 'POS' column to look up
    df: DataFrame with 'POS' and 'Disease' columns; when omitted, the
        notebook-level combined_df is used

    Returns:
    pandas Series with integer entries 'Healthy' and 'SLE'
    """
    if df is None:
        df = combined_df

    # Disease labels of every row at this position
    diseases = df.loc[df['POS'] == pos, 'Disease']

    return pd.Series({
        'Healthy': int((diseases == 'healthy').sum()),
        'SLE': int((diseases == 'SLE').sum())
    })

# Healthy / SLE counts for each position, indexed like pos_grouped
percentages = pd.DataFrame([calculate_percentages(pos) for pos in pos_grouped.index], 
                         index=pos_grouped.index)

# Combine dataset membership with the counts, sort by position, and turn the
# POS index into a regular column so it is an explicit merge key and is kept
# when the table is later written with index=False.
result_df = pd.concat([pos_grouped, percentages], axis=1).sort_index().reset_index()

# Attach the occurrence counts and the per-position aggregated metrics
result_df = result_df.merge(pos_occurrence_counts, on='POS')
result_df = result_df.merge(aggregated_df, on='POS')

# ALT and Run only exist once aggregated_df has been merged in, so the
# clean-up has to happen after the merges.
# Replace ",<NON_REF>" with an empty string
result_df["ALT"] = result_df["ALT"].str.replace(",<NON_REF>", "", regex=False)
# Remove the Run column
result_df = result_df.drop("Run", axis=1)

In [None]:
# Load the aggregated VEP annotation table; the path is relative to the
# notebook's working directory. It is merged on its 'POS' column below.
vep = pd.read_csv('VEP_results_aggregated.csv')

### VEP Merging

In [None]:
# Attach the VEP annotations to each position. merge() defaults to an inner
# join, so positions absent from `vep` are dropped from result_df.
# NOTE: result_df is rebuilt from itself, so running this cell twice merges
# `vep` a second time; re-run the cell that builds result_df first.
result_df = result_df.merge(vep, on='POS')

In [None]:
# Save the merged per-position table (before scoring); the row index is not written
result_df.to_csv("result_df.csv", index = False)

### Scoring and ordering the positions


In [None]:

def calculate_vep_score(row, weights):
    """
    Score one annotated variant on a 1-100 scale.

    The raw score is the highest weight among the variant's consequence terms
    plus fixed bonuses for PolyPhen, CADD, SIFT, allele frequency and
    SLE-only occurrence. It is then multiplied by the row's confidence and
    rescaled.

    Parameters:
    row: mapping or pandas Series with the keys 'Consequence', 'PolyPhen',
         'CADD_PHRED', 'SIFT', 'AF', 'SLE', 'Healthy' and 'confidence'
    weights: dict mapping a consequence term to its base weight; terms that
             are not in the dict count as 0

    Returns:
    float: score between 1 and 100, or NaN when 'confidence' is missing
    """
    # Base score: most heavily weighted of the comma-separated consequences
    consequences = str(row['Consequence']).split(',')
    base_score = max(weights.get(consequence.strip(), 0) for consequence in consequences)
    
    # PolyPhen (missing counts as 0, i.e. no bonus)
    polyphen_score = float(row['PolyPhen']) if pd.notna(row['PolyPhen']) else 0
    if polyphen_score >= 0.447:
        base_score += 10
    
    # CADD (missing counts as 0, i.e. no bonus)
    cadd_phred = float(row['CADD_PHRED']) if pd.notna(row['CADD_PHRED']) else 0
    if cadd_phred >= 20:
        base_score += 10
    elif cadd_phred >= 10:
        base_score += 5
    
    # SIFT: low scores earn the bonus, and 0.0 is a valid (lowest) score, so
    # a missing value is tracked as None instead of being folded into 0.
    # NOTE(review): assumes missing SIFT arrives as NaN, not as a literal 0.
    sift_score = float(row['SIFT']) if pd.notna(row['SIFT']) else None
    if sift_score is not None and 0.0 <= sift_score <= 0.05:
        base_score += 10
    
    # AF: a missing frequency is treated as 0 and therefore earns the rare
    # variant bonus. NOTE(review): confirm this is intended.
    af_score = float(row['AF']) if pd.notna(row['AF']) else 0
    if af_score < 0.01:
        base_score += 10
    
    # SLE vs Healthy: bonus when seen in at least one SLE row and no healthy row
    lupus_score = float(row["SLE"])
    healthy_score = float(row["Healthy"])
    if lupus_score >= 1 and healthy_score == 0:
        base_score += 10
    
    # Apply confidence modifier. A missing confidence must propagate as NaN:
    # min(100, nan) evaluates to 100, which would rank the variant first.
    confidence = row["confidence"]
    if pd.isna(confidence):
        return float("nan")
    base_score = base_score * confidence
    
    # Normalize to 1-100 scale
    # 70 = 20 (largest consequence weight used by this notebook) + 5 bonuses of 10;
    # assumes confidence does not exceed 1
    max_possible_score = 70 
    
    # Scale to 1-100 (with 1 as minimum)
    normalized_score = 1 + (base_score / max_possible_score) * 99
    
    # Ensure the score doesn't exceed 100
    return min(100, normalized_score)

def process_variants(df):
    """
    Score every variant in a DataFrame with calculate_vep_score.

    Parameters:
    df: pandas DataFrame with one variant per row, carrying the columns that
        calculate_vep_score reads

    Returns:
    pandas DataFrame: the same object that was passed in, with a 'score'
    column added (or overwritten) in place
    """
    # Consequence terms grouped by the weight they contribute
    weight_tiers = {
        1: [
            'intergenic_variant',
        ],
        3: [
            'feature_truncation',
            'regulatory_region_variant',
            'feature_elongation',
            'regulatory_region_amplification',
            'regulatory_region_ablation',
            'TF_binding_site_variant',
            'TFBS_amplification',
            'TFBS_ablation',
            'downstream_gene_variant',
            'upstream_gene_variant',
            'non_coding_transcript_variant',
            'NMD_transcript_variant',
            'intron_variant',
            'non_coding_transcript_exon_variant',
        ],
        5: [
            '3_prime_UTR_variant',
            '5_prime_UTR_variant',
            'mature_miRNA_variant',
            'coding_sequence_variant',
            'synonymous_variant',
            'stop_retained_variant',
            'incomplete_terminal_codon_variant',
            'splice_region_variant',
        ],
        10: [
            'protein_altering_variant',
            'missense_variant',
        ],
        15: [
            'inframe_deletion',
            'inframe_insertion',
            'transcript_amplification',
            'start_lost',
            'stop_lost',
        ],
        20: [
            'frameshift_variant',
            'stop_gained',
            'splice_donor_variant',
            'splice_acceptor_variant',
            'transcript_ablation',
        ],
    }

    # Flatten the tiers into the term -> weight lookup used by the scorer
    weights = {
        term: weight
        for weight, terms in weight_tiers.items()
        for term in terms
    }

    # Row-wise scoring; the weights are forwarded as the second argument
    df['score'] = df.apply(calculate_vep_score, axis=1, args=(weights,))

    return df

# process_variants adds the 'score' column to result_df in place and returns
# that same object, so `results` and `result_df` refer to one DataFrame here.
results = process_variants(result_df)


In [None]:
# Rank the variants from highest to lowest score, then remove the annotation
# columns that are not wanted in the final table.
results = (
    results
    .sort_values(by='score', ascending=False)
    .drop(columns=["CLIN_SIG", "ClinPred"])
)
results

In [None]:
# Save the scored table (sorted by score, CLIN_SIG and ClinPred removed);
# the row index is not written
results.to_csv("Final_table_unfiltered.csv", index = False)