## Cleaning the VEP output data

In [None]:
import re
import numpy as np
import pandas as pd

In [None]:
# Function to filter the Variant Effect Predictor (VEP) output

def filter_vep_columns(input_file, output_file=None):
    """Load a tab-separated VEP result file and keep only selected columns.

    In the score columns (SIFT, PolyPhen, AF, ClinPred, CADD_PHRED) every
    cell that is exactly "-" is replaced by 0. All other columns are left
    untouched, so they may still contain "-".

    Args:
        input_file: Path (or file-like object) of the tab-separated VEP file.
        output_file: Optional path; when given, the filtered table is also
            written there as a tab-separated file without the index.

    Returns:
        A new DataFrame holding only the selected columns.

    Raises:
        KeyError: If one of the selected columns is missing from the input.
    """
    # Define the columns we want to keep
    columns_to_keep = [
        "#Uploaded_variation",
        "Consequence",
        "IMPACT",
        "BIOTYPE",
        "Protein_position",
        "Amino_acids",
        "SIFT",
        "PolyPhen",
        "AF",
        "CLIN_SIG",
        "ClinPred",
        "CADD_PHRED",
    ]

    # Define numeric columns where we want to replace "-" with 0
    numeric_columns = [
        "SIFT",
        "PolyPhen",
        "AF",
        "ClinPred",
        "CADD_PHRED",
    ]

    # Read the VEP file
    df = pd.read_csv(input_file, sep='\t')

    # Take an explicit copy: assigning into a frame derived from `df`
    # otherwise triggers pandas' SettingWithCopyWarning.
    filtered_df = df[columns_to_keep].copy()

    # Replace "-" with 0 only in numeric columns
    filtered_df[numeric_columns] = filtered_df[numeric_columns].replace("-", 0)

    # If output file is specified, save the filtered data
    if output_file:
        filtered_df.to_csv(output_file, sep='\t', index=False)

    return filtered_df

# Filter the raw VEP export, write the reduced table to 'filtered_VEP.txt',
# and keep the result in `vep` for the cells below.
vep = filter_vep_columns('VEP_results_new.txt', 'filtered_VEP.txt')
# Bare expression: displays the DataFrame when run as a notebook cell.
vep

In [None]:
# Function for processing SIFT
def process_sift(value):
    """Reduce a raw SIFT annotation to a single numeric score.

    String input is scanned for numbers in parentheses, e.g.
    'deleterious(0.01),tolerated(1)'. Both decimal and integer scores are
    recognised. Zero scores are discarded, and the smallest remaining
    score is returned. Because 0 is also the placeholder for a missing
    annotation, 'deleterious(0)' on its own yields 0.

    Args:
        value: A SIFT annotation string, the placeholder 0 / '0', a missing
            value (None / NaN), or an already numeric score.

    Returns:
        The smallest non-zero score found, or 0 when there is none. Numeric
        input is returned unchanged, so applying the function twice is safe.
    """
    if isinstance(value, str):
        # Extract every number inside parentheses: "(0.01)", "(.5)", "(1)"
        found = [float(num) for num in re.findall(r'\((\d*\.?\d+)\)', value)]
        non_zero = [score for score in found if score != 0]
        # Return minimum value if scores exist, otherwise 0
        return min(non_zero) if non_zero else 0

    # Missing values carry no score
    if pd.isna(value):
        return 0

    # Already numeric (including the 0 placeholder): nothing to parse
    return value

# Apply the function to the SIFT column: each raw annotation in `vep` is
# overwritten in place with the value returned by process_sift.
vep['SIFT'] = vep['SIFT'].apply(process_sift)

In [None]:
# Extract only the position number using string operations: POS is the second
# '_'-separated field of '#Uploaded_variation' (kept as a string).
# NOTE(review): presumably identifiers look like '<chrom>_<pos>_<alleles>' -
# confirm against the VEP input. Grouping on POS alone would also merge rows
# from different chromosomes or alleles that share a position number.
vep['POS'] = vep['#Uploaded_variation'].str.split('_').str[1]
def merge_strings(x):
    """Join the informative values of *x* into one comma-separated string.

    Placeholders (0, '0', '') and missing values (None / NaN) are skipped,
    so they never appear as literal 'nan' or 'None' in the result.

    Args:
        x: Iterable of scalar values (e.g. the rows of one group).

    Returns:
        The remaining values converted to str and joined with ',', or '0'
        when nothing remains.
    """
    # Test for missing first: comparing pd.NA against the placeholders
    # would not give a plain boolean.
    values = [
        str(val)
        for val in x
        if not pd.isna(val) and val not in (0, '0', '')
    ]
    return ','.join(values) if values else '0'

def first_non_zero(x):
    """Return the first element of *x* that is neither 0 nor '0'.

    Falls back to 0 when every element is a zero placeholder or *x* is empty.
    """
    for candidate in x:
        if candidate != 0 and candidate != '0':
            return candidate
    return 0

# Collapse all rows that share the same POS into a single row per position.
# Per-column rules:
#   'first'        -> first non-null value of the group (pandas GroupBy.first)
#   merge_strings  -> comma-joined values of the group ('0' if none remain)
#   first_non_zero -> first value that is not 0 / '0' (0 if there is none)
# NOTE(review): Protein_position and Amino_acids are not in the "-" -> 0
# replacement list of filter_vep_columns, so a "-" placeholder counts as a
# regular value for first_non_zero - confirm this is intended.
aggregated_vep = vep.groupby('POS').agg({
    '#Uploaded_variation': 'first',
    'IMPACT': 'first',
    'BIOTYPE': merge_strings,
    'Consequence': merge_strings,
    'AF': 'first',
    'Protein_position': first_non_zero,
    'Amino_acids': first_non_zero,
    'CLIN_SIG': 'first',
    'ClinPred': 'first',
    'CADD_PHRED': 'first',
    'SIFT': merge_strings,
    'PolyPhen': merge_strings
}).reset_index()

# Bare expression: shows the aggregated table when run as a notebook cell.
aggregated_vep

In [None]:

# Function to extract the numeric value from the PolyPhen string
def extract_polyphen_score(value):
    """Return the numeric PolyPhen score contained in *value*.

    Accepts annotations such as 'benign(0.05)' or 'probably_damaging(1)',
    plain numeric strings, and numbers. Both decimal and integer scores in
    parentheses are recognised. When the string holds several predictions,
    the first score in parentheses is used.

    Args:
        value: PolyPhen annotation, numeric string, number, or missing value.

    Returns:
        The score as a float; 0.0 for missing values, the '0' placeholder,
        and anything that cannot be parsed.
    """
    # Handle null/NaN values
    if pd.isna(value):
        return 0.0  # Return 0 instead of np.nan

    # Convert to string to handle any non-string inputs
    text = str(value)

    if text == '0':  # Handle the '0' placeholder
        return 0.0

    # Number within parentheses: "(0.05)", "(.5)", but also "(0)" and "(1)"
    match = re.search(r'\((\d*\.?\d+)\)', text)
    if match:
        return float(match.group(1))

    # Try direct conversion if it's already a numeric string
    try:
        return float(text)
    except ValueError:
        return 0.0  # Return 0 instead of np.nan

# Apply the function: the merged PolyPhen strings are overwritten in place
# with the values returned by extract_polyphen_score.
aggregated_vep['PolyPhen'] = aggregated_vep['PolyPhen'].apply(extract_polyphen_score)

# If there are any remaining NaN values, fill them with 0
aggregated_vep['PolyPhen'] = aggregated_vep['PolyPhen'].fillna(0.0)

# Check unique values (bare expression: displayed when run as a notebook cell)
aggregated_vep['PolyPhen'].unique()

In [None]:
def extract_sift_score(value):
    """Return the most damaging (lowest) SIFT score contained in *value*.

    *value* may be a number, a numeric string, an annotation such as
    'deleterious(0.01)', or a comma-separated mix of these. Each
    comma-separated item is parsed on its own, and items that carry no
    usable number are ignored.

    Args:
        value: SIFT score(s) in any of the formats above, or a missing value.

    Returns:
        The minimum score as a float, or NaN when *value* is missing or
        contains no parsable score.
    """
    # Handle null/NaN values
    if pd.isna(value):
        return np.nan

    scores = []
    # Convert to string to handle non-string inputs, then parse item by item
    for item in str(value).split(','):
        item = item.strip()
        # Parentheses format like PolyPhen; integer scores such as "(1)" count too
        match = re.search(r'\((\d*\.?\d+)\)', item)
        try:
            score = float(match.group(1) if match else item)
        except ValueError:
            continue
        # A literal "nan" item would make min() depend on item order
        if not np.isnan(score):
            scores.append(score)

    # For SIFT, lower scores indicate more damaging
    return min(scores) if scores else np.nan

# Apply the function: the merged SIFT strings are overwritten in place with
# the single value returned by extract_sift_score.
aggregated_vep['SIFT'] = aggregated_vep['SIFT'].apply(extract_sift_score)

In [None]:
# Get the proper amino acid change in the right format

def transform_amino_acid(amino_acid, position):
    """Format an amino-acid annotation together with its protein position.

    Formats produced:
        'X/Y' -> 'X<position>Y'    (substitution)
        '-/Y' -> 'ins<position>Y'  (insertion)
        'X/*' -> 'X<position>Ter'  (change to a stop codon)
        'X'   -> 'X<position>='    (one standard residue, no change)

    Everything else is returned unchanged: '-', the empty string,
    multi-letter strings without '/', strings with more than one '/',
    and non-string values.

    Args:
        amino_acid: The amino-acid annotation.
        position: Protein position inserted into the formatted string.

    Returns:
        The formatted string, or *amino_acid* unchanged when its format is
        not recognised.
    """
    # Placeholders such as 0 or NaN cannot be parsed; pass them through
    if not isinstance(amino_acid, str):
        return amino_acid

    # Case 1: X/Y format (reference/variant)
    if amino_acid.count('/') == 1:
        ref, change = amino_acid.split('/')
        # Handle the special case of a change from nothing (-) to something
        if ref == '-':
            return f"ins{position}{change}"  # Insertion notation
        # Handle change to stop codon
        if change == '*':
            return f"{ref}{position}Ter"  # Termination notation
        # Standard mutation
        return f"{ref}{position}{change}"

    # Case 2: Single letter (no change indicated). The length check matters:
    # `in` on a string is a substring test, which would otherwise also accept
    # "" and runs such as "AC".
    if len(amino_acid) == 1 and amino_acid in "ACDEFGHIKLMNPQRSTVWY":
        return f"{amino_acid}{position}="

    # Case 3: "-" (no amino acid at that location) and any other unexpected
    # format are returned as is
    return amino_acid

# Apply the transformation row by row (axis=1): each row's Amino_acids value is
# overwritten with transform_amino_acid(Amino_acids, Protein_position).
aggregated_vep['Amino_acids'] = aggregated_vep.apply(
    lambda row: transform_amino_acid(row['Amino_acids'], row['Protein_position']), 
    axis=1
)
# Optional sanity check, left disabled:
# aggregated_vep['Amino_acids'].unique()

In [None]:
# Save the aggregated table as CSV (comma-separated, without the index column)
aggregated_vep.to_csv("VEP_results_aggregated.csv", index = False)