In [None]:
import pandas as pd
import numpy as np
import sys
import re
import functools
import json
import random
import os
import math
import datetime

import matplotlib.pyplot as plt

In [None]:
# Record the wall-clock start time; a later cell subtracts it from the end time to report the running time
start_time = datetime.datetime.now()

In [None]:
# Root folder holding the input data. Later cells build file paths by plain string
# concatenation (data_path + 'sub\\folder\\file'), so the value must end with a path separator.
# NOTE(review): those sub-paths use Windows backslashes, so switching to the Linux path
# below would also require changing the separators used in later cells.

# Linux workstation
#data_path = '/home/db600/phd/data/'

# Laptop
data_path = 'C:\\Users\\dan\\Documents\\phd\\data\\'

# List the contents of the data folder as a quick check that the path is valid
os.listdir(data_path)

In [None]:
# Input files, all located under data_path:
#   mut_path  - DepMap somatic mutation calls
#   exp_path  - DepMap protein-coding gene expression (file name suggests log(TPM + 1) values - TODO confirm)
#   conv_path - Ensembl BioMart export used as the gene converter table
mut_path = data_path + 'depmap\\OmicsSomaticMutations.csv'
exp_path = data_path + 'depmap\\OmicsExpressionProteinCodingGenesTPMLogp1.csv'
conv_path = data_path + 'biomart\\ensembl_biomart_plus_fasta.csv'

In [None]:
# Load the multi-gene converter table: the first CSV column becomes the index
# and the leftover 'Unnamed: 0' column is discarded
conv = (
    pd.read_csv(conv_path, header=0, index_col=0)
    .drop(columns='Unnamed: 0')
)

In [None]:
# Preview the first rows of the converter table
conv.head()

In [None]:
def load_list(path):
    """Load and return the JSON document stored at ``path``.

    Parameters
    ----------
    path : str or path-like
        Location of a JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding (e.g. cp1252 on Windows)
    with open(path, encoding='utf-8') as f:
        return json.load(f)

# Gene lists (kinases, oncogenes, tumour suppressors): each JSON file is loaded
# straight after its path is set
kinases_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\kinases.json"
kinases = load_list(kinases_path)

oncs_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\oncs.json"
oncs = load_list(oncs_path)

tsgs_path = "C:\\Users\\dan\\PycharmProjects\\kinase-onc-tsg\\data\\tsgs.json"
tsgs = load_list(tsgs_path)

In [None]:
# Training cell lines used in the original program (manually saved to CSV earlier).
# The 'cell_line' column is renamed to 'CCLEName' so it can be matched against the DepMap model metadata.
cell_lines = (
    pd.read_csv(data_path + 'dependant\\original_training_cell_lines.csv')
    .rename(columns={'cell_line': 'CCLEName'})
)
cell_lines

In [None]:
# Read the DepMap model metadata
depmap_model = pd.read_csv(data_path + 'depmap\\Model.csv')

# Attach the DepMap ModelID to each of the original training cell lines.
# This is a left join on CCLEName, so a cell line with no match keeps a missing ModelID.
cell_lines = cell_lines.merge(
    depmap_model[['ModelID', 'CCLEName']],
    how='left',
    on='CCLEName',
)

# Show the annotated cell line list
cell_lines

In [None]:
# Load the expression and mutation data.
# low_memory=False makes pandas parse the mutation file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
mutation_df = pd.read_csv(mut_path, low_memory=False)
expression_df = pd.read_csv(exp_path)

In [None]:
# Preview the first rows of the expression data
expression_df.head()

In [None]:
# Preview the first rows of the mutation data
mutation_df.head()

In [None]:
# List the column names available in the mutation data
mutation_df.columns

In [None]:
# Count the mutation rows for each distinct TranscriptStrand value
mutation_df['TranscriptStrand'].value_counts()

In [None]:
# Take a look at the mutation classes (distinct VariantInfo values) before filtering to our cell lines
mutation_df['VariantInfo'].unique()

In [None]:
# Keep only the mutations belonging to our cell lines of interest,
# i.e. the rows whose 'ModelID' appears in cell_lines['ModelID']
mutation_df = mutation_df.loc[mutation_df['ModelID'].isin(cell_lines['ModelID'])]

mutation_df.head()

In [None]:
# Check we've got the right number of ModelID values (should be 39):
# the array shown holds the distinct ModelIDs left after filtering
mutation_df['ModelID'].unique()

In [None]:
# Take another look at the mutation classes, now restricted to the selected cell lines
mutation_df['VariantInfo'].unique()

In [None]:
# First we separate out the badly pathogenic mutation classes - these are assumed to result in loss of function
pathogenic = (
    'FRAME_SHIFT_DEL',
    'FRAME_SHIFT_INS',
    'NONSENSE',
    'NONSTOP',
    'START_CODON_INS',
)

##### NOTE ######
# These are the variations that were considered pathogenic in the original version:
#   ['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation', 'Nonstop_Mutation', 'Stop_Codon_Del']
#
# The tuple above follows that list, with two differences: there is no 'Stop_Codon_Del' class in the
# new data (presumably because it is the same as NONSTOP), and 'START_CODON_INS' is included here.
#
# Classes whose handling is still open:
# IN_FRAME_DEL - this is likely damaging but could be LOF or GOF - select which depending on whether it's an onc, tsg or kinase?
# IN_FRAME_INS - as above
# START_CODON_INS - this is likely to prevent the translation of the protein, so LOF?
# START_CODON_SNP - this may prevent translation if the SNP switches the codon from methionine to another amino acid

# Keep only the rows of mutation_df whose variant classification is in the pathogenic tuple defined above
pathogenic_mutations = mutation_df[mutation_df['VariantInfo'].isin(pathogenic)]

pathogenic_mutations.head()

In [None]:
# See if the DepMap classification agrees that these are all likely to be LOF (it does).
# Count the LikelyGoF flag values among the pathogenic mutations.
pathogenic_mutations['LikelyGoF'].value_counts()

In [None]:
# Count the LikelyLoF flag values among the pathogenic mutations
pathogenic_mutations['LikelyLoF'].value_counts()

In [None]:
# Group by ModelID (the cell-line identifier) so there is one row for each cell line that has at least one
# highly pathogenic mutation, with a column holding a comma-separated list of the affected genes (HugoSymbol).
# Missing gene symbols are dropped before joining, because ', '.join raises a TypeError on NaN values.
path_muts_per_sample = (
    pathogenic_mutations
    .groupby('ModelID')['HugoSymbol']
    .apply(lambda symbols: ', '.join(symbols.dropna().astype(str)))
    .reset_index()
)

path_muts_per_sample

In [None]:
# Write to csv. data_path already ends with a path separator, so the sub-path is appended
# without a leading one (the same way the input paths are built).
path_muts_per_sample.to_csv(data_path + 'dependant\\pathogenic_mutations_per_sample.csv')

In [None]:
# Select the rows of mutation_df where VariantInfo is 'MISSENSE' and VariantType is 'SNP'.
# May also want to include START_CODON_SNP here later (not sure if SNPs in the start codon will be
# covered by the tools that assess mutations - introns only?)
missense_snp = mutation_df.loc[
    mutation_df['VariantInfo'].eq('MISSENSE') & mutation_df['VariantType'].eq('SNP')
]

In [None]:
# Show any missense SNP rows that have no Transcript value
missense_snp[missense_snp['Transcript'].isnull()]

In [None]:
# Missense mutations whose VariantType is DNP or TNP (these are not assessed in the original version)
missense_dnp_tnp = mutation_df.loc[
    mutation_df['VariantInfo'].eq('MISSENSE') & mutation_df['VariantType'].isin(['DNP', 'TNP'])
]
missense_dnp_tnp

In [None]:
# Check which distinct values occur in the chromosome column
missense_snp['Chrom'].unique()

In [None]:
# Create a VCF-style, tab-separated file with the SNP missense mutation data (input for FATHMM-XF)

# Take the needed columns from missense_snp and rename them to the VCF column names.
# rename() returns a new DataFrame, so the column assignment below does not touch missense_snp.
missense_snp_vcf = (
    missense_snp[['Chrom', 'Pos', 'DbsnpID', 'Ref', 'Alt']]
    .rename(columns={'Chrom': '#CHROM', 'Pos': 'POS', 'DbsnpID': 'ID', 'Ref': 'REF', 'Alt': 'ALT'})
)

# Remove the leading 'chr' from the chromosome values - FATHMM-XF won't recognise it.
# The pattern is anchored so that only a prefix is stripped.
missense_snp_vcf['#CHROM'] = missense_snp_vcf['#CHROM'].str.replace('^chr', '', regex=True)

# Write the tab-separated file. Missing values (e.g. a mutation with no dbSNP ID) are written as '.',
# the VCF missing-value marker, instead of an empty field.
# data_path already ends with a path separator, so no leading separator is added to the sub-path.
missense_snp_vcf.to_csv(
    data_path + 'dependant\\depmap_mutations_for_fathmm.vcf',
    sep='\t',
    index=False,
    na_rep='.',
)

print(len(missense_snp_vcf))
missense_snp_vcf.head()

In [None]:
# Distinct transcript IDs found in the missense SNP mutations, as a Series named 'TranscriptID'
missense_snp_transcripts = pd.Series(
    pd.unique(missense_snp['Transcript']),
    name='TranscriptID',
)
missense_snp_transcripts

## AlphaMissense mutation analysis starts here (merge both full AlphaMissense DataFrames with the mutation DataFrame)

### Load and merge the primary data

In [None]:
# Load the alpha missense data (primary assembly).
# skiprows=3 skips the first three lines of the file (presumably header/licence comment lines - TODO confirm)
alpha_missense_primary = pd.read_csv(data_path + 'alphamissense\\AlphaMissense_hg38.tsv', skiprows=3, sep='\t')

In [None]:
# Record and print the number of rows in the primary alpha missense table, then preview the first 20 rows
alpha_missense_primary_len = len(alpha_missense_primary)
print(alpha_missense_primary_len)

alpha_missense_primary.head(20)

In [None]:
# Rename the columns of alpha_missense_primary to match those in missense_snp, which serve as the merge keys
alpha_missense_primary.rename(
    columns={'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt'},
    inplace=True,
)

# Tag the rows so the source of a prediction can be identified after merging
alpha_missense_primary['am_source'] = 'primary'

# Left-merge so that every missense SNP row is kept, with the alpha missense columns
# filled in wherever the variant (Chrom, Pos, Ref, Alt) matches
missense_snp_extended = pd.merge(
    missense_snp,
    alpha_missense_primary[['Chrom', 'Pos', 'Ref', 'Alt', 'transcript_id', 'am_pathogenicity', 'am_class', 'am_source']],
    how='left',
    on=['Chrom', 'Pos', 'Ref', 'Alt'],
)

# Drop the alpha missense table now that it has been merged
del alpha_missense_primary

missense_snp_extended

In [None]:
# There are more records now after merging. Why?
# A left merge repeats a left-hand row once for every right-hand row that has the same key. The key
# used here is only Chrom/Pos/Ref/Alt, so a mutation row is duplicated whenever the alpha missense
# table holds more than one row for that variant (presumably one per transcript_id - TODO confirm).
len(missense_snp_extended)

In [None]:
# Drop the duplicate rows introduced by the merge.
# 'ModelID' is part of the key so that a variant found in several cell lines is kept once per
# cell line; without it, all but the first cell line's row for a shared variant would be discarded.
missense_snp_extended = missense_snp_extended.drop_duplicates(
    subset=['ModelID', 'Chrom', 'Pos', 'Ref', 'Alt', 'am_class'],
    keep="first",
)

In [None]:
# View the merged table after de-duplication
missense_snp_extended

In [None]:
# Keep the rows that still don't have an alpha missense prediction - the isoform data will be checked for these
missing_missense_predictions = missense_snp_extended.loc[missense_snp_extended['am_class'].isna()]

missing_missense_predictions

### Load and merge the isoform data

In [None]:
# Load the alpha missense data (isoforms).
# data_path already ends with a path separator, so the sub-path has no leading one
# (the same way the primary alpha missense file is located).
# skiprows=3 skips the first three lines of the file (presumably header/licence comment lines - TODO confirm)
alpha_missense_isoforms = pd.read_csv(data_path + 'alphamissense\\AlphaMissense_isoforms_hg38.tsv', skiprows=3, sep='\t')

In [None]:
# Record and print the number of rows in the isoform alpha missense table, then preview the first rows
alpha_missense_isoforms_len = len(alpha_missense_isoforms)
print(alpha_missense_isoforms_len)

alpha_missense_isoforms.head()

In [None]:
# Rename the columns of alpha_missense_isoforms to match those in missing_missense_predictions (the merge keys)
alpha_missense_isoforms.rename(
    columns={'#CHROM': 'Chrom', 'POS': 'Pos', 'REF': 'Ref', 'ALT': 'Alt'},
    inplace=True,
)

# Tag the rows so the source of a prediction can be identified after merging
alpha_missense_isoforms['am_source'] = 'isoform'

# These columns were added during the first merge. Drop them so they can be added again;
# otherwise the merge would produce _x / _y suffixed columns.
missing_missense_predictions = missing_missense_predictions.drop(
    columns=['transcript_id', 'am_pathogenicity', 'am_class', 'am_source']
)

# Left-merge the isoform predictions onto the mutations that had no primary prediction
missing_missense_predictions = pd.merge(
    missing_missense_predictions,
    alpha_missense_isoforms[['Chrom', 'Pos', 'Ref', 'Alt', 'transcript_id', 'am_pathogenicity', 'am_class', 'am_source']],
    how='left',
    on=['Chrom', 'Pos', 'Ref', 'Alt'],
)

# The isoform table is not deleted here (the `del` is left commented out)
#del alpha_missense_isoforms

missing_missense_predictions

In [None]:
# Drop the duplicate rows introduced by the isoform merge.
# 'ModelID' is part of the key so that a variant found in several cell lines is kept once per
# cell line; without it, all but the first cell line's row for a shared variant would be discarded.
missing_missense_predictions = missing_missense_predictions.drop_duplicates(
    subset=['ModelID', 'Chrom', 'Pos', 'Ref', 'Alt', 'am_class'],
    keep="first",
)

In [None]:
# Combine the mutations that received a primary alpha missense prediction with the ones that were
# looked up in the isoform data. Rows of missense_snp_extended with no primary prediction are
# excluded first: they are exactly the rows carried in missing_missense_predictions, so keeping
# them would make each of those mutations appear twice in the combined table.
missense_snp_complete = pd.concat(
    [
        missense_snp_extended[missense_snp_extended['am_class'].notna()],
        missing_missense_predictions,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# View the combined table of primary and isoform alpha missense results
missense_snp_complete

In [None]:
# Stop the timer and report the wall-clock time elapsed since start_time was recorded.
# NOTE(review): cells that come after this one are not included in the reported time.
end_time = datetime.datetime.now()

running_time = end_time - start_time

print(f'Running time: {running_time}')

In [None]:
# Function assigns lof and gof label depending on whether protein is onc/tsg/kinase/other
def lof_gof(x):
    """Return the 'lof' or 'gof' label for identifier ``x``.

    Membership is checked against the module-level ``tsgs``, ``oncs`` and
    ``kinases`` collections, in that order of precedence:

    * in ``tsgs``                -> 'lof'
    * in ``oncs`` or ``kinases`` -> 'gof'
    * anything else              -> 'lof'
    """
    # Tumour suppressors win over the other two lists
    if x in tsgs:
        return 'lof'
    return 'gof' if (x in oncs or x in kinases) else 'lof'

In [None]:
# Add a lof_gof column by mapping lof_gof over the 'Protein stable ID' values, so each mutation (row)
# in df3_sm is labelled lof/gof.
# NOTE(review): df3_sm is not defined in any cell visible above, so this cell would raise a NameError on a
# fresh kernel (Restart & Run All) - confirm which DataFrame is meant here.
# NOTE(review): lof_gof tests membership in tsgs/oncs/kinases, so those lists presumably hold the same kind
# of identifier as 'Protein stable ID' - verify.
df3_sm['lof_gof'] = df3_sm['Protein stable ID'].map(lof_gof)