In [None]:
import pandas as pd
#show all columns
pd.set_option("display.max_columns", None)
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
#for ROC curves
from sklearn.metrics import RocCurveDisplay


In [None]:
import sys
sys.version

In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
"""
load the training data set
"""
training_df = pd.read_csv('training_df_v1_2022_02_04.csv')

In [None]:
"""
open cbiomutations and remove whitespace
"""
cbio_cancer_count = pd.read_csv('mutations_for_prediction_final_v1_2022_02_04.csv', low_memory=False, skipinitialspace = True)


In [None]:
len(cbio_cancer_count)

In [None]:
"""
Assign the feature (x) and label (y) columns.
x columns
1) 'Total_Samples' --> total cancer samples with any mutation in the gene
2) 'Specific_Samples' --> cancer sample count with the specific mutation
3) 'pro_len' --> total amino-acid length of the protein
4) 'startPosition' --> start position on the chromosome
5) 'chromasome' --> chromosome (1-X)

"""
# NOTE: the name x_train is rebound later by train_test_split, so re-running the
# dummy-variable cells after the split would operate on the split frame instead.
x_train= training_df.filter(items= ['Total_Samples', 'Specific_Samples', 'pro_len', 'startPosition', 'chromasome'])
# binary target column
y= training_df['hotspot']

In [None]:
"""
get dummy variables
"""
x_dummies= pd.get_dummies(x_train['chromasome'])

In [None]:
"""
merge dummies with training and _for_prection DFs
"""
x_concat = pd.concat([x_train, x_dummies], axis= 'columns')

In [None]:
"""
Drop the original 'chromasome' column (now one-hot encoded) and the 'Y' dummy column.
NOTE(review): dropping one dummy level is presumably meant to avoid
multicollinearity among the dummy columns -- confirm.
"""
x = x_concat.drop(['chromasome', 'Y'], axis= 'columns')


In [None]:
x['M']=0

In [None]:
x=x.filter(items=['Total_Samples', 'Specific_Samples', 'pro_len', 'startPosition', '1',
       '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20',
       '21', '22', '3', '4', '5', '6', '7', '8', '9', 'M', 'X'])

In [None]:
x.columns

## Mutations

In [None]:
"""
select columns for prediction
"""
pred_df_set = cbio_cancer_count.filter(items= ['Total_Samples', 'Specific_Samples', 'pro_len', 'startPosition', 'chromasome'])

In [None]:
pred_df_dummies= pd.get_dummies(pred_df_set['chromasome'])

In [None]:
pred_df_concat = pd.concat([pred_df_set, pred_df_dummies], axis= 'columns')

In [None]:
# Build the prediction matrix with exactly the columns (and column order) of the
# training matrix `x`:
#   - errors='ignore' keeps the cell working when the prediction set has no 'Y' dummy;
#   - reindex() drops any column the model was not trained on and adds, filled
#     with 0, any dummy column that does not occur in the prediction set.
# The fitted estimator is later called on this frame, so its columns must line up
# with the frame used for fitting.
pred_df = (
    pred_df_concat
    .drop(columns=['chromasome', 'Y'], errors='ignore')
    .reindex(columns=x.columns, fill_value=0)
)

In [None]:
pred_df.columns

In [None]:
"""
one hot encording
"""

In [None]:
"""
separate data for training and testing(80% training, 20% testing)
IMPORTANT UN NORMALAZED DATA
"""
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, train_size= 0.8, test_size= 0.2, random_state=42)
len(x)

In [None]:
"""
Standardize the features for the scale-sensitive models
(support vector machine, logistic regression, k-nearest neighbours).
"""
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-feature mean / standard deviation from the training split only.
x_train_i = sc.fit_transform(x_train)
# Apply the *training* statistics to the test split. Calling fit_transform here
# would re-fit the scaler on the test data, leaking test-set statistics and
# putting the two splits on different scales.
x_test_i = sc.transform(x_test)

# SVM support vector machine classifier - UN NORMALIZED

In [None]:
from sklearn.svm import SVC
sv = SVC()

In [None]:
sv.fit(x_train, y_train)

In [None]:
"""
calculate the score unnormalized
"""
sv.score(x_test, y_test)

In [None]:
"""
classification report, SVM
"""
pred_sv= sv.predict(x_test)
print(classification_report(y_test,pred_sv))

### Normalized

In [None]:
sv.fit(x_train_i, y_train)

In [None]:
"""
calculate the score
"""
sv.score(x_test_i, y_test)

In [None]:
"""
classification report, SVM
"""
pred_sv_i= sv.predict(x_test_i)
print(classification_report(y_test,pred_sv_i))

# Random forest classifier-UN NORMALIZED

In [None]:
"""
Use Random Forest Classifier 
"""
from sklearn.ensemble import RandomForestClassifier




In [None]:
"""
Random forest on the raw (un-normalized) features
"""
# random_state pins the bootstrap samples and feature sub-sampling so that the
# reported metrics are reproducible; 42 matches the seed used for the train/test split.
rfc = RandomForestClassifier(n_estimators = 1000, criterion='entropy', random_state=42)
rfc.fit(x_train, y_train)
# class predictions for the held-out 20 %
pred_rfc= rfc.predict(x_test)

In [None]:
"""
classification report, Random Forest
"""
"""
Accuracy- Overall, how often is our model correct?
Accuracy = truepositives+truenegatives/totalsamples

Precision = truepositives/(truepositives+falsepositives)
When the model predicts positive, how often is it correct?- helps to identify falsepositives

recall- helps to identify falsenegatives
recall= truepositives/(truepositives+falseNEGATIVES)

F1 Score- F1 is an overall measure of a model’s accuracy that combines precision and recall
F1 score = 2(precision*recall)/(precision+recall)
"""
print(classification_report(y_test,pred_rfc))

In [None]:
"""
confusion matrix Random forest
"""
cm_rfc = confusion_matrix (y_test,pred_rfc)
cm_rfc

In [None]:
"""
use seaborn for better visualization of confusion matrix
"""
import seaborn as sn
import matplotlib.pyplot as plt
# Explicit figure/axes so the labels and the saved file refer to this plot only.
fig, ax = plt.subplots()
# fmt='d' prints the cell counts as plain integers; the default format ('.2g')
# rounds to two significant digits and switches to scientific notation.
sn.heatmap(cm_rfc, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.savefig('Random_forrest_confusion_matrix.png', dpi = 1200)

### Normalized

In [None]:
rfc.fit(x_train_i, y_train)
"""
calculate the score
"""
rfc.score(x_test_i, y_test)

In [None]:
# Classification report for the random forest after it was refitted on the
# standardized features (x_train_i); predictions use the standardized test split.
pred_rfc_i= rfc.predict(x_test_i)
print(classification_report(y_test,pred_rfc_i))

# Decision Tree - UN NORMALIZED

In [None]:
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')

In [None]:
"""
Un normaized 
"""
dt.fit(x_train, y_train)
dt_pred = dt.predict(x_test)

In [None]:
"""
Classification score, decission tree classifier
"""
print(classification_report(y_test,dt_pred))

In [None]:
"""
confusion matrix decission tree classifier
"""
cm_dt = confusion_matrix (y_test,dt_pred)
cm_dt

In [None]:
"""
calculate the score
"""
dt.score(x_test, y_test)

### NORMALIZED

In [None]:
dt.fit(x_train_i, y_train)
"""
calculate the score
"""
dt.score(x_test_i, y_test)

# Kneighbors classifier - UN NORMALIZED

In [None]:
"""
Kneighbors classifier
"""
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=2, weights= 'distance')
neigh.fit(x_train, y_train)

In [None]:
neigh_pred = neigh.predict(x_test)

In [None]:
"""
Classification report, k-nearest-neighbours classifier (un-normalized features)
"""
print(classification_report(y_test,neigh_pred))

### NORMALIZED

In [None]:
neigh.fit(x_train_i, y_train)
"""
calculate the score
"""
neigh.score(x_test_i, y_test)

In [None]:
neigh= KNeighborsClassifier(n_neighbors=2)
neigh.fit(x_train_i, y_train)

In [None]:
neigh_pred_i = neigh.predict(x_test_i)
"""
Classification score, decission tree classifier
"""
print(classification_report(y_test,neigh_pred_i))

# Logistic Regression - UN NORMALIZED

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='none', class_weight='balanced', solver= 'lbfgs')

In [None]:
lr.fit(x_train, y_train)

In [None]:
lr_pred = lr.predict(x_test)
lr_pred
# For ROC 
y_pred_lg = lr.decision_function(x_test)

In [None]:
lr.score(x_test, y_test)

In [None]:
"""
Classification score, logistic regression
"""
print(classification_report(y_test,lr_pred))

In [None]:
"""
confusion matrix logistic regression
"""
cm_lr = confusion_matrix (y_test,lr_pred)
cm_lr

### NORMALIZED

In [None]:
lr.fit(x_train_i, y_train)

In [None]:
lr_pred_i = lr.predict(x_test_i)
# For ROC 
# y_pred_lg = lr.decision_function(x_test)

In [None]:
lr.score(x_test_i, y_test)

In [None]:
"""
Classification score, logistic regression
"""
print(classification_report(y_test,lr_pred_i))

# ROC, AUC

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay

# RandomForestClassifier does not implement decision_function; the ROC score is
# the predicted probability of the positive class (second column of predict_proba,
# which corresponds to rfc.classes_[1]).
# rfc was last fitted on the standardized training matrix, so it is scored on the
# standardized test matrix.
y_score = rfc.predict_proba(x_test_i)[:, 1]

# pos_label comes from the fitted forest itself (the original referenced an undefined `clf`).
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=rfc.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
from sklearn import metrics

# Overlay the ROC curves of all five classifiers on one axes.
# Each estimator was most recently fitted on the standardized training matrix
# (x_train_i), so every one of them -- including the random forest and the
# decision tree -- has to be evaluated on the standardized test matrix x_test_i.
# Scoring them on the raw x_test would feed features on a different scale than
# the one they were trained on.
ax = plt.gca()
rfc_disp = RocCurveDisplay.from_estimator(rfc, x_test_i, y_test, ax=ax, alpha=0.8)
lr_disp = RocCurveDisplay.from_estimator(lr, x_test_i, y_test, ax=ax, alpha=0.8)
dt_disp = RocCurveDisplay.from_estimator(dt, x_test_i, y_test, ax=ax, alpha=0.8)
neigh_disp = RocCurveDisplay.from_estimator(neigh, x_test_i, y_test, ax=ax, alpha=0.8)
svc_disp = RocCurveDisplay.from_estimator(sv, x_test_i, y_test, ax=ax, alpha=0.8)

# Save the combined figure.
plt.savefig("ROC.png", dpi=600, bbox_inches="tight", pad_inches=1, transparent=True)


# K-fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
"""
cv = number of folds
Random Forest
"""
# from sklearn.ensemble import RandomForestClassifier
rf = cross_val_score(RandomForestClassifier(n_estimators= 1000),x_test, y_test, cv = 10)

In [None]:
"""
calculate the mean accuracy score
"""
np.mean(rf)

In [None]:
"""
decision tree, 10-fold cross validation
"""
from sklearn import tree
# Store the fold scores under their own name: binding them to `tree` would
# replace the imported sklearn module with a NumPy array.
dt_cv_scores = cross_val_score(tree.DecisionTreeClassifier(criterion='entropy', splitter='best'), x_test, y_test, cv = 10)
# mean accuracy over the folds (displayed as the cell output)
np.mean(dt_cv_scores)

In [None]:
"""
SVM
"""
from sklearn.svm import SVC
from sklearn import svm
svm_new = cross_val_score(svm.SVC(C=2.5, kernel='linear'), x_test_i, y_test, cv = 10)
"""
calculate the mean accuracy score
"""
np.mean(svm_new)

In [None]:
"""
logistic regression, 10-fold cross validation
"""
from sklearn.linear_model import LogisticRegression
# Store the fold scores under their own name: binding them to `lr` would
# overwrite the fitted logistic-regression estimator that the ROC cell uses,
# so that cell could no longer be re-run.
lr_cv_scores = cross_val_score(LogisticRegression(penalty='none', class_weight='balanced', solver= 'lbfgs'), x_test_i, y_test, cv =10)
# mean accuracy over the folds (displayed as the cell output)
np.mean(lr_cv_scores)

In [None]:
"""
KNeighbors
"""
from sklearn.neighbors import KNeighborsClassifier
kn = cross_val_score(KNeighborsClassifier(n_neighbors=2, weights= 'distance'), x_test_i, y_test, cv =10)
"""
calculate the mean accuracy score
"""
np.mean(kn)

# prediction

In [None]:
"""
retrain the RFC with training set (all the data)
"""
# """
# assign x, and y values
# x columns 
# 1) 'Total_Samples' --> total cancer samples with any mutation in the gene
# 2) 'Specific_Samples' --> cancer samples count with specific mutations
# 3) startposition --> position of the genome

# """
# x= training_df.filter(items= ['Total_Samples', 'Specific_Samples', 'startPosition' ])
# y= training_df['hotspot']

In [None]:
"""
Use Random Forest Classifier 
"""
from sklearn.ensemble import RandomForestClassifier


In [None]:
"""
Final model: random forest fitted on the complete labelled set (no normalization)
"""
# random_state makes the persisted model and the downstream predictions
# reproducible from a fresh kernel; 42 matches the seed used for the split.
rfc_final = RandomForestClassifier(n_estimators = 1000, random_state=42)
rfc_final.fit(x, y)

# Save Model To a File Using Python Pickle

In [None]:
import pickle
"""
open a file
"""
# wb - write in binary mode
with open('rfc_model_pickle_v1_2022_02_04','wb') as file:
    #dump rfc model into the file
    pickle.dump(rfc_final,file)

#### Load model

In [None]:
# rb -read in binary mode
# SECURITY: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were written by this notebook or another trusted source.
# NOTE: the cells below predict with rfc_final, not with the reloaded rf_pickle.
with open('rfc_model_pickle_v1_2022_02_04','rb') as file:
    rf_pickle = pickle.load(file)

# Prediction

In [None]:
pred_df.head(2)

In [None]:
"""
actual prediction
['Total_Samples', 'Specific_Samples', 'startPosition' ]
"""
pred_rfc = rfc_final.predict(pred_df)

In [None]:
len(pred_rfc)

In [None]:
"""
Add prediction to the all_test DF
"""
hot_prediction_df = pd.concat([cbio_cancer_count, pd.DataFrame(pred_rfc)], axis=1)

# probability

In [None]:
"""
calculate the prediction probabilities
"""
rfc_probability = rfc_final.predict_proba(pred_df)

In [None]:
pred_with_pred_df = pd.concat([hot_prediction_df, pd.DataFrame(rfc_probability)], axis=1)

In [None]:
pred_with_pred_df.head(3)

In [None]:
pred_with_pred_df.columns

In [None]:
# Positional rename: the list must contain exactly one name per existing column,
# in the current column order, otherwise the assignment raises.
# The trailing names label the columns appended by the two pd.concat calls above:
# 'driver_mut' is the predicted class, and 'prob_0' / 'probability' are the
# predict_proba columns (presumably class 0 and class 1 -- confirm against
# rfc_final.classes_).
# NOTE(review): the leading names are assumed to match the column order of the
# mutations CSV; a change in that file would silently mislabel columns.
pred_with_pred_df.columns = [ 'gene_id',         'Total_Samples',
                'mutationType',    'mutproteinPosStart',
            'Specific_Samples',        'Total_Patients',
           'Specific_Patients',             'gene_name',
             'aminoAcidChange',      'mutproteinPosEnd',
             'uniqueSampleKey',      'uniquePatientKey',
          'molecularProfileId',              'sampleId',
                   'patientId',               'studyId',
                      'center',        'mutationStatus',
            'validationStatus',         'tumorAltCount',
               'tumorRefCount',        'normalAltCount',
              'normalRefCount',         'startPosition',
                 'endPosition',       'referenceAllele',
       'functionalImpactScore',              'fisValue',
                    'linkXvar',               'linkPdb',
                     'linkMsa',             'ncbiBuild',
                 'variantType',               'keyword',
                        'chrm',         'variantAllele',
                'refseqMrnaId',             'accession',
                     'pro_len',            'chromasome',
                        'gene',                       'driver_mut',
                             'prob_0',                       'probability']

In [None]:
pred_with_pred_df.head(3)

# Save final prediction as csv

In [None]:
# Write every column, in its current order, without the row index.
cols = list(pred_with_pred_df.columns)
pred_with_pred_df.to_csv(
    'driv_pred_final_v1_2022_02_04.csv',
    columns=cols,
    index=False,
)

In [None]:
pred_with_pred_df[(pred_with_pred_df['probability']>0.85)]

In [None]:
import pandas as pd

In [None]:
driver_mut = pd.read_csv('driv_pred_final_v1_2022_02_04.csv', low_memory=False)

In [None]:
# driver_mut.head(2)

In [None]:
"""['Missense Mutation', "5'Flank", 'Frame Shift', 'Nonsense_Mutation',
       'InFrame Deletions', 'Splice Site', 'Translation_Start_Site',
       'Splice Region', 'InFrame Insersions', 'Targeted_Region',
       'Nonstop_Mutation', 'Intergenic_variant', 'Start_Codon_Ins']"""
#Select Misssense, and nonsense muatations
driver_mut_mis_non = driver_mut.loc[(driver_mut['mutationType']=='Missense Mutation') | 
                               (driver_mut['mutationType']=='Nonsense_Mutation')]

In [None]:
"""
Build the 'AA' label by removing the last character of aminoAcidChange
"""
# driver_mut_mis_non was produced by a .loc filter on driver_mut; assigning a
# column onto it in place triggers SettingWithCopyWarning. assign() returns a
# new frame instead, which also makes this cell safe to re-run.
driver_mut_mis_non = driver_mut_mis_non.assign(
    AA=driver_mut_mis_non['aminoAcidChange'].str[:-1]
)

In [None]:
#Select Frameshifts
driver_mut_fs= driver_mut.loc[driver_mut['mutationType']=='Frame Shift']

In [None]:
# First run of digits in aminoAcidChange (the residue number).
#   - raw string: '\d' in a normal string literal is an invalid escape sequence;
#   - expand=False returns a Series rather than a one-column DataFrame;
#   - assign() avoids writing onto the .loc slice of driver_mut (SettingWithCopyWarning).
driver_mut_fs = driver_mut_fs.assign(
    aa_number=driver_mut_fs['aminoAcidChange'].str.extract(r'(\d+)', expand=False)
)

In [None]:
"""
extract the first character of aminoAcidChange
"""
# assign() returns a new frame, so nothing is written onto a slice of driver_mut.
driver_mut_fs = driver_mut_fs.assign(
    aa_letter=driver_mut_fs['aminoAcidChange'].astype(str).str[0]
)

In [None]:
# 'AA' = first letter followed by the residue number; built with assign() so the
# column is not written in place onto a filtered frame.
driver_mut_fs = driver_mut_fs.assign(
    AA=driver_mut_fs['aa_letter'] + driver_mut_fs['aa_number'].astype(str)
)

In [None]:
"""
Drop the helper columns so the frame-shift frame has the same columns as driver_mut_mis_non
"""
# The source frame is driver_mut_fs; the original read from driver_mut_fs_filt,
# which is not defined before this cell (NameError on a fresh kernel).
driver_mut_fs_filt = driver_mut_fs.drop(['aa_number', 'aa_letter'], axis=1)

In [None]:
# driver_mut_fs_filt.head(2)

In [None]:
"""
select the mutations types that are not frame shift, missense, and nonsense
"""
driver_mut_other = driver_mut.loc[(driver_mut['mutationType']!= 'Missense Mutation')
                                 & (driver_mut['mutationType']!= 'Nonsense_Mutation')
                                 & (driver_mut['mutationType']!= 'Frame Shift')]

In [None]:
# For the remaining mutation types 'AA' is the unmodified aminoAcidChange.
# assign() avoids the in-place write onto a .loc slice of driver_mut.
driver_mut_other = driver_mut_other.assign(AA=driver_mut_other['aminoAcidChange'])

In [None]:
# driver_mut_other.head(2)

In [None]:
"""
concat driver_mut_mis_non, driver_mut_fs_filt, driver_mut_other
"""
frames = [driver_mut_mis_non, driver_mut_fs_filt, driver_mut_other]

driver_mut_filt = pd.concat(frames)

In [None]:
"""
Save as csv
"""
cols = driver_mut_filt.columns.to_list()
driver_mut_filt.to_csv('driv_pred_final_v1_2022_02_15.csv', columns=cols, index=False)

In [None]:
"""
open varient mutations
"""
varient_sample_chr = pd.read_csv('varient_sample_chr_02_16.csv', low_memory= False)

In [None]:
# varient_sample_chr.head(2)

In [None]:
driver_mut_filt = pd.read_csv('driv_pred_final_v1_2022_02_15.csv', low_memory=False)

In [None]:
# driver_mut_filt.head(2)

In [None]:
"""
merge driver_mut_filt and varient_sample_chr
"""
# Join keys, each listed exactly once ('Specific_Samples' appeared twice in the
# original key list).
merge_keys = ['gene_name', 'gene_id', 'mutationType', 'mutproteinPosStart',
              'Specific_Samples', 'Total_Samples', 'chromasome', 'gene']
driv_pred_vari_final = pd.merge(driver_mut_filt, varient_sample_chr, on=merge_keys)

In [None]:
driv_pred_vari_final

In [None]:
GNAS = driv_pred_vari_final.loc[(driv_pred_vari_final['gene_name']== 'GNAS')&
                        (driv_pred_vari_final['mutproteinPosStart'] == 201)]

In [None]:
GNAS.head(10)

In [None]:
driv_pred_vari_final.sort_values(by=['Specific_Samples', 'Varient_Count'],ascending=False)

In [None]:
"""
save as csv
"""
cols= driv_pred_vari_final.columns.to_list()
driv_pred_vari_final.to_csv('driv_pred_vari_final_02_16.csv', columns= cols, index= False)

In [None]:
driv_pred_vari_final = pd.read_csv('driv_pred_vari_final_02_16.csv', low_memory= False)

In [None]:
len(driv_pred_vari_final)

In [None]:
driv_pred_vari_final = driv_pred_vari_final.sort_values(by=['Specific_Samples'], ascending=False).reset_index(drop=True)

In [None]:
sorted_driver_vari_only = driv_pred_vari_final.loc[driv_pred_vari_final['driver_mut']==1]

In [None]:
sorted_driver_vari_only.head(2)

In [None]:
"""
count driver mutations
"""
len(sorted_driver_vari_only)

In [None]:
driver_uniq = sorted_driver_vari_only.filter(items = ['gene_name', 'mutationType', 'mutproteinPosStart', 'driver_mut', 'probability'])

In [None]:
driver_uniq = driver_uniq.drop_duplicates()

In [None]:
len(driver_uniq)

In [None]:
driver_uniq_high = driver_uniq.loc[driver_uniq['probability']>0.95]

In [None]:
len(driver_uniq_high)

In [None]:
len(driver_uniq_high['gene_name'].unique())

In [None]:
"""
load positive training data set
"""
positive_df = pd.read_csv('positive_training_v1_2022_02_04.csv')

In [None]:
"""
rename columns
"""
positive_df = positive_df.rename(columns = {"aa_position": 'mutproteinPosStart'})

In [None]:
sorted_driver_vari_only_filt = sorted_driver_vari_only.filter(items= ['gene_id', 'gene_name', 'mutationType', 'mutproteinPosStart','Total_Samples', 'Specific_Samples', 'Total_Patients', 'Specific_Patients', 'accession', 'sampleId', 'patientId'])

In [None]:
positive_df_filt = positive_df.filter(items= ['gene_id', 'gene_name', 'mutationType', 'mutproteinPosStart','Total_Samples', 'Specific_Samples', 'Total_Patients', 'Specific_Patients', 'accession', 'sampleId', 'patientId'])

In [None]:
"""
select novel driver mutations
"""
# Anti-join: no `on=` is given, so pandas joins on the columns the two frames share.
# indicator=True adds a `_merge` column; keeping only 'left_only' retains the
# predicted-driver rows that have no matching row in positive_df_filt.
novel_driver_df = sorted_driver_vari_only_filt.merge(positive_df_filt, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']

In [None]:
novel_driver_df

# Driver mutations/Start from here

In [None]:
driver_mut_filt = pd.read_csv('driv_pred_final_v1_2022_02_15.csv', low_memory=False)

In [None]:
len(driver_mut_filt)

In [None]:
driver_mut_filt.columns

In [None]:
driver_mut_filt.head(2)

In [None]:
driver_mut_filt_2 =driver_mut_filt.loc[driver_mut_filt['driver_mut']==1]

In [None]:
len(driver_mut_filt_2)

In [None]:
driver_mut_filt_short = driver_mut_filt_2.filter(items = ['gene_id', 'gene_name', 'mutationType', 'mutproteinPosStart', 
                                                        'Total_Samples', 'Specific_Samples', 'Total_Patients', 
                                                        'Specific_Patients', 'aminoAcidChange', 'mutproteinPosEnd', 
                                                        'molecularProfileId', 'sampleId', 'patientId',
                                                        'studyId', 'startPosition', 'endPosition', 'accession', 
                                                        'pro_len', 'chromasome', 'gene', 'driver_mut', 'probability', 'AA'])

In [None]:
driver_mut_filt_short

In [None]:
"""
save as csv for the website
"""
cols = driver_mut_filt_short.columns.to_list()
driver_mut_filt_short.to_csv('DriverMut.csv', columns=cols, index=False)

In [None]:
"""
open DriverMut.csv
"""
DriverMut = pd.read_csv('DriverMut.csv', low_memory=False)

In [None]:
DriverMut.head(2)

In [None]:
len(DriverMut)

In [None]:
DriverMut.loc[DriverMut['gene_name']=='TNK2']

In [None]:
DriverMut['mutationType'].unique()

In [None]:
"""
Number of driver-mutation rows per gene
"""
# rows per gene_name, then back to a two-column frame
per_gene_sizes = DriverMut.groupby(['gene_name']).size()
DriverMut_groupby = per_gene_sizes.reset_index(name='Driver Count*')

In [None]:
DriverMut_groupby

In [None]:
"""
save as csv for WEBsite
"""
cols= DriverMut_groupby.columns.to_list()
DriverMut_groupby.to_csv('DriverMut_num.csv', columns=cols, index=False)

In [None]:
DriverMut_filt = driver_mut_filt_short.filter(items=['gene_id', 'gene_name', 'mutationType', 'mutproteinPosStart', 
                                                        'Total_Samples', 'Specific_Samples', 'Total_Patients', 
                                                        'Specific_Patients', 'startPosition', 'accession', 
                                                        'pro_len', 'chromasome', 'gene', 'driver_mut', 'probability', 'AA'])

In [None]:
DriverMut_filt

In [None]:
DriverMut_filt.loc[DriverMut_filt['probability']>0.9]

In [None]:
DriverMut_filt_no_dup = DriverMut_filt.drop_duplicates(subset=['gene_name', 'mutationType', 'mutproteinPosStart'])

In [None]:
DriverMut_filt_no_dup

In [None]:
driver_mut_filt_no_dup = driver_mut_filt.drop_duplicates()

In [None]:
len(driver_mut_filt_no_dup)

In [None]:
"""
EGFR
"""
# NOTE(review): the label and the variable name say EGFR, but the filter selects
# gene_name == 'NPM1' -- confirm which gene is intended.
egfr= driver_mut_filt.loc[(driver_mut_filt['gene_name']=='NPM1')]

In [None]:
egfr.sort_values(by='Specific_Samples', ascending=False)

In [None]:
driver_mut_filt.loc[(driver_mut_filt['gene_name']=='CHEK1')
                   & (driver_mut_filt['mutproteinPosStart']==471)]

In [None]:
driver_mut_filt['mutationType'].unique()

In [None]:
gene_list = 'ACVR2A', 'APC', 'ASXL1', 'B2M', 'DOCK3', 'EGFR', 'ESRP1', 'EZH2', 'GATA3', 'JAK2', 'LARP4B', 'MAFA', 'MSH3', 'NOTCH1', 'NPM1', 'RNF43', 'RPL22', 'TERT', 'UBR5', 'XYLT2', 'ZBTB20'

In [None]:
ack_lolli = driver_mut_filt.loc[driver_mut_filt['gene_name']== 'TNK2']

In [None]:
JAK2_lolli = driver_mut_filt.loc[driver_mut_filt['gene_name']== 'JAK2']

In [None]:
JAK1_lolli = driver_mut_filt.loc[driver_mut_filt['gene_name']== 'JAK1']

In [None]:
JAK2_lolli.head(2)

In [None]:
"""
save as csv
"""
cols=JAK1_lolli.columns.to_list()
JAK1_lolli.to_csv('JAK1_lollipop_03_03.csv', columns=cols, index=False)

In [None]:
"""
save as csv
"""
cols=JAK2_lolli.columns.to_list()
JAK2_lolli.to_csv('JAK2_lollipop_03_03.csv', columns=cols, index=False)

In [None]:
ack_lolli.head(2)

In [None]:
# ack_lolli.loc[ack_lolli['mutationType']=='Frame Shift']

In [None]:
ack_lolli['mutationType'].unique()

In [None]:
"""
Save as csv
"""
cols=ack_lolli.columns.to_list()
ack_lolli.to_csv('Fig3_ack_lollipop_02_25.csv', columns= cols, index=False)

In [None]:
sorted_driver_mut = driver_mut_filt.sort_values(by=['Specific_Samples'], ascending=False).reset_index(drop=True)

In [None]:
# sorted_driver_mut.loc[sorted_driver_mut['gene_name']=='DNMT3A']

In [None]:
sorted_driver_mut_only = sorted_driver_mut.loc[sorted_driver_mut['driver_mut']==1]


In [None]:
# sorted_driver_mut_only.head(2)

In [None]:
"""
load positive training data set
"""
positive_df = pd.read_csv('positive_training_v1_2022_02_04.csv')

In [None]:
# positive_df.head(2)

In [None]:
"""
rename columns
"""
positive_df = positive_df.rename(columns = {"aa_position": 'mutproteinPosStart'})

In [None]:
# positive_df.columns

In [None]:
sorted_driver_mut_only_filt = sorted_driver_mut_only.filter(items= ['gene_id', 'gene_name', 'mutationType', 'mutproteinPosStart','Total_Samples', 'Specific_Samples', 'Total_Patients', 'Specific_Patients', 'accession', 'sampleId', 'patientId'])


In [None]:
positive_df_filt = positive_df.filter(items= ['gene_id', 'gene_name', 'mutationType', 'mutproteinPosStart','Total_Samples', 'Specific_Samples', 'Total_Patients', 'Specific_Patients', 'accession', 'sampleId', 'patientId'])

In [None]:
# sorted_driver_mut_only_filt.head(5)

In [None]:
# positive_df_filt.head(5)

In [None]:
"""
select novel driver mutations
"""
novel_driver_df = sorted_driver_mut_only_filt.merge(positive_df_filt, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']

In [None]:
novel_driver_df_filt = novel_driver_df.drop_duplicates()

In [None]:
# novel_driver_df = novel_driver_df.drop(['_merge'], axis=1)

In [None]:
novel_driver_df_filt.head(3)

In [None]:
novel_driver_df.columns

In [None]:
"""
merge novel_driver_df with driver_mut_filt
"""
novel_driver_full = pd.merge(novel_driver_df_filt, driver_mut_filt, on = ['gene_id', 'gene_name', 'mutationType', 'mutproteinPosStart','Total_Samples', 'Specific_Samples', 'Total_Patients', 'Specific_Patients', 'accession', 'sampleId', 'patientId'])

In [None]:
novel_driver_full_filt = novel_driver_full.drop_duplicates()

In [None]:
# df.drop_duplicates(subset=['brand'])

In [None]:
len(driver_mut_filt)

In [None]:
len(novel_driver_df)

In [None]:
novel_driver_df_filt = novel_driver_df.drop_duplicates()

In [None]:
len(novel_driver_df_filt)

In [None]:
len(novel_driver_full)

In [None]:
len(novel_driver_full_filt)

In [None]:
novel_driver_full_filt.head(3)

In [None]:
# Fraction of the gene's mutated samples that carry this specific mutation.
# assign() returns a new frame, so the column is not written in place onto the
# result of drop_duplicates() (SettingWithCopyWarning) and the cell can be re-run.
novel_driver_full_filt = novel_driver_full_filt.assign(
    specific_fraction=novel_driver_full_filt['Specific_Samples'] / novel_driver_full_filt['Total_Samples']
)

In [None]:
novel_driver_full_filt.head(2)

In [None]:
"""
save as csv
"""
cols=novel_driver_full_filt.columns.to_list()
novel_driver_full_filt.to_csv('novel_driver_df_v1_02_15.csv', columns=cols, index=False)

In [None]:
"""
filter for ML_ACK1 paper Table 1
"""
novel_driver_table = novel_driver_full_filt.filter(items= ['accession', 'gene_name', 'mutationType', 'mutproteinPosStart', 'AA', 'Specific_Samples', 'Total_Samples', 'probability', 'specific_fraction'])

In [None]:
novel_driver_table.head(2)

In [None]:
"""
save as csv
"""
cols = novel_driver_table.columns.to_list()
novel_driver_table.to_csv('table_1_novel_driver_table_02_15.csv', columns=cols, index=False)

In [None]:
"""
select misssense
"""
novel_missense_driver_table = novel_driver_table.loc[novel_driver_table['mutationType'] =='Missense Mutation']

In [None]:
novel_missense_driver_table

In [None]:
"""
save as csv
"""
cols= novel_missense_driver_table.columns.to_list()
novel_missense_driver_table.to_csv('table1_novel_missense_driver_table_02_15.csv', columns= cols, index=False)

#

# With variations

In [None]:
"""
open varient mutations
"""
varient_sample_chr = pd.read_csv('varient_sample_chr_02_16.csv', low_memory= False)

In [None]:
"""
Open novel missense drivers
"""
novel_missense_driver_table= pd.read_csv('table1_novel_missense_driver_table_02_15.csv')



In [None]:
gene='TP53'
type_m='Missense Mutation'
aa = 213

In [None]:
varient_sample_chr.loc[(varient_sample_chr['gene_name']==gene)
                      & (varient_sample_chr['mutationType']==type_m)
                      & (varient_sample_chr['mutproteinPosStart']== aa)]

In [None]:
len(varient_sample_chr)

In [None]:
varient_sample_chr.columns

In [None]:
varient_sample_chr_filt = varient_sample_chr.filter(items= ['gene_name', 'Total_Samples', 'mutationType',
       'mutproteinPosStart', 'Specific_Samples', 'aminoAcidChange',
       'Varient_Count'])

In [None]:
novel_missense_driver_table.head(2)

In [None]:
novel_missense_driver_varient_table = pd.merge(novel_missense_driver_table, varient_sample_chr_filt, on = ['gene_name', 'Total_Samples', 'mutationType',
       'mutproteinPosStart', 'Specific_Samples'])

In [None]:
novel_missense_driver_varient_table

In [None]:
"""
save as csv
"""
cols = novel_missense_driver_varient_table.columns.to_list()
novel_missense_driver_varient_table.to_csv('table1_novel_missense_driver_varient_table_02_16.csv', columns= cols, index= False)

In [None]:
"""
open functional PTMs
"""
functional_ptm_final = pd.read_csv('functional_ptm_final_v1_2022_02_05.csv', low_memory= False)

In [None]:
# TODO: decide which two frames and which join keys this merge should use.
# pd.merge() requires at least `left` and `right`; the bare call always raised
# TypeError and stopped "Run All", so it is disabled until the inputs are chosen.
# all_driv_ptm = pd.merge(<left>, <right>, on=[...])

# Functional PTM

In [None]:
# """
# open functional PTMs
# """
# functional_ptm_final = pd.read_csv('functional_ptm_final_v1_2022_02_05.csv', low_memory= False)

In [None]:
driver_mut_filt = pd.read_csv('driv_pred_final_v1_2022_02_15.csv', low_memory=False)

In [None]:
all_ptm_protein_lenghs= pd.read_csv('all_ptm_protein_lenghs_03_01.csv', low_memory= False)

In [None]:
all_ptm_protein_lenghs.head(2)

In [None]:
all_ptm_protein_lenghs_filt = all_ptm_protein_lenghs.filter(items=['accession', 'gene_name', 'pro_len'])

In [None]:
all_ptm_protein_lenghs_filt_re = all_ptm_protein_lenghs_filt.rename(columns={'accession':'ACC_ID', 'gene_name':'GENE'})

In [None]:
all_ptm_protein_lenghs_filt_re

In [None]:
"""
remove rows whose ACC_ID contains '-'
"""
# substrings that mark an accession to discard
discard = ['-']

# Keep only the rows whose ACC_ID contains none of the discard substrings.
# na=False treats a missing ACC_ID as "no match", so the mask is purely boolean
# (a NaN in the mask cannot be inverted with `~`).
all_ptm_protein_lenghs_filt_clean = all_ptm_protein_lenghs_filt_re[
    ~all_ptm_protein_lenghs_filt_re['ACC_ID'].str.contains('|'.join(discard), na=False)
]

In [None]:
all_ptm_protein_lenghs_filt_clean

In [None]:
"""
unique genes
"""
len(all_ptm_protein_lenghs_filt_clean['GENE'].unique())

In [None]:
all_ptm_protein_lenghs_filt_gene = all_ptm_protein_lenghs_filt_re.drop_duplicates(subset=['GENE'])

In [None]:
all_ptm_protein_lenghs_filt_gene

In [None]:
"""
unique genes
"""
len(all_ptm_protein_lenghs_filt_re['GENE'].unique())

In [None]:
# Find Rows in DF1 Which Are Not Available in DF2
dup_values = all_ptm_protein_lenghs_filt_re.merge(all_ptm_protein_lenghs_filt_gene, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']



In [None]:
dup_values

In [None]:
# substrings that mark an accession to discard
discard = ['-']

# Keep only the duplicate rows whose ACC_ID contains none of the discard substrings.
# na=False treats a missing ACC_ID as "no match", so the mask is purely boolean
# (a NaN in the mask cannot be inverted with `~`).
clean_dup = dup_values[~dup_values['ACC_ID'].str.contains('|'.join(discard), na=False)]

In [None]:
clean_dup

In [None]:
"""
open functional PTMs
"""
functional_ptm_final = pd.read_csv('functional_ptm_final_v1_2022_02_24.csv', low_memory= False)

In [None]:
functional_ptm_final.loc[(functional_ptm_final['GENE']=='MDM4') &
                        (functional_ptm_final['dtiv_mut_site']==367)]

In [None]:
functional_ptm_final_drop_dup = functional_ptm_final.drop_duplicates(subset = ['GENE', 'MOD_RSD', 'type', 'residue', 'driv_mut_type', 'dtiv_mut_site'])

In [None]:
driver_mut_filt_driv= driver_mut_filt.loc[driver_mut_filt['driver_mut']==1]

In [None]:
driver_mut_filt_driv.head(2)

In [None]:
driver_mut_filt_driv.loc[(driver_mut_filt_driv['gene_name']=='MDM4') &
                        (driver_mut_filt_driv['mutproteinPosStart']==367)]

In [None]:
# Inner-join the de-duplicated PTM records to the predicted drivers on
# (gene, mutation type, protein position); the key columns carry different
# names in the two tables, hence left_on / right_on.
ptm_driv = functional_ptm_final_drop_dup.merge(
    driver_mut_filt_driv,
    left_on=['GENE', 'driv_mut_type', 'dtiv_mut_site'],
    right_on=['gene_name', 'mutationType', 'mutproteinPosStart'],
)

In [None]:
ptm_driv.loc[(ptm_driv['GENE']=='MDM4') &
                        (ptm_driv['dtiv_mut_site']==367)]

In [None]:
ptm_driv.head(2)

In [None]:
ptm_driv_filt = ptm_driv.filter(items=['GENE', 'type', 'residue',  'MOD_RSD', 'mutationType', 'mutproteinPosStart', 'Specific_Samples', 'Total_Samples', 'AA', 'LT_LIT', 'MS_LIT', 'MS_CST', 'CST_CAT', 'probability'])

In [None]:
ptm_driv_filt .loc[(ptm_driv_filt ['GENE']=='MDM4') &
                        (ptm_driv_filt ['mutproteinPosStart']==367)]

In [None]:
gene = 'TP53'
site = 215


In [None]:
ptm_driv_filt.loc[(ptm_driv_filt['GENE'] == gene) & (ptm_driv_filt['residue']== site) & (ptm_driv_filt['mutationType']== 'Missense Mutation' )]

In [None]:
functional_ptm_final_miss = functional_ptm_final_drop_dup.loc[functional_ptm_final_drop_dup['driv_mut_type']== 'Missense Mutation']

In [None]:
common_PTM_driv = functional_ptm_final_miss.groupby(['GENE', 'residue', 'type']).size().reset_index(name='com_ptm_num')




In [None]:
common_PTM_driv= common_PTM_driv.sort_values(by='com_ptm_num', ascending=False)

In [None]:
common_PTM_driv.loc[(common_PTM_driv['GENE']=='MDM4') &
                        (common_PTM_driv['residue']==367)]

In [None]:
"""
keep the PTM sites that have more than one missense driver row (com_ptm_num > 1)
"""
common_PTM_driv_filt= common_PTM_driv.loc[common_PTM_driv['com_ptm_num']>1]

In [None]:
selected_ptm_driv  = pd.merge(ptm_driv_filt, common_PTM_driv_filt, on = ['GENE', 'residue', 'type'])

In [None]:
selected_ptm_driv

In [None]:
selected_ptm_driv_filt= selected_ptm_driv.loc[selected_ptm_driv['mutationType']== 'Missense Mutation']

In [None]:
selected_ptm_driv_filt = selected_ptm_driv_filt.drop_duplicates()

In [None]:
selected_ptm_driv_filt

In [None]:
max_selected_ptm_driv_filt = selected_ptm_driv_filt.groupby(['GENE', 'residue', 'type'])['Specific_Samples'].max().reset_index()

In [None]:
max_selected_ptm_driv_filt

In [None]:
sum_selected_ptm_driv_filt = selected_ptm_driv_filt.groupby(['GENE', 'residue', 'type'])['Specific_Samples'].sum().reset_index(name='Total_cluster_samples')

In [None]:
sum_selected_ptm_driv_filt

In [None]:
max_selected_ptm_driv_filt_new= pd.merge(max_selected_ptm_driv_filt, sum_selected_ptm_driv_filt, on = ['GENE', 'residue', 'type'] )

In [None]:
max_all_ptm_driv = pd.merge(max_selected_ptm_driv_filt_new, selected_ptm_driv_filt, on=['GENE', 'residue', 'type', 'Specific_Samples'])

In [None]:
max_driv_clust = max_all_ptm_driv.loc[max_all_ptm_driv['residue']== max_all_ptm_driv['mutproteinPosStart']]

In [None]:
max_driv_clust = max_driv_clust.drop_duplicates(subset= ['GENE', 'residue', 'type'])

In [None]:
selected_ptm_driv_filt.head(2)

In [None]:
max_driv_clust.head(2)

In [None]:
driver_clusters = pd.merge(max_driv_clust, selected_ptm_driv_filt, on= ['GENE', 'residue', 'type'])

In [None]:
driver_clusters

In [None]:
driver_clusters_filt = driver_clusters.filter(items=['GENE', 'type', 'residue',  'MOD_RSD_x', 'mutationType_x', 'mutproteinPosStart_x', 'Specific_Samples_x', 'Total_Samples_x', 'AA_x', 'LT_LIT_x', 'MS_LIT_x', 'MS_CST_x', 'CST_CAT_x', 'probability_x'])

In [None]:
# Build the mask from the frame that is being indexed, so the lookup does not
# rely on driver_clusters and driver_clusters_filt sharing the same index.
driver_clusters_filt.loc[(driver_clusters_filt['GENE']=='SF3B1') &
                        (driver_clusters_filt['residue']== 700)]

In [None]:
"""
save as csv
"""
cols= max_driv_clust.columns.to_list()
max_driv_clust.to_csv('max_driv_clust.csv', columns=cols, index=False)

In [None]:
len(max_driv_clust)

In [None]:
"""
save as csv
"""
cols= selected_ptm_driv_filt.columns.to_list()
selected_ptm_driv_filt.to_csv('selected_ptm_driv_filt.csv', columns=cols, index=False)

In [None]:
"""
save as csv
"""
cols = common_PTM_driv.columns.to_list()
common_PTM_driv.to_csv('common_PTM_driv.csv', columns=cols, index= False)

In [None]:
"""
merge functional_ptm_final and all_ptm_protein_lenghs_filt_gene (one row per GENE) to add protein lengths
"""
PTM_driver_dis = pd.merge(functional_ptm_final, all_ptm_protein_lenghs_filt_gene, on= ['GENE'])

In [None]:
# PTM_driver_dis

In [None]:
PTM_driver_dis = PTM_driver_dis.rename(columns= {'ACC_ID_y':'ACC_ID'})

In [None]:
PTM_driver_dis.columns

In [None]:
PTM_driver_dis_filt = PTM_driver_dis.filter(items=['GENE', 'ACC_ID', 
        'type', 'residue', 'driv_mut_type',
       'dtiv_mut_site', 'pro_len'])

In [None]:
PTM_driver_dis_filt

In [None]:
PTM_driver_dis_filt_miss = PTM_driver_dis_filt.loc[PTM_driver_dis_filt['driv_mut_type']=='Missense Mutation']

In [None]:
PTM_driver_dis_filt_miss

In [None]:
"""
save as csv
"""
cols=PTM_driver_dis_filt_miss.columns.to_list()
PTM_driver_dis_filt_miss.to_csv('PTM_disrupting_missense_drivers_03_07.csv', columns=cols, index=False)

In [None]:
"""
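read csv (NOTE: this file name lacks the '_03_07' suffix used when the table was saved in the previous cell; confirm it is the intended file)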
"""
PTM_driver_dis_filt_miss= pd.read_csv('PTM_disrupting_missense_drivers.csv', low_memory=False)

In [None]:
# One row per (GENE, ACC_ID) pair.
# The original referenced PTM_disrupting_missense_drivers, which is the CSV's
# file name, not a variable assigned in the visible cells (NameError on a fresh
# run); the frame loaded from that CSV is PTM_driver_dis_filt_miss.
PTM_driver_dis_filt_miss_gene_acc = PTM_driver_dis_filt_miss.drop_duplicates(subset=['GENE', 'ACC_ID'])

In [None]:
PTM_driver_dis_filt_miss_gene_acc

In [None]:
PTM_driver_dis_filt_miss_gene = PTM_driver_dis_filt_miss.drop_duplicates(subset=['GENE'])

In [None]:
PTM_driver_dis_filt_miss_gene

In [None]:
driver_mut_filt = pd.read_csv('driv_pred_final_v1_2022_02_15.csv', low_memory=False)

In [None]:
driver_mut_filt_mis = driver_mut_filt.loc[(driver_mut_filt['mutationType']=='Missense Mutation')
                                         & (driver_mut_filt['driver_mut']==1)]

In [None]:
driver_mut_filt_mis.head(2)

In [None]:
driver_mut_filt_mis_filt = driver_mut_filt_mis.filter(items=['gene_name', 'mutationType', 'mutproteinPosStart', 'Specific_Samples', 'pro_len', 'probability'])

In [None]:
driver_mut_filt_mis_filt = driver_mut_filt_mis_filt.drop_duplicates()

In [None]:
driver_mut_filt_mis_filt

In [None]:
"""
save as csv
"""
cols=driver_mut_filt_mis_filt.columns.to_list()
driver_mut_filt_mis_filt.to_csv('missense_driver_mutations.csv', columns=cols, index= False)

In [None]:
"""
open all the human PTMs
"""
ptm = pd.read_csv('ptm_df_hum.csv', low_memory=False)

In [None]:
ptm

In [None]:
human_ptm_pro_len = pd.merge(ptm, all_ptm_protein_lenghs_filt_re, on= ['GENE', 'ACC_ID'])

In [None]:
human_ptm_pro_len

In [None]:
"""
filter
"""
human_ptm_pro_len_filt = human_ptm_pro_len.filter(items= ['ACC_ID', 'GENE', 'residue', 'type', 'pro_len'] )

In [None]:
human_ptm_pro_len_filt

In [None]:
human_ptm_pro_len_filt_filt = human_ptm_pro_len_filt.drop_duplicates(subset=['GENE', 'residue', 'type'])

In [None]:
human_ptm_pro_len_filt_filt

In [None]:
"""
save as csv
"""
cols=human_ptm_pro_len_filt.columns.to_list()
human_ptm_pro_len_filt.to_csv('human_PTMs_phosphositeplus.csv', columns=cols, index=False)

In [None]:
"""
save the de-duplicated PTM table as csv
"""
cols=human_ptm_pro_len_filt_filt.columns.to_list()
human_ptm_pro_len_filt_filt.to_csv('human_PTMs_phosphositeplus_03_07.csv', columns=cols, index=False)

In [None]:
functional_ptm_final_filt = functional_ptm_final.filter(items= ['ACC_ID', 'GENE', 'residue', 'type', 'driv_mut_type', 'dtiv_mut_site'])

In [None]:
functional_ptm_final_filt = functional_ptm_final.filter(items= ['ACC_ID', 'GENE', 'residue', 'type', 'driv_mut_type', 'dtiv_mut_site'])

In [None]:
functional_ptm_final_filt = functional_ptm_final_filt.rename(columns= {'dtiv_mut_site':'driv_mut_site', 'GENE':'gene_name', 'ACC_ID':'accession'})

In [None]:
functional_ptm_final_filt_miss = functional_ptm_final_filt.loc[functional_ptm_final_filt['driv_mut_type']== 'Missense Mutation']

In [None]:
"""
open protein lengths
"""
prot_len_df = pd.read_csv('protein_len_total.csv', skipinitialspace = True)

In [None]:
prot_len_df= prot_len_df.filter(items= ['accession', 'gene_name', 'pro_len'])

In [None]:
"""
merge
"""
functional_ptm_final_filt_miss = pd.merge(functional_ptm_final_filt_miss, prot_len_df, on=['accession', 'gene_name'] )

In [None]:
functional_ptm_final_filt_miss

In [None]:
functional_ptm_final_filt_miss

In [None]:
driver_mut_filt_miss

In [None]:
# NOTE(review): in the visible cells driver_mut_filt_miss is only assigned
# further down (after the driver CSV is re-read); the equivalent frame built
# earlier is named driver_mut_filt_mis. Unless it is defined earlier in the
# notebook, a fresh top-to-bottom run may raise NameError here — confirm.
driver_mut_filt_miss_filt = driver_mut_filt_miss.filter(items=['gene_name', 'mutationType', 'mutproteinPosStart', 'pro_len', 'probability'])

In [None]:
driver_mut_filt_miss_filt = driver_mut_filt_miss_filt.rename(columns={'mutproteinPosStart':'dtiv_mut_site', 'mutationType':'driv_mut_type'})

In [None]:
driver_mut_filt_miss_filt

In [None]:
"""
save as csv
"""
cols=driver_mut_filt_miss_filt.columns.to_list()
driver_mut_filt_miss_filt.to_csv('cancer_driver_missense_mutations.csv', columns=cols, index=False)

In [None]:
"""
open csv
"""
driver_mut_filt_miss_filt = pd.read_csv('cancer_driver_missense_mutations.csv', low_memory=False)

In [None]:
driver_mut_filt_miss_filt

In [None]:
"""
open all the human PTMs
"""
ptm_df_hum= pd.read_csv('ptm_df_hum.csv', low_memory=False)

In [None]:
"""
merge functional_ptm_final and all_ptm_protein_lenghs_filt_re to add protein lengths
"""
PTM_driver_dis = pd.merge(functional_ptm_final, all_ptm_protein_lenghs_filt_re, on= ['GENE', 'ACC_ID'])

In [None]:
len(functional_ptm_final)

In [None]:
"""
Identify PTMs disrupted by missense mutations
"""
Miss_functional_ptm_final_filt = functional_ptm_final.loc[functional_ptm_final['driv_mut_type'] == 'Missense Mutation']

In [None]:
Miss_functional_ptm_final_filt_Driv = Miss_functional_ptm_final_filt.filter(items=['GENE', 'driv_mut_type', 'dtiv_mut_site'])

In [None]:
Miss_functional_ptm_final_filt_Driv.head(2)

In [None]:
Miss_functional_ptm_final_filt_Driv = Miss_functional_ptm_final_filt_Driv.drop_duplicates()

In [None]:
len(Miss_functional_ptm_final_filt_Driv)

In [None]:
Miss_functional_ptm_final_filt_Driv['driv_mut_type'].unique()

In [None]:
"""
identify unique PTMs 

"""
functional_ptm_final_filt = Miss_functional_ptm_final_filt.drop(columns=['driv_mut_type', 'dtiv_mut_site'])


In [None]:
"""
unique PTMs
"""
functional_ptm_final_filt = functional_ptm_final_filt.drop_duplicates()

In [None]:
len(functional_ptm_final_filt)

In [None]:
"""
Missense mutations/PTMs
"""
functional_drivers_per_PTM = len(Miss_functional_ptm_final_filt_Driv)/len(functional_ptm_final_filt)

In [None]:
functional_drivers_per_PTM

In [None]:
ptm_df_hum.head(2)

In [None]:
len(ptm_df_hum)

In [None]:
"""
read all the driver mutations
"""
driver_mut_filt = pd.read_csv('driv_pred_final_v1_2022_02_15.csv', low_memory=False)

In [None]:
driver_mut_filt_miss = driver_mut_filt.loc[(driver_mut_filt['mutationType'] == 'Missense Mutation') &  
                                          (driver_mut_filt['driver_mut'] == 1)]

In [None]:
ACK1_mut = driver_mut_filt.loc[(driver_mut_filt['gene_name'] == 'TNK2')]

In [None]:
ACK1_mut_sort = ACK1_mut.sort_values(by= 'Specific_Samples', ascending= False)

In [None]:
ACK1_mut_sort.head(2)

In [None]:
ACK1_mut_sort.columns

In [None]:
ACK1_filt = ACK1_mut_sort.filter(items=['gene_name', 'mutationType', 'mutproteinPosStart',
       'Specific_Samples', 'gene',
       'driver_mut', 'probability', 'AA'])

In [None]:
ACK1_filt.head(10)

In [None]:
driver_mut_filt_miss= driver_mut_filt_miss.drop_duplicates()

In [None]:
len(driver_mut_filt_miss)

In [None]:
"""
Missense mutations/PTMs
"""
functional_drivers_per_PTM = len(Miss_functional_ptm_final_filt_Driv)/len(functional_ptm_final_filt)

In [None]:
functional_drivers_per_PTM

In [None]:
"""
Expected drivers/PTMs
"""
expected_drivers_per_PTM = len(driver_mut_filt_miss)/len(ptm_df_hum)

In [None]:
expected_drivers_per_PTM

In [None]:
"""
percentage of PTMs disrupted by drivers / total PTMs
"""
PTM_fraction = len(functional_ptm_final_filt)/len(ptm_df_hum)*100

In [None]:
PTM_fraction

In [None]:
"""
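percentage of PTM-disrupting drivers / total drivers (NOTE(review): the denominator below is the sum of Specific_Samples, i.e. a sample count, not the number of driver mutations; confirm which is intended)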
"""
driver_fraction = len(Miss_functional_ptm_final_filt_Driv)/(driver_mut_filt_miss['Specific_Samples'].sum())*100

In [None]:
"""
calculate the ratio of driv_PTMs/expected
"""
ratio_driv_PTM_to_expected = functional_drivers_per_PTM/expected_drivers_per_PTM

In [None]:
ratio_driv_PTM_to_expected

In [None]:
"""
identify the missense driver mutations (GENE, mutation type, site) that disrupt a functional PTM

"""
functional_ptm_missense_driv = Miss_functional_ptm_final_filt.filter(items=['GENE', 'driv_mut_type', 'dtiv_mut_site'])



In [None]:
functional_ptm_missense_driv = functional_ptm_missense_driv.drop_duplicates()

In [None]:
len(functional_ptm_missense_driv)

In [None]:
functional_ptm_final['driv_mut_type'].unique()

In [None]:
funct_mis_ptm_final = functional_ptm_final.loc[functional_ptm_final['driv_mut_type']=='Missense Mutation']

In [None]:
funct_mis_ptm_final.head(2)

In [None]:
len(funct_mis_ptm_final)

In [None]:
funct_mis_ptm_final = funct_mis_ptm_final.rename(columns={'GENE':'gene_name',
                                                         'driv_mut_type': 'mutationType', 
                                                         'dtiv_mut_site': 'mutproteinPosStart'})

In [None]:
novel_missense_driver_varient_table.head(2)

In [None]:
"""
merge
"""
miss_novel_driv_ptm = pd.merge(novel_missense_driver_varient_table, funct_mis_ptm_final, on = ['gene_name', 'mutationType',
                                                                                              'mutproteinPosStart'] )

In [None]:
miss_novel_driv_ptm.head(2)

In [None]:
miss_novel_driv_ptm.columns

In [None]:
"""
save as csv all the columns
"""
cols= miss_novel_driv_ptm.columns.to_list()
miss_novel_driv_ptm.to_csv('miss_novel_driv_ptm_02_16.csv', columns=cols, index=False)

In [None]:
miss_novel_driv_ptm_filt = miss_novel_driv_ptm.filter(items = ['accession', 'gene_name', 'mutationType', 
                                                               'mutproteinPosStart', 'AA','aminoAcidChange', 
                                                               'Varient_Count', 'MOD_RSD',
                                                               'Specific_Samples', 'Total_Samples', 'probability', 
                                                               'specific_fraction',
                                                               'PROTEIN', 'DOMAIN',
                                                               'LT_LIT', 'MS_LIT', 'MS_CST', 
                                                               'type', 'residue'])

In [None]:
miss_novel_driv_ptm_filt.head(2)

In [None]:
len(miss_novel_driv_ptm_filt)

In [None]:
"""
save as csv table
"""
cols= miss_novel_driv_ptm_filt.columns.to_list()
miss_novel_driv_ptm_filt.to_csv('table3_miss_novel_driv_ptm_filt_02_16.csv', columns=cols, index= False)

In [None]:
miss_novel_driv_ptm_uniq = miss_novel_driv_ptm_filt.filter(items= ['accession', 'gene_name', 'mutationType', 
                                                               'mutproteinPosStart', 'AA','MOD_RSD',
                                                               'Specific_Samples', 'Total_Samples', 'probability', 
                                                               'specific_fraction',
                                                               'PROTEIN', 'DOMAIN',
                                                               'LT_LIT', 'MS_LIT', 'MS_CST', 
                                                               'type', 'residue'])

In [None]:
miss_novel_driv_ptm_uniq = miss_novel_driv_ptm_uniq.drop_duplicates()

In [None]:
len(miss_novel_driv_ptm_uniq)

In [None]:
miss_novel_driv_ptm_uniq.head(2)

In [None]:
"""
save as csv Table
"""
cols= miss_novel_driv_ptm_uniq.columns.to_list()
miss_novel_driv_ptm_uniq.to_csv('table3_miss_novel_driv_ptm_uniq_02_16.csv', columns=cols, index= False)

In [None]:
miss_novel_driv_ptm_filt = pd.read_csv('table3_miss_novel_driv_ptm_filt_02_16.csv', low_memory=False)

In [None]:
miss_novel_driv_ptm_filt.head(2)

# Figure 2: collect data for the lollipop graph

In [None]:
"""
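open all PTM-disrupting driver mutations (this CSV is written further down in this notebook, so it must already exist on disk for a top-to-bottom run)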
"""
miss_all_driv_ptm_uniq = pd.read_csv('table3_extra_miss_all_driv_ptm_filt_uniq_02_15.csv')

In [None]:
sum_list = ['LT_LIT', 'MS_LIT', 'MS_CST']

In [None]:
miss_all_driv_ptm_uniq['PTM_references'] = miss_all_driv_ptm_uniq[sum_list].sum(axis=1)

In [None]:
# miss_novel_driv_ptm_uniq = pd.read_csv('table3_miss_novel_driv_ptm_uniq_02_16.csv')

In [None]:
miss_all_driv_ptm_uniq.head(2)

In [None]:
def my_function(fname):
    """Print ``fname`` followed by the fixed surname " Refsnes"."""
    full_name = fname + " Refsnes"
    print(full_name)

my_function("Emil")
my_function("Tobias")
my_function("Linus")

In [None]:
def miss_novel_ptm(gene_name, source_df=None):
    """Write the two Figure 2 CSV files (mutation table and PTM table) for one gene.

    Parameters
    ----------
    gene_name : str
        Matched exactly against the ``gene_name`` column and embedded in both
        output file names.
    source_df : pandas.DataFrame, optional
        Table providing the columns read below (``gene_name``, ``AA``,
        ``mutproteinPosStart``, ``Specific_Samples``, ``residue``, ``type``,
        ``PTM_references``). When omitted, the notebook-level
        ``miss_all_driv_ptm_uniq`` is used, so existing one-argument calls
        behave as before.

    Returns
    -------
    None
        Two files are written to the current working directory:
        ``Fig2_<gene_name>_df_cancer_02_19.csv`` and
        ``Fig2_<gene_name>_ptm_pivot_filt_02_19.csv``.
    """
    if source_df is None:
        # Fall back to the notebook-level table, as the original did implicitly.
        source_df = miss_all_driv_ptm_uniq

    df = source_df.loc[source_df['gene_name'] == gene_name]

    # --- mutation side: distinct (gene, AA label, position, sample count) rows
    df_cancer = df.filter(items=['gene_name', 'AA', 'mutproteinPosStart', 'Specific_Samples'])
    df_cancer = df_cancer.drop_duplicates()
    # Save as csv (Fig)
    df_cancer.to_csv('Fig2_' + gene_name + '_df_cancer_02_19.csv', index=False)

    # --- PTM side
    ptm = df.filter(items=['gene_name', 'AA', 'residue', 'type', 'PTM_references'])
    ptm = ptm.drop_duplicates()
    # Sum the reference counts per (gene, residue, PTM type).
    # NOTE(review): 'AA' is still a column when duplicates are dropped, so a
    # PTM that appears with several AA labels contributes once per label to
    # this sum — confirm that is intended.
    ptm = ptm.groupby(['gene_name', 'residue', 'type'])['PTM_references'].agg('sum').reset_index()
    # One row per residue, one column per PTM type.
    ptm_pivot = ptm.pivot(index='residue', columns='type', values='PTM_references').reset_index()

    ptm_pivot['gene_name'] = gene_name
    # filter(items=...) keeps only the listed columns that exist, in this order.
    # 'AA' was listed originally but is no longer a column after the groupby
    # and pivot above, so it never reached the output.
    ptm_pivot_filt = ptm_pivot.filter(items=['gene_name', 'residue', 'Phosphorylation', 'Acetylation',
                                             'Ubiquitination', 'Methylation', 'Sumoylation', 'O_GlcNAc'])
    # Save as csv (Fig)
    ptm_pivot_filt.to_csv('Fig2_' + gene_name + '_ptm_pivot_filt_02_19.csv', index=False)
    
    
    
    
    



In [None]:
"""
1. EZH2
"""
miss_novel_ptm('EZH2')

In [None]:
"""
2. CHEK2
"""
miss_novel_ptm('CHEK2')

In [None]:
"""
3.CD79B
"""
miss_novel_ptm('CD79B')

In [None]:
"""
4.H3-3A
"""
miss_novel_ptm("H3-3A")

In [None]:
"""
5.KRT8
"""
miss_novel_ptm('KRT8')

In [None]:
"""
6.EEF1B2
"""
miss_novel_ptm('EEF1B2')

In [None]:
"""
7.MDM4
"""
miss_novel_ptm('MDM4')

In [None]:
"""
8.CTNNB1
"""
miss_novel_ptm('CTNNB1')

In [None]:
# Select the EZH2 rows. The original filtered on 'CTNNB1', but every
# downstream cell labels this frame as EZH2 (the gene_name column is set to
# 'EZH2' and both output files are named EZH2), so CTNNB1 data would have been
# saved under the EZH2 name.
EZH2_df = miss_all_driv_ptm_uniq.loc[miss_all_driv_ptm_uniq['gene_name']== 'EZH2']

In [None]:
EZH2_df.sort_values(by= 'Specific_Samples', ascending=False)

In [None]:
EZH2_df_cancer = EZH2_df.filter(items = ['gene_name', 'AA', 'mutproteinPosStart', 'Specific_Samples'])

In [None]:
EZH2_df_cancer = EZH2_df_cancer.drop_duplicates()

In [None]:
EZH2_df_cancer.head(2)

In [None]:
"""
Save as csv Fig
"""         
cols = EZH2_df_cancer.columns.to_list()
EZH2_df_cancer.to_csv('EZH2_df_cancer.csv', columns=cols, index=False)

In [None]:
EZH2_ptm = EZH2_df.filter(items = ['gene_name', 'AA', 'residue', 'type', 'PTM_references'])

In [None]:
EZH2_ptm = EZH2_ptm.drop_duplicates()

In [None]:
EZH2_ptm

In [None]:
EZH2_ptm = EZH2_ptm.groupby(['gene_name', 'AA', 'residue','type'])['PTM_references'].agg('sum').reset_index()

In [None]:
EZH2_ptm

In [None]:
EZH2_ptm_pivot = EZH2_ptm.pivot(index='residue', columns='type', values='PTM_references').reset_index()

In [None]:
EZH2_ptm_pivot['gene_name'] = 'EZH2'

In [None]:
miss_all_driv_ptm_uniq['type'].unique()

In [None]:
EZH2_ptm_pivot_filt = EZH2_ptm_pivot.filter(items=['gene_name', 'residue', 'Phosphorylation', 'Acetylation', 'Ubiquitination', 'Methylation',
       'Sumoylation', 'O_GlcNAc'])

In [None]:
EZH2_ptm_pivot_filt.columns

In [None]:
EZH2_ptm_pivot_filt

In [None]:
"""
Save as csv Fig
"""
cols = EZH2_ptm_pivot_filt.columns.to_list()
EZH2_ptm_pivot_filt.to_csv('Fig2_EZH2_ptm_pivot_filt.csv', columns=cols, index=False)

In [None]:
"""
select drivers located at the same position as the PTM
"""
miss_novel_driv_ptm_uniq = pd.read_csv('table3_miss_novel_driv_ptm_uniq_02_16.csv')

In [None]:
miss_novel_driv_ptm_uniq.head(2)

In [None]:
miss_novel_driv_ptm_uniq_exact = miss_novel_driv_ptm_uniq.loc[miss_novel_driv_ptm_uniq['mutproteinPosStart']== miss_novel_driv_ptm_uniq['residue']]


In [None]:
miss_novel_driv_ptm_uniq_exact.head(20)

# Functional PTM + all the drivers

In [None]:
varient_sample_chr.head(2)

In [None]:
varient_sample_chr.loc[(varient_sample_chr['gene_name']=='MDM4')& (varient_sample_chr['mutproteinPosStart']==367)]

In [None]:
funct_mis_ptm_final.head(2)

In [None]:
"""
Open all the drivers
"""
pred_with_pred_df = pd.read_csv('driv_pred_final_v1_2022_02_04.csv', low_memory = False)

In [None]:
pred_with_pred_df.head(1)

In [None]:
varient_sample_chr.columns

In [None]:
pred_with_pred_df.columns

In [None]:
pred_with_pred_df_filt = pred_with_pred_df.filter(items= ['gene_name', 'gene_id', 'Total_Samples', 'mutationType',
       'mutproteinPosStart', 'Specific_Samples', 'probability' ] )

In [None]:
pred_with_pred_df_filt.head(2)

In [None]:
# Keep only the predictions whose probability is strictly greater than 0.5.
driv_pred_with_pred_df_filt = pred_with_pred_df_filt.loc[
    pred_with_pred_df_filt['probability'].gt(0.5)
]

In [None]:
driv_vari_df = pd.merge(varient_sample_chr, driv_pred_with_pred_df_filt, on = ['gene_name', 'gene_id', 'Total_Samples', 'mutationType',
       'mutproteinPosStart', 'Specific_Samples'])

In [None]:
driv_vari_df.head(2)

In [None]:
funct_mis_ptm_final.head(2)

In [None]:
"""
identify PTM-disrupting drivers
"""
all_driv_ptm = pd.merge(driv_vari_df, funct_mis_ptm_final, on = ['gene_name', 'mutationType',
                                                                 'mutproteinPosStart'])

In [None]:
all_driv_ptm.head(2) 

In [None]:
all_driv_ptm['amino_acid'] = all_driv_ptm['aminoAcidChange'].astype(str).str[0]

In [None]:
all_driv_ptm['AA'] = all_driv_ptm['amino_acid'] + all_driv_ptm['mutproteinPosStart'].astype(str)

In [None]:
all_driv_ptm.head(2)

In [None]:
"""
save as csv
"""
cols = all_driv_ptm.columns.to_list()
all_driv_ptm.to_csv('miss_all_driv_ptm_02_16.csv', columns=cols, index= False)

In [None]:
"""
read the saved csv back
"""

all_driv_ptm= pd.read_csv('miss_all_driv_ptm_02_16.csv')

In [None]:
all_driv_ptm.head(2)

In [None]:
all_driv_ptm_filt = all_driv_ptm.filter(items = ['accession', 'gene_name', 'mutationType', 
                                                               'mutproteinPosStart', 'AA','aminoAcidChange', 
                                                               'Varient_Count', 'MOD_RSD',
                                                               'Specific_Samples', 'Total_Samples', 'probability', 
                                                               'specific_fraction',
                                                               'PROTEIN', 'DOMAIN',
                                                               'LT_LIT', 'MS_LIT', 'MS_CST', 
                                                               'type', 'residue'])

In [None]:
all_driv_ptm_filt

In [None]:
gene= 'TP53'
aa = 110

In [None]:
all_driv_ptm_filt.loc[(all_driv_ptm_filt['gene_name']==gene)& (all_driv_ptm_filt['mutproteinPosStart']== aa)]

In [None]:
"""
save as csv Table
"""
cols= all_driv_ptm_filt.columns.to_list()
all_driv_ptm_filt.to_csv('table3_extra_miss_all_driv_ptm_02_16.csv', columns= cols, index= False)

In [None]:
len(all_driv_ptm_filt)

In [None]:
all_driv_ptm_filt_uniq = all_driv_ptm_filt.filter(items= ['accession', 'gene_name', 'mutationType', 
                                                               'mutproteinPosStart', 'AA','MOD_RSD',
                                                               'Specific_Samples', 'Total_Samples', 'probability', 
                                                               'specific_fraction',
                                                               'PROTEIN', 'DOMAIN',
                                                               'LT_LIT', 'MS_LIT', 'MS_CST', 
                                                               'type', 'residue'])

In [None]:
len(all_driv_ptm_filt_uniq)

In [None]:
all_driv_ptm_filt_uniq = all_driv_ptm_filt_uniq.drop_duplicates()

In [None]:
all_driv_ptm_filt_uniq.head(2)

In [None]:
"""
save as csv Table
"""
cols= all_driv_ptm_filt_uniq.columns.to_list()
all_driv_ptm_filt_uniq.to_csv('table3_extra_miss_all_driv_ptm_filt_uniq_02_15.csv', columns=cols, index= False)

## Table for pie chart - PTM-disrupting missense drivers

In [None]:
import pandas as pd

In [None]:
"""
open
"""
all_driv_ptm_filt_uniq = pd.read_csv('table3_extra_miss_all_driv_ptm_filt_uniq_02_15.csv')


In [None]:
all_driv_ptm_filt_uniq.head(2)

In [None]:
"""
PTMs disrupted by driver mutations
"""
all_driv_PTM_count = all_driv_ptm_filt_uniq.groupby(['type']).size().reset_index(name='PTM Count')

In [None]:
all_driv_PTM_count

# NRTK PTM-disrupting drivers

In [None]:
"""
open functional PTMs
"""
functional_ptm_final = pd.read_csv('functional_ptm_final_v1_2022_02_05.csv', low_memory= False)

In [None]:
functional_ptm_final.head(2)

In [None]:
"""
Open all the drivers
"""
pred_with_pred_df = pd.read_csv('driv_pred_final_v1_2022_02_04.csv', low_memory = False)

In [None]:
pred_with_pred_df['specific_fraction'] = (pred_with_pred_df['Specific_Samples']/pred_with_pred_df['Total_Samples'])*100

In [None]:
pred_with_pred_df = pred_with_pred_df.round({'specific_fraction': 1})

In [None]:
"""
select drivers
"""
driver_df = pred_with_pred_df.loc[pred_with_pred_df['driver_mut']== 1]

In [None]:
driver_df.head(2)

In [None]:
len(driver_df )

In [None]:
nrtk_list = pd.read_csv('NRTK_list.csv')

In [None]:
nrtk_list['HUGO symbol'] = nrtk_list['HUGO symbol'].str.upper()

In [None]:
"""Identify driver NRTKs
"""
# Inner-join the NRTK list to the driver table: the upper-cased HUGO symbol on
# the left is matched against gene_name on the right.
driv_nrtks_df = nrtk_list.merge(
    driver_df,
    left_on='HUGO symbol',
    right_on='gene_name',
)


In [None]:
driv_nrtks_df['gene_name'].unique()

In [None]:
"""
sort
"""
driv_nrtks_df = driv_nrtks_df.sort_values(by='Specific_Samples', ascending=False)

In [None]:
driv_nrtks_df.sort_values(by = 'probability', ascending= False)

In [None]:
"""
save as csv FIG
"""
cols=driv_nrtks_df.columns.to_list()
driv_nrtks_df.to_csv('fig3_driv_nrtks_df_02_23.csv', columns= cols, index=False)

In [None]:
"""
merge nrtk_list and ptms
"""
nrtk_PTMs = pd.merge(nrtk_list, functional_ptm_final, left_on = ['HUGO symbol'], right_on=['GENE'])

In [None]:
nrtk_PTMs.head(2)

In [None]:
nrtk_PTMs['driv_mut_type'].unique()

In [None]:
nrtk_PTMs.columns

In [None]:
nrtk_PTMs_Misse = nrtk_PTMs.loc[nrtk_PTMs['driv_mut_type']=='Missense Mutation']

In [None]:
nrtk_PTMs_Misse

In [None]:
nrtk_PTMs_filt = nrtk_PTMs.filter(items= ['FAMILY', 'HUGO symbol', 'GENE', 'PROTEIN', 'ACC_ID', 'HU_CHR_LOC',
       'MOD_RSD', 'SITE_GRP_ID', 'ORGANISM', 'MW_kD', 'DOMAIN', 'SITE_+/-7_AA',
       'LT_LIT', 'MS_LIT', 'MS_CST', 'CST_CAT#', 'type', 'residue'])

In [None]:
nrtk_PTMs_filt = nrtk_PTMs_filt.drop_duplicates()

In [None]:
nrtk_PTMs_filt.head(2)

In [None]:
nrtk_PTMs_filt['GENE'].unique()