In [2]:
import numpy as np
import pandas as pd 
import matplotlib as plt 
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


# For the WGS data 

# For the somatic mutations 

In [3]:
df_somatic_mutations = pd.read_csv("./project_data/catalogs/WGS/WGS_PCAWG.96.csv")
df_somatic_mutations = df_somatic_mutations.set_index (df_somatic_mutations['Mutation type'] + "_" + df_somatic_mutations['Trinucleotide'])

df_somatic_mutations.head()

Unnamed: 0,Mutation type,Trinucleotide,Biliary-AdenoCA::SP117655,Biliary-AdenoCA::SP117556,Biliary-AdenoCA::SP117627,Biliary-AdenoCA::SP117775,Biliary-AdenoCA::SP117332,Biliary-AdenoCA::SP117712,Biliary-AdenoCA::SP117017,Biliary-AdenoCA::SP117031,...,Uterus-AdenoCA::SP94540,Uterus-AdenoCA::SP95222,Uterus-AdenoCA::SP89389,Uterus-AdenoCA::SP90503,Uterus-AdenoCA::SP92460,Uterus-AdenoCA::SP92931,Uterus-AdenoCA::SP91265,Uterus-AdenoCA::SP89909,Uterus-AdenoCA::SP90629,Uterus-AdenoCA::SP95550
C>A_ACA,C>A,ACA,269,114,105,217,52,192,54,196,...,117,233,94,114,257,139,404,97,250,170
C>A_ACC,C>A,ACC,148,56,71,123,36,139,54,102,...,90,167,59,64,268,75,255,78,188,137
C>A_ACG,C>A,ACG,25,13,13,29,8,31,12,15,...,12,29,14,19,51,13,52,14,49,32
C>A_ACT,C>A,ACT,154,70,73,126,31,119,41,122,...,82,213,66,68,271,68,281,80,202,116
C>A_CCA,C>A,CCA,215,63,71,129,30,190,54,133,...,119,188,67,89,307,69,339,204,194,127


In [4]:
df_somatic_mutations.shape

(96, 2782)

In [5]:
cancer_types = np.array([ s.split(":")[0].split("-")[1] for s in df_somatic_mutations.columns[2:] ])
cancer_types

array(['AdenoCA', 'AdenoCA', 'AdenoCA', ..., 'AdenoCA', 'AdenoCA',
       'AdenoCA'], shape=(2780,), dtype='<U9')

In [6]:
pd.unique(cancer_types)

array(['AdenoCA', 'TCC', 'Benign', 'Epith', 'Osteosarc', 'DCIS',
       'LobularCA', 'GBM', 'Medullo', 'Oligo', 'PiloAstro', 'SCC',
       'ChRCC', 'RCC', 'HCC', 'BNHL', 'CLL', 'AML', 'MDS', 'MPN',
       'Endocrine', 'Melanoma', 'Leiomyo', 'Liposarc'], dtype='<U9')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_somatic_mutations.iloc[:, 2:].T, cancer_types)

In [8]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20082
[LightGBM] [Info] Number of data points in the train set: 2085, number of used features: 96
[LightGBM] [Info] Start training from score -5.339939
[LightGBM] [Info] Start training from score -0.781860
[LightGBM] [Info] Start training from score -3.311791
[LightGBM] [Info] Start training from score -5.077575
[LightGBM] [Info] Start training from score -3.483641
[LightGBM] [Info] Start training from score -4.087176
[LightGBM] [Info] Start training from score -6.543912
[LightGBM] [Info] Start training from score -3.531650
[LightGBM] [Info] Start training from score -5.339939
[LightGBM] [Info] Start training from score -4.116164
[LightGBM] [Info] Start training from score -2.166061
[LightGBM] [Info] Start training from score -5.339939
[LightGBM] [Info] Start training from score -5.244629
[LightGBM

In [9]:
print(models)

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LGBMClassifier                     0.82               0.57    None      0.81   
RandomForestClassifier             0.78               0.46    None      0.75   
ExtraTreesClassifier               0.77               0.44    None      0.74   
SGDClassifier                      0.73               0.41    None      0.71   
BaggingClassifier                  0.74               0.38    None      0.72   
KNeighborsClassifier               0.74               0.38    None      0.71   
NearestCentroid                    0.33               0.36    None      0.38   
LinearSVC                          0.75               0.36    None      0.70   
DecisionTreeClassifier             0.64               0.35    None      0.64   
GaussianNB                         0.29               0.35    None      0.29   
ExtraTreeClassifier                0.61 

# For Activities

In [10]:
df_activities = pd.read_csv("./project_data/activities/WGS/WGS_PCAWG.activities.csv")
df_activities.head()

Unnamed: 0,Cancer Types,Sample Names,Accuracy,SBS1,SBS2,SBS3,SBS4,SBS5,SBS6,SBS7a,...,SBS51,SBS52,SBS53,SBS54,SBS55,SBS56,SBS57,SBS58,SBS59,SBS60
0,Biliary-AdenoCA,SP117655,0.97,1496,1296,0,0,1825,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Biliary-AdenoCA,SP117556,0.96,985,0,0,0,922,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Biliary-AdenoCA,SP117627,0.97,1110,528,0,0,1453,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Biliary-AdenoCA,SP117775,0.99,1803,1271,0,0,2199,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Biliary-AdenoCA,SP117332,0.99,441,461,0,0,840,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#drop sample names and accuracy columns as they do not give any valuable information 
df_activities.drop(columns=["Sample Names", "Accuracy"], inplace=True)
df_activities.head()

Unnamed: 0,Cancer Types,SBS1,SBS2,SBS3,SBS4,SBS5,SBS6,SBS7a,SBS7b,SBS7c,...,SBS51,SBS52,SBS53,SBS54,SBS55,SBS56,SBS57,SBS58,SBS59,SBS60
0,Biliary-AdenoCA,1496,1296,0,0,1825,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Biliary-AdenoCA,985,0,0,0,922,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Biliary-AdenoCA,1110,528,0,0,1453,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Biliary-AdenoCA,1803,1271,0,0,2199,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Biliary-AdenoCA,441,461,0,0,840,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
cancer_types = np.array([ s.split(":")[0].split("-")[1] for s in df_activities["Cancer Types"]])
cancer_types

array(['AdenoCA', 'AdenoCA', 'AdenoCA', ..., 'AdenoCA', 'AdenoCA',
       'AdenoCA'], shape=(2780,), dtype='<U9')

In [13]:
pd.unique(cancer_types)

array(['AdenoCA', 'TCC', 'Benign', 'Epith', 'Osteosarc', 'DCIS',
       'LobularCA', 'GBM', 'Medullo', 'Oligo', 'PiloAstro', 'SCC',
       'ChRCC', 'RCC', 'HCC', 'BNHL', 'CLL', 'AML', 'MDS', 'MPN',
       'Endocrine', 'Melanoma', 'Leiomyo', 'Liposarc'], dtype='<U9')

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_activities.iloc[:, 3:], cancer_types)

In [15]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models_2, predictions_2 = clf.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1394
[LightGBM] [Info] Number of data points in the train set: 2085, number of used features: 24
[LightGBM] [Info] Start training from score -5.563083
[LightGBM] [Info] Start training from score -0.792398
[LightGBM] [Info] Start training from score -3.273076
[LightGBM] [Info] Start training from score -5.157617
[LightGBM] [Info] Start training from score -3.365858
[LightGBM] [Info] Start training from score -4.208537
[LightGBM] [Info] Start training from score -6.949377
[LightGBM] [Info] Start training from score -3.437832
[LightGBM] [Info] Start training from score -5.445300
[LightGBM] [Info] Start training from score -4.208537
[LightGBM] [Info] Start training from score -2.149463
[LightGBM] [Info] Start training from score -5.339939
[LightGBM] [Info] Start training from score -4.934474
[LightGBM]

In [16]:
print(models_2)

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
DecisionTreeClassifier             0.64               0.37    None      0.64   
ExtraTreesClassifier               0.68               0.36    None      0.67   
ExtraTreeClassifier                0.63               0.35    None      0.63   
RandomForestClassifier             0.69               0.34    None      0.67   
GaussianNB                         0.31               0.33    None      0.37   
BaggingClassifier                  0.67               0.33    None      0.66   
KNeighborsClassifier               0.68               0.32    None      0.65   
Perceptron                         0.56               0.31    None      0.57   
LGBMClassifier                     0.66               0.30    None      0.64   
SGDClassifier                      0.63               0.27    None      0.58   
QuadraticDiscriminantAnalysis      0.37 

# For the WES data 

# Somatic mutations

In [17]:
df_somatic_mutations_wes = pd.read_csv("./project_data/catalogs/WES/WES_TCGA.96.csv")
df_somatic_mutations_wes = df_somatic_mutations_wes.set_index (df_somatic_mutations_wes['Mutation type'] + "_" + df_somatic_mutations_wes['Trinucleotide'])

df_somatic_mutations_wes.head()


Unnamed: 0,Mutation type,Trinucleotide,AML::TCGA-AB-2802-03B-01W-0728-08,AML::TCGA-AB-2803-03B-01W-0728-08,AML::TCGA-AB-2804-03B-01W-0728-08,AML::TCGA-AB-2805-03B-01W-0728-08,AML::TCGA-AB-2806-03B-01W-0728-08,AML::TCGA-AB-2807-03B-01W-0728-08,AML::TCGA-AB-2808-03B-01W-0728-08,AML::TCGA-AB-2809-03D-01W-0755-09,...,Eye-Melanoma::TCGA-WC-A885-01A-11D-A39W-08,Eye-Melanoma::TCGA-WC-A888-01A-11D-A39W-08,Eye-Melanoma::TCGA-WC-A88A-01A-11D-A39W-08,Eye-Melanoma::TCGA-WC-AA9A-01A-11D-A39W-08,Eye-Melanoma::TCGA-WC-AA9E-01A-11D-A39W-08,Eye-Melanoma::TCGA-YZ-A980-01A-11D-A39W-08,Eye-Melanoma::TCGA-YZ-A982-01A-11D-A39W-08,Eye-Melanoma::TCGA-YZ-A983-01A-11D-A39W-08,Eye-Melanoma::TCGA-YZ-A984-01A-11D-A39W-08,Eye-Melanoma::TCGA-YZ-A985-01A-11D-A39W-08
C>A_ACA,C>A,ACA,0,0,0,0,4,0,2,0,...,1,0,0,0,0,0,0,0,0,0
C>A_ACC,C>A,ACC,0,2,0,0,0,1,3,0,...,0,0,0,0,0,0,0,1,0,0
C>A_ACG,C>A,ACG,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,1,0,0
C>A_ACT,C>A,ACT,0,0,0,1,2,1,3,0,...,0,0,0,0,0,0,1,0,1,0
C>A_CCA,C>A,CCA,0,0,0,0,20,1,3,0,...,0,0,0,0,1,0,0,0,0,0


In [37]:
df_somatic_mutations_wes.iloc[:, 2:]
type(df_somatic_mutations_wes)

pandas.core.frame.DataFrame

In [40]:
cancer_types = np.array([
    s.split("::")[0]
    for s in df_somatic_mutations_wes.columns[2:]
])

In [44]:
pd.unique(cancer_types)

array(['AML', 'Adrenal-neoplasm', 'Transitional-cell-carcinoma', 'DLBC',
       'Breast-cancer', 'Cervix-CA', 'Biliary-AdenoCa',
       'ColoRect-AdenoCa', 'Eso-AdenoCa', 'CNS-GBM', 'Head-SCC',
       'Kidney-ChRCC', 'Kidney-RCC', 'Kidney-Papillary', 'Liver-HCC',
       'Lung-AdenoCa', 'Lung-SCC', 'Lymph-BNHL',
       'Mesothelium-Mesothelioma', 'Ovary-AdenoCa', 'Panc-AdenoCa',
       'Pheochromocytoma', 'Prost-AdenoCa', 'Sarcoma', 'Skin-Melanoma',
       'Stomach-AdenoCa', 'Testis-Ca', 'Thymoma', 'Thy-AdenoCa', 'UCS',
       'Uterus-AdenoCa', 'Eye-Melanoma'], dtype='<U27')

In [46]:
X = df_somatic_mutations_wes.iloc[:, 2:].T
y = cancer_types

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [47]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models3, predictions3 = clf.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4767
[LightGBM] [Info] Number of data points in the train set: 7119, number of used features: 96
[LightGBM] [Info] Start training from score -4.404614
[LightGBM] [Info] Start training from score -4.636416
[LightGBM] [Info] Start training from score -5.436535
[LightGBM] [Info] Start training from score -2.315166
[LightGBM] [Info] Start training from score -3.261051
[LightGBM] [Info] Start training from score -3.582256
[LightGBM] [Info] Start training from score -2.891637
[LightGBM] [Info] Start training from score -2.912098
[LightGBM] [Info] Start training from score -3.980173
[LightGBM] [Info] Start training from score -4.881538
[LightGBM] [Info] Start training from score -2.976120
[LightGBM] [Info] Start training from score -5.344162
[LightGBM] [Info] Start training from score -3.666516
[LightGBM]

In [48]:
print(models3)

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LGBMClassifier                     0.48               0.36    None      0.47   
RandomForestClassifier             0.45               0.32    None      0.44   
LinearSVC                          0.40               0.32    None      0.39   
ExtraTreesClassifier               0.43               0.30    None      0.41   
BaggingClassifier                  0.39               0.30    None      0.39   
LogisticRegression                 0.39               0.29    None      0.38   
BernoulliNB                        0.31               0.25    None      0.30   
CalibratedClassifierCV             0.37               0.25    None      0.36   
NearestCentroid                    0.24               0.24    None      0.26   
PassiveAggressiveClassifier        0.32               0.22    None      0.29   
SGDClassifier                      0.28 