# Cancer classification based on gene expression
 
- https://www.nature.com/articles/nmeth.2810#Sec15
- further reading: https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/

In [79]:
import numpy as np
import pandas as pd
import requests, zipfile, io, os
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import seaborn as sns

## Getting the data - Option 1: start from scratch

In [6]:
# download Dataset with Python

data_directory = "cancer_datasets_test"

headers = {'Connection': 'keep-alive',
           'Host': 'static-content.springer.com',
           'Referer': 'https://www.nature.com/',
           'Upgrade-Insecure-Requests': '1',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
           'Accept-Language': 'en-US,en;q=0.5'}

url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.2810/MediaObjects/41592_2014_BFnmeth2810_MOESM206_ESM.zip'
request = requests.get(url, headers= headers, stream=True)
zip_reader = zipfile.ZipFile(io.BytesIO(request.content))
zip_reader.extractall(data_directory)


In [32]:
# Create dataframes
data_directory = "cancer_datasets" 

cancer_types = ["Breast", "Colon", "GBM", "Lung"]

breast_gene_exp_df = pd.read_csv(
    "{}/Breast/BREAST_Gene_Expression.txt".format(data_directory),
    delimiter=r"\s+", index_col=0)
colon_gene_exp_df = pd.read_csv(
    "{}/Colon/COLON_Gene_Expression.txt".format(data_directory), 
    delimiter=r"\s+", index_col=0) 
gbm_gene_exp_df = pd.read_csv(
    "{}/GBM/GLIO_Gene_Expression.txt".format(data_directory), 
    delimiter=r"\s+", index_col=0) 
lung_gene_exp_df = pd.read_csv(
    "{}/Lung/LUNG_Gene_Expression.txt".format(data_directory), 
    delimiter=r"\s+", index_col=0) 

breast_gene_exp_df

Unnamed: 0,TCGA-A1-A0SD-01A-11R-A115-07,TCGA-A2-A04N-01A-11R-A115-07,TCGA-A2-A04U-01A-11R-A115-07,TCGA-A2-A04W-01A-31R-A115-07,TCGA-A2-A0CL-01A-11R-A115-07,TCGA-A2-A0CS-01A-11R-A115-07,TCGA-A2-A0CV-01A-31R-A115-07,TCGA-A2-A0D3-01A-11R-A115-07,TCGA-A2-A0ES-01A-11R-A115-07,TCGA-A2-A0EW-01A-21R-A115-07,...,TCGA-E2-A15E-01A-11R-A12D-07,TCGA-E2-A15F-01A-11R-A115-07,TCGA-E2-A15G-01A-11R-A12D-07,TCGA-E2-A15H-01A-11R-A12D-07,TCGA-E2-A15M-01A-11R-A12D-07,TCGA-E2-A15O-01A-11R-A115-07,TCGA-E2-A15P-01A-11R-A115-07,TCGA-E2-A15R-01A-11R-A115-07,TCGA-E2-A15S-01A-11R-A115-07,TCGA-E2-A15T-01A-11R-A115-07
AKAP5,-0.856922,-0.559035,-0.266071,-0.261968,0.127010,2.542936,0.088850,0.401099,-0.063786,-1.006276,...,-0.090046,-1.356684,0.735915,0.312471,0.739197,-0.303409,1.196287,1.283683,0.670265,0.905785
G30,0.513849,0.871548,-0.160577,-0.673417,-0.209375,-2.064623,3.273241,-0.125590,0.914361,0.146482,...,-0.160117,2.053290,-1.383291,-0.083237,-0.155974,-0.703341,0.820908,0.001009,-0.473621,0.657941
TMC3,-0.942729,-0.448162,-0.362206,-0.103697,0.337629,0.793708,-0.103697,0.748806,-0.551437,-0.628412,...,0.648738,0.398568,-1.022911,-1.332737,-0.215953,0.266426,0.961129,-1.051136,-0.280099,0.048971
OR51A2,0.614422,0.810584,-1.234118,0.039013,-0.640247,-0.652555,0.611344,-0.552551,0.929819,-0.781791,...,-0.237922,-3.891153,0.105169,1.266757,0.382873,0.359795,-1.055649,-0.578706,0.289792,0.227482
HOXD4,0.588492,-0.382428,0.908324,-0.715968,0.198601,0.453705,-0.071734,-1.150788,0.728609,0.866442,...,0.175756,1.419295,-1.306897,0.578592,0.577831,0.063053,-0.324554,-0.312370,-0.271248,-0.961934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GOT1L1,-0.234977,-0.547133,1.052848,-0.258989,0.053941,-0.600634,0.181360,0.612413,2.651495,-0.437917,...,-2.012928,-0.044473,1.125185,1.053278,-0.128860,-0.028551,-0.993678,0.408312,0.210406,-0.945654
TRIM54,-0.244242,0.025004,0.323098,0.835949,-1.257923,-0.220203,-0.864471,0.049845,-0.432555,0.557888,...,-1.399759,-0.760298,-0.448581,0.424867,1.290302,1.701384,-1.584866,0.166038,-0.448581,-0.051924
TRIT1,-0.348131,1.158526,3.643876,0.616872,-0.281647,-0.683487,-0.200497,-0.003976,-0.322711,-0.484034,...,0.778195,0.438928,-0.755838,-1.238829,-0.999289,0.535722,0.713666,-1.905630,0.714644,0.243386
LRRC47,-0.461579,-0.314814,-0.719256,2.701133,-0.710188,-1.972064,-0.026724,0.539272,0.857636,0.286339,...,1.056300,0.853590,0.696082,0.651857,0.035219,0.099813,1.314534,-0.324300,0.615724,1.508733


In [55]:
# Not all genes are found in all expression sets - only overlap can be used
genes_breast = set(breast_gene_exp_df.index)
genes_colon = set(colon_gene_exp_df.index)
genes_gdm = set(gbm_gene_exp_df.index)
genes_lung = set(lung_gene_exp_df.index)
genes_found_in_all_sets = genes_breast.intersection(
    genes_colon).intersection(
    genes_gdm).intersection(
    genes_lung)
len(genes_found_in_all_sets)

11925

In [87]:
# Create universal dataframe with all cancer types

exp_df = pd.DataFrame()
for df, cancer_type in zip([breast_gene_exp_df, colon_gene_exp_df, 
                            gbm_gene_exp_df,lung_gene_exp_df],
                            cancer_types):
    transposed_df = pd.DataFrame(df.reindex(genes_found_in_all_sets).transpose())
    transposed_df["cancer_type"] = cancer_type
    exp_df = exp_df.append(transposed_df)

exp_df.to_csv("cancer_cells_expression.csv.bz2")

## Getting the data - Option 2: read the processed file

In [86]:
# The file can be fond in the git repo
exp_df = pd.read_csv("cancer_cells_expression.csv.bz2")

## Preparing the data

In [59]:
X = exp_df.drop(["cancer_type"], axis=1).values
targets = exp_df["cancer_type"].values

In [60]:
X.shape

(518, 11925)

In [61]:
targets.shape

(518,)

In [68]:
targets = LabelEncoder().fit_transform(targets)

In [73]:
targets

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [72]:
X_scaled = StandardScaler().fit_transform(X)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, targets, random_state=42)

In [74]:
model = RandomForestClassifier()

In [78]:
model.fit(X_train, y_train).score(X_test, y_test)

0.8923076923076924

In [85]:
# feature selection with SVC

#model =  SVC(gamma='auto')
#data_dict = {}

# for K in range(1000, len(overlap_genes), 100):
     #selector = SelectKBest(score_func=f_classif, k=K)
    # selected_input = selector.fit_transform(X_scaled, targets.ravel())
    #best_genes_index = selector.get_support(indices=True)
    #best_genes =[list(overlap_genes)[i] for i in best_genes_index]
#    X_train, X_test, y_train, y_test = train_test_split(selected_input,
#                                                    target_array.ravel(),
#                                                    test_size=0.2,
#                                                    random_state=42)
#    model.fit(X_train, y_train)
#    mean_accuracy = model.score(X_test, y_test)
#    data_dict[K] = mean_accuracy
    

#sns.lineplot(list(data_dict.keys()), list(data_dict.values()))
    

In [252]:
# benchmark with dummy classifier

dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test, y_test)

0.25961538461538464