# 1. Read data

In [7]:
import numpy as np
import zipfile
import pandas as pd

## 1.1. GSE2034

In [15]:
# Load the zipped, tab-separated GSE2034 table (breast cancer)
raw_gse2034_df = pd.read_csv(
    'Data/GSE2034.zip',
    sep='\t',
    header=0,
    quotechar='"',
    compression='zip',
)

# Work on a copy so the table as read from disk stays available
gse2034_df = raw_gse2034_df.copy()

# 'Class' is split on '#' into a gene part and a KEGG-ID part;
# the combined column is then discarded
gene_and_kegg = gse2034_df['Class'].str.split('#', expand=True)
gse2034_df[['Gene', 'KEGG-ID']] = gene_and_kegg
gse2034_df = gse2034_df.drop(columns='Class')

# Bring the two freshly appended columns (the last two) to the front
cols = list(gse2034_df.columns)
cols = cols[-2:] + cols[:-2]
gse2034_df = gse2034_df[cols]
#gse2034_df

In [16]:
# Change column names (estrogen receptor):
# every column after the first two whose name starts with "ERpos" / "ERneg"
# is renamed to exactly that prefix; all other names are kept as they are.
#
# A new list of column names is assigned instead of writing into
# `labels.values`: a pandas Index is meant to be immutable, and mutating its
# backing array only takes effect when the slice happens to share memory with
# the frame's own columns.
def _er_status(column_name):
    """Return "ERpos"/"ERneg" when column_name starts with it, else column_name unchanged."""
    for status in ("ERpos", "ERneg"):
        if column_name.startswith(status):
            return status
    return column_name


gse2034_df.columns = (
    gse2034_df.columns[:2].tolist()
    + [_er_status(name) for name in gse2034_df.columns[2:]]
)

# Kept for later cells: the (now renamed) sample labels
labels = gse2034_df.columns[2:]

In [17]:
# Flip the table: afterwards every row is one sample (class: ERpos or ERneg)
# and every column is one gene
genes = gse2034_df['Gene'].copy()  # remember the gene names before the column is sliced away
gse2034_df = gse2034_df.drop(columns='KEGG-ID')
# Skip the first remaining column (expected to be 'Gene') and transpose the rest
gse2034_df = gse2034_df.iloc[:, 1:].T
gse2034_df.columns = genes.tolist()
#gse2034_df

In [18]:
# Because one gene might correspond to more than one KEGG-ID, the same gene
# name can label several columns. Those columns are collapsed into a single
# column holding their mean.
# The frame is transposed, grouped on its index and transposed back because
# `DataFrame.groupby(..., axis=1)` is deprecated since pandas 2.1.
gse2034_df = gse2034_df.T.groupby(level=0).mean().T

# Per-sample mean over all gene columns, stored as 'noProbe' for future use.
# A 'noProbe' column left over from an earlier run of this cell is excluded,
# so re-running the cell does not fold the old value into the new mean.
gse2034_df['noProbe'] = gse2034_df.drop(columns='noProbe', errors='ignore').mean(axis=1)

genes = gse2034_df.columns
gse2034_df

Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-HSAC07/X00351_M_at,AFFX-HUMGAPDH/M33197_3_at,AFFX-HUMGAPDH/M33197_5_at,AFFX-HUMGAPDH/M33197_M_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMISGF3A/M97935_5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMRGE/M10098_5_at,noProbe
ERpos,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,49.9,...,18021.8,19418.9,7449.3,14027.7,2908.8,29.8,403.9,389.9,802.5,951.084183
ERpos,6520.9,112.5,189.8,2083.3,145.8,802.8,278.4,28.3,449.0,122.9,...,1413.2,19474.9,408.7,2021.0,6087.7,16.1,79.3,274.5,247.8,1003.982691
ERpos,5285.7,178.4,269.7,1203.4,42.5,557.5,183.3,56.4,101.9,85.9,...,11882.3,26784.7,9109.6,20412.1,4264.5,96.3,837.3,937.3,2297.5,1034.462272
ERneg,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,90.7,...,38913.5,23342.8,9439.4,18474.2,3496.5,71.1,1024.2,982.5,807.1,1069.874739
ERpos,4263.6,417.7,327.1,1043.3,69.2,653.2,185.8,21.8,3629.3,96.0,...,30323.1,29313.6,8671.0,22684.5,3952.3,36.6,969.1,656.3,141.4,1017.023462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,3066.9,265.5,347.9,1127.4,47.0,583.9,138.2,16.1,577.3,44.7,...,12251.5,24463.5,5378.0,14403.8,3991.3,46.7,399.2,329.5,413.1,1009.287513
ERpos,2773.0,209.8,226.7,1071.8,45.1,859.9,121.0,24.8,935.6,78.8,...,13415.3,14976.3,1593.7,5722.3,1361.4,11.1,143.1,281.9,231.2,1001.443436
ERpos,2984.3,160.0,252.9,1178.5,146.3,664.2,183.3,49.6,443.5,74.0,...,11946.3,22092.0,3503.0,9549.3,1917.7,33.1,119.7,295.9,269.2,1064.410080
ERneg,3540.0,285.7,135.1,1256.7,75.9,603.1,125.0,72.9,73.5,126.1,...,22759.9,15283.2,6157.2,12799.3,1880.2,8.4,533.8,491.1,113.0,1030.942125


## 1.2. GraphiteWeb score

In [19]:
# Read GraphiteWeb.csv out of the results archive.
# Both the archive and the member handle are opened in `with` blocks so they
# are closed as soon as the CSV has been parsed.
with zipfile.ZipFile('Results/GraphiteWeb.zip') as zf:
    with zf.open('GraphiteWeb.csv') as graphiteweb_csv:
        graphiteweb_df = pd.read_csv(graphiteweb_csv)

# Reuse the sample labels of the expression table as row labels.
# This assignment requires both frames to have the same number of rows.
graphiteweb_df.index = gse2034_df.index
graphiteweb_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41598,41599,41600,41601,41602,41603,41604,41605,41606,41607
ERpos,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.462388,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.813041
ERpos,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.462388,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.764651
ERpos,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.462388,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.764651
ERneg,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.449034,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.813041
ERpos,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.463474,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.764651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.462388,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.813041
ERpos,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.462388,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.813041
ERpos,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.462388,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.813041
ERneg,12.044778,6.813041,3.906521,3.906521,17.857819,7.975649,11.462388,3.906521,28.321294,2.162608,...,5.650433,2.743912,3.906521,3.906521,3.325216,2.743912,6.813041,6.231737,6.813041,6.813041


In [38]:
# Benjamini and Hochberg FDR at alpha=0.05
from statsmodels.stats.multitest import multipletests

# One array of BH-adjusted values per sample (row of graphiteweb_df).
# Each iteration reads its own row; indexing with a fixed 0 would repeat the
# first sample's result for every row.
# NOTE(review): the values displayed for graphiteweb_df are larger than 1, so
# they do not look like raw p-values - confirm whether they need converting
# before this correction is applied.
y = [
    multipletests(pvals=graphiteweb_df.iloc[sample], alpha=0.05, method="fdr_bh")[1]
    for sample in range(graphiteweb_df.shape[0])
]

In [41]:
# Stack the per-sample arrays into one frame that carries the same row labels
# as graphiteweb_df, then display it
y_def = pd.DataFrame(data=y, index=graphiteweb_df.index)
y_def

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41598,41599,41600,41601,41602,41603,41604,41605,41606,41607
ERpos,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERpos,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERpos,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERneg,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERpos,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERpos,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERpos,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERpos,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ERneg,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Instantiating LabelEncoder
le = LabelEncoder()

# Using LabelEncoder to do the numeric transformation of the class labels.
# The encoder is fitted exactly once: fitting it a second time on the already
# encoded integers would overwrite `le.classes_` with '0'/'1' and lose the
# mapping back to the original label names.
y_def.index = le.fit_transform(y_def.index.astype(str))

# Split data: 70 % training / 30 % testing, with a fixed seed for reproducibility
training_data, testing_data, training_labels, testing_labels = train_test_split(
    y_def, y_def.index, test_size=0.3, random_state=0
)

In [50]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import tree
from sklearn.preprocessing import LabelEncoder

# Create Decision Tree classifier object.
# A fixed random_state (the same seed as the train/test split) makes the
# fitted tree reproducible across kernel restarts.
dt_clf = DecisionTreeClassifier(random_state=0)

# Train Decision Tree classifier
dt_clf = dt_clf.fit(training_data, training_labels)

# Predict the response for test dataset
y_pred = dt_clf.predict(testing_data)

# Share of test samples whose predicted label matches the true label
decision_tree_score = metrics.accuracy_score(testing_labels, y_pred)
print("Accuracy: {} %".format(round((decision_tree_score*100),2)))

Accuracy: 80.23 %


In [51]:
# Positions of the features the fitted tree gave an importance strictly above zero
features = [
    position
    for position, importance in enumerate(dt_clf.feature_importances_)
    if importance > 0.
]
decision_tree_features = features
print("Significant subpaths: "+str(decision_tree_features))

Significant subpaths: []


In [52]:
# get the text representation
text_representation = tree.export_text(dt_clf)
text_representation

'|--- class: 1\n'