### Read expression data

In [1]:
import os

import pandas as pd

from genometools.expression import ExpMatrix

cls1_expression_file = os.path.join('..', 'data', 'brca_expression_5yr_survive.tsv')
cls2_expression_file = os.path.join('..', 'data', 'brca_expression_5yr_dead.tsv')

num_variable = 2000

matrix1 = ExpMatrix.read_tsv(cls1_expression_file)
matrix2 = ExpMatrix.read_tsv(cls2_expression_file)

matrix = pd.concat([matrix1, matrix2], axis=1)

# filter 2,000 most variable genes
matrix = matrix.filter_variance(num_variable)
matrix1 = matrix1.loc[matrix.genes]
matrix2 = matrix2.loc[matrix.genes]
print(matrix.shape, matrix1.shape, matrix2.shape)

[2016-11-03 14:04:45] INFO: Selected the 2000 most variable genes (excluded 89.9% of genes, representing 58.5% of total variance).
(2000, 153) (2000, 123) (2000, 30)


### Split the data into training and test set

In [2]:
# for the test set, we're selecting five samples from each group at random
import pandas as pd
import numpy as np

# set seed for random number generator
seed = 0
np.random.seed(seed)

sel1 = np.random.choice(matrix1.n, size=41, replace=False)
sel2 = np.random.choice(matrix2.n, size=10, replace=False)

test = pd.concat([matrix1.iloc[:, sel1], matrix2.iloc[:, sel2]], axis=1)

# drop the samples from the tables
matrix1.drop(matrix1.columns[sel1], axis=1, inplace=True)
matrix2.drop(matrix2.columns[sel2], axis=1, inplace=True)
matrix = pd.concat([matrix1, matrix2], axis=1)
print(matrix1.shape, matrix2.shape, matrix.shape, test.shape)

(2000, 82) (2000, 20) (2000, 102) (2000, 51)


In [3]:
# Running random forest on the training data

num_estimators = 1000

seed = 0

model = RandomForestClassifier(n_estimators=num_estimators, class_weight='balanced_subsample',
                              random_state=seed, oob_score=True)

y = np.r_[matrix1.shape[1]*[0], matrix2.shape[1]*[1]]

print(y.size, y)
print(matrix.X.T.shape)

model.fit(matrix.X.T, y)

#print(model.oob_score_)

NameError: name 'RandomForestClassifier' is not defined

In [None]:
# predict and plot accuracy
import numpy as np

print(model.predict(test.X.T))

y_pred = model.predict(test.X.T)

print(y_pred)

### Generating the splits

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(matrix.X.T, y):
    #print("TRAIN:", train_index, "TEST:", test_index)
    
    #X_train, X_test = X[train_index], X[test_index]
    #y_train, y_test = y[train_index], y[test_index]

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

num_estimators = 1000

seed = 0

model = RandomForestClassifier(n_estimators=num_estimators, class_weight='balanced_subsample',
                              random_state=seed, oob_score=True)

y = np.r_[matrix1.shape[1]*[0], matrix2.shape[1]*[1]]

print(y)

model.fit(matrix.X.T, y)

In [None]:
# the model accuracy
print(model.oob_score_)

In [None]:
print(matrix1.shape[1]/float(matrix.shape[1]))