In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
from numpy import genfromtxt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load data: read each CSV (paths are relative to the working directory)
# into its own DataFrame, in the order listed.
(
    gtex_gene_expression,
    genes,
    gtex_donor,
    gtex_gene_model,
    gtex_sample,
    gtex_tissue,
    tcga,
) = map(pd.read_csv, (
    'gtex_gene_expression.csv',
    'genes.csv',
    'gtex_donor.csv',
    'gtex_gene_model.csv',
    'gtex_sample.csv',
    'gtex_tissue.csv',
    'tcga.csv',
))
# Intentionally not loaded:
#gtex_sample_expression = pd.read_csv('gtex_sample_expression.csv')

In [3]:
# Preprocess cancer data: for each organ, reshape the long `tcga` table into a
# wide table with one row per sample_number and one column per gene_id
# (values are fpkm_expression), then append a constant 'cancer' column of ones.
organs = [
    'Breast', 'HeadAndNeck', 'Kidney', 'Brain',
    'Lung', 'Prostate', 'Thyroid', 'Uterus',
]
organ_cancer = {}
for organ in organs:
    expression_wide = (
        tcga.loc[tcga['organ'] == organ]
        .pivot(index='sample_number', columns='gene_id', values='fpkm_expression')
        .sort_index()
    )
    # One label value (1.0) per row of the pivoted table.
    expression_wide['cancer'] = np.ones(len(expression_wide))
    organ_cancer[organ] = expression_wide

In [4]:
# Description of cancer data: print the (rows, columns) shape of each
# organ's pivoted table, in the order given by `organs`.
for organ in organs:
    table_shape = organ_cancer[organ].shape
    print(organ, ": ", table_shape)

Breast :  (1222, 2843)
HeadAndNeck :  (546, 2843)
Kidney :  (611, 2843)
Brain :  (529, 2843)
Lung :  (594, 2843)
Prostate :  (551, 2843)
Thyroid :  (568, 2843)
Uterus :  (587, 2843)


In [6]:
# Select the Brain table for modelling. Take a copy so that columns added to
# `df` afterwards (such as 'is_train') do not also appear in
# organ_cancer['Brain'], which would make re-running earlier cells give
# different shapes.
df = organ_cancer['Brain'].copy()

In [8]:
# Train/test split: each row is flagged as training when a uniform [0, 1)
# draw is <= 0.75, so roughly three quarters of the rows end up in `train`.
# The generator is seeded so the split is identical on every run of this cell.
SPLIT_SEED = 0
split_rng = np.random.RandomState(SPLIT_SEED)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= .75
train, test = df[df['is_train']], df[~df['is_train']]
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 391
Number of observations in the test data: 138


In [13]:
# Features: every column of `df` except the label ('cancer') and the split
# flag ('is_train'). The columns are removed by name rather than by position,
# so the selection stays correct if the split flag has not been added yet or
# the column order changes.
features = df.columns.drop(['cancer', 'is_train'], errors='ignore')

In [14]:
# Display the selected feature column labels as the cell output.
features

Index(['ENSG00000005022', 'ENSG00000025772', 'ENSG00000031691',
       'ENSG00000044012', 'ENSG00000048544', 'ENSG00000068985',
       'ENSG00000070031', 'ENSG00000071677', 'ENSG00000073734',
       'ENSG00000076641',
       ...
       'ENSG00000273254', 'ENSG00000273302', 'ENSG00000273305',
       'ENSG00000273312', 'ENSG00000273320', 'ENSG00000273388',
       'ENSG00000273403', 'ENSG00000273423', 'ENSG00000273448',
       'ENSG00000273483'],
      dtype='object', name='gene_id', length=2842)

In [23]:
# Labels: integer codes for the distinct values of train['cancer'], numbered
# in order of first appearance; only the codes array is kept.
y = train['cancer'].factorize()[0]

In [24]:
# Create a random forest classifier. By convention, clf means 'classifier'.
clf = RandomForestClassifier(n_jobs=2, random_state=0, class_weight=None)

# L1-penalised logistic regression; it is created here but not fitted in
# this cell. C used to be an undefined name (NameError on a fresh kernel);
# 1.0 is scikit-learn's default inverse regularisation strength. The solver
# is named explicitly because not every solver supports the L1 penalty.
clf_l1_LR = LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=0.01)

# Fit the forest on the training feature columns against the label codes in y.
clf.fit(train[features], y)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [None]:
# Predict on the held-out rows and translate the predicted integer codes back
# to the original 'cancer' values so they can be compared with test['cancer'].
# Assumes `y` holds the pd.factorize codes of train['cancer'], so the uniques
# of that same factorisation are the lookup table.
label_values = np.asarray(pd.factorize(train['cancer'])[1])
preds = label_values[clf.predict(test[features])]
pd.crosstab(test['cancer'], preds, rownames=['Actual'], colnames=['Predicted'])

# Pair each feature column name with its importance in the fitted forest.
list(zip(train[features], clf.feature_importances_))