# per Tissue Analysis and Models
*Ben Imlay*

## Database Setup

In [272]:
import pandas as pd
from pathlib import Path
from os import listdir
import sklearn.model_selection
import sklearn.feature_selection
TISSUE='Liver'

In [273]:
data_dir=Path("data")
tissue_dir=Path("tissue-specific")
manifest={"data":"All_Tissue_Site_Details.combined.reads.gct",
              "sample_meta":"GTEx_v7_Annotations_SampleAttributesDS.txt",
              "subject_meta":"GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
               "merged_meta":"merged_meta.tsv"}
meta=pd.read_csv(data_dir/manifest['merged_meta'],sep="\t",dtype={'SMUBRID':object,'SEX':object,'DTHHRDY':object})
meta=meta[~(meta['AGE'].isnull())] # removes all samples without age
meta.iloc[0:3,:]
meta.shape

(10346, 67)

## Avaiable Tissues

In [274]:
# To do python plots of counts
infiles=listdir(data_dir/tissue_dir)
TISSUE_files=[f for f in infiles if  TISSUE in f]
TISSUE_files

['Liver_c.tsv', 'Liver_cpm.tsv', 'Liver_lcpm.tsv']

## Reading CPM and LCPM files.

In [275]:
cpm=pd.read_csv(data_dir/tissue_dir/TISSUE_files[1],sep="\t",index_col=0)
lcpm=pd.read_csv(data_dir/tissue_dir/TISSUE_files[0],sep="\t",index_col=0)

In [276]:
cpm.iloc[0:5,0:20]

Unnamed: 0,ENSG00000223972,ENSG00000227232,ENSG00000243485,ENSG00000237613,ENSG00000268020,ENSG00000240361,ENSG00000186092,ENSG00000238009,ENSG00000233750,ENSG00000237683,ENSG00000268903,ENSG00000239906,ENSG00000241860,ENSG00000222623,ENSG00000241599,ENSG00000228463,ENSG00000237094,ENSG00000250575,ENSG00000233653,ENSG00000224813
GTEX-11DXY-0526-SM-5EGGQ,0.142941,9.691384,0.0,0.085764,0.0,0.114353,0.0,0.171529,0.714704,8.2048,0.0,0.0,0.857645,0.0,0.028588,2.1727,0.343058,0.0,0.200117,0
GTEX-11DXZ-0126-SM-5EGGY,0.175082,7.63358,0.140066,0.0,0.070033,0.140066,0.070033,0.49023,0.665312,1.785838,0.0,0.0,0.420197,0.0,0.035016,0.770361,0.420197,0.035016,0.385181,0
GTEX-11EQ9-0526-SM-5A5JZ,0.0,3.867529,0.021606,0.064819,0.021606,0.0,0.021606,0.043213,0.237669,3.370584,0.043213,0.021606,0.453732,0.0,0.0,0.345701,0.216063,0.021606,0.064819,0
GTEX-11GSP-0626-SM-5986T,0.13369,14.50537,0.033423,0.100268,0.0,0.13369,0.100268,0.13369,0.40107,5.046799,0.033423,0.167113,0.66845,0.0,0.100268,1.102943,0.969253,0.033423,0.26738,0
GTEX-11NUK-1226-SM-5P9GM,0.174075,9.008397,0.152316,0.021759,0.043519,0.043519,0.108797,0.130556,0.652782,4.373642,0.021759,0.065278,0.652782,0.0,0.021759,1.784272,0.674542,0.043519,0.36991,0


In [277]:
lcpm.iloc[0:5,0:20]

Unnamed: 0,ENSG00000223972,ENSG00000227232,ENSG00000243485,ENSG00000237613,ENSG00000268020,ENSG00000240361,ENSG00000186092,ENSG00000238009,ENSG00000233750,ENSG00000237683,ENSG00000268903,ENSG00000239906,ENSG00000241860,ENSG00000222623,ENSG00000241599,ENSG00000228463,ENSG00000237094,ENSG00000250575,ENSG00000233653,ENSG00000224813
GTEX-11DXY-0526-SM-5EGGQ,5,339,0,3,0,4,0,6,25,287,0,0,30,0,1,76,12,0,7,0
GTEX-11DXZ-0126-SM-5EGGY,5,218,4,0,2,4,2,14,19,51,0,0,12,0,1,22,12,1,11,0
GTEX-11EQ9-0526-SM-5A5JZ,0,179,1,3,1,0,1,2,11,156,2,1,21,0,0,16,10,1,3,0
GTEX-11GSP-0626-SM-5986T,4,434,1,3,0,4,3,4,12,151,1,5,20,0,3,33,29,1,8,0
GTEX-11NUK-1226-SM-5P9GM,8,414,7,1,2,2,5,6,30,201,1,3,30,0,1,82,31,2,17,0


### Merging metadata

In [278]:
print(meta['SMTSD'].unique())

['Adipose - Subcutaneous' 'Adipose - Visceral (Omentum)' 'Adrenal Gland'
 'Artery - Aorta' 'Artery - Coronary' 'Artery - Tibial' 'Bladder'
 'Brain - Cerebellum' 'Brain - Cortex' 'Breast - Mammary Tissue'
 'Cells - EBV-transformed lymphocytes' 'Cells - Transformed fibroblasts'
 'Cervix - Ectocervix' 'Cervix - Endocervix' 'Colon - Sigmoid'
 'Colon - Transverse' 'Esophagus - Gastroesophageal Junction'
 'Esophagus - Mucosa' 'Esophagus - Muscularis' 'Fallopian Tube'
 'Heart - Atrial Appendage' 'Heart - Left Ventricle' 'Kidney - Cortex'
 'Liver' 'Lung' 'Minor Salivary Gland' 'Muscle - Skeletal'
 'Nerve - Tibial' 'Ovary' 'Pancreas' 'Pituitary' 'Prostate'
 'Skin - Not Sun Exposed (Suprapubic)' 'Skin - Sun Exposed (Lower leg)'
 'Small Intestine - Terminal Ileum' 'Spleen' 'Stomach' 'Testis' 'Thyroid'
 'Uterus' 'Vagina' 'Whole Blood']


In [281]:
#tissue_meta = meta[pd.np.where(meta['SMTSD'].str.contains(TISSUE))]
#print(tissue_meta)
tissue_meta=meta[meta['SMTS']==TISSUE]
tissue_meta.iloc[0:5]

Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS,SUBJID,SEX,AGE,DTHHRDY
6536,GTEX-11DXY-0526-SM-5EGGQ,1.0,B1,"2 pieces, congestion",6.8,Liver,Liver,1114,891.0,1149.0,...,50.825165,0.015545,0.993035,,0.0,49.517372,GTEX-11DXY,1.0,60-69,2.0
6537,GTEX-11DXZ-0126-SM-5EGGY,1.0,B1,"2 pieces, foci of hepatocyte necrosis and ball...",7.9,Liver,Liver,1114,250.0,951.0,...,50.576138,0.005714,0.993494,,0.0,49.746162,GTEX-11DXZ,1.0,50-59,0.0
6538,GTEX-11EQ9-0526-SM-5A5JZ,1.0,B1,"2 pieces, diffuse macro and microvesucular ste...",8.1,Liver,Liver,1114,82.0,617.0,...,50.2212,0.007967,0.995574,,0.0,49.962532,GTEX-11EQ9,1.0,30-39,2.0
6539,GTEX-11GSP-0626-SM-5986T,1.0,B1,2 pieces; central vascular congestion,6.2,Liver,Liver,1114,771.0,565.0,...,50.910202,0.025063,0.99258,,0.0,49.414272,GTEX-11GSP,2.0,60-69,2.0
6540,GTEX-11NUK-1226-SM-5P9GM,1.0,B1,"2 pieces, includes capsule (target is 1 cm bel...",6.1,Liver,Liver,1114,956.0,829.0,...,49.88246,0.015518,0.989904,,0.0,50.5536,GTEX-11NUK,1.0,50-59,2.0


## Test and Train Set Creation

In [282]:
cpm_train, cpm_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(lcpm, tissue_meta['AGE'], test_size=.3, random_state=1234) # random state guarantees that the same split is made for a given tissue.
print(cpm_train.shape)
#print(cpm_train.head())
print(tissue_meta['AGE'].unique())
print(cpm_test.shape)
print(y_train.head())
print(y_test.shape)
sum(cpm_train.iloc[:,0]) # Confirms that the split is the same each time

(122, 56202)
['60-69' '50-59' '30-39' '20-29' '40-49' '70-79']
(53, 56202)
6650    40-49
6662    50-59
6572    40-49
6628    60-69
6639    50-59
Name: AGE, dtype: object
(53,)


632

## Filter by Expression
As adapted from the edgeR package function [filterByExpression](https://rdrr.io/bioc/edgeR/src/R/filterByExpr.R).

In [283]:
def filter_by_expr(min):
    pass

In [284]:
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as data

In [285]:
cpm_new = cpm_train.as_matrix()
cpm_new = np.delete(cpm_new,0,axis=1)
print(cpm_new)

[[3.450000e+02 3.000000e+00 8.000000e+00 ... 1.049359e+06 1.000000e+00
  1.000000e+00]
 [4.010000e+02 3.000000e+00 3.000000e+00 ... 1.184313e+06 1.000000e+00
  1.000000e+00]
 [3.760000e+02 5.000000e+00 3.000000e+00 ... 6.927930e+05 3.000000e+00
  6.000000e+00]
 ...
 [2.770000e+02 3.000000e+00 4.000000e+00 ... 2.218908e+06 8.000000e+01
  3.000000e+00]
 [5.050000e+02 2.000000e+00 4.000000e+00 ... 1.416132e+06 2.000000e+00
  0.000000e+00]
 [2.410000e+02 2.000000e+00 2.000000e+00 ... 1.068157e+06 0.000000e+00
  0.000000e+00]]


  """Entry point for launching an IPython kernel.


In [286]:
y_train_new = y_train.map({'20-29':0,'30-39':1,'40-49':2,'50-59':3, '60-69':4,
       '70-79':5})

In [287]:
class my_points():
    def __init__(self):
        #pd_data = pd.read_csv(filename).values   # Read data file.
        self.data = cpm_new # 1st and 2nd columns --> x,y
        self.target = y_train_new.as_matrix() # 3nd column --> label
        self.n_samples = self.data.shape[0]
    
    def __len__(self):   # Length of the dataset.
        return self.n_samples
    
    def __getitem__(self, index):
        # Function that returns one point and one label.
        #print(self.target[index])
        #print(self.data[index])
        #return torch.Tensor(self.data[index]), torch.Tensor(self.target[index])
        return self.data[index], self.target[index]

In [288]:
# We create the dataloader.
#my_data = my_points('clas.csv')
import torch.utils.data as data
my_data = my_points()
batch_size = 1
my_loader = data.DataLoader(my_data,batch_size=batch_size,num_workers=0)


  """


In [289]:
# We build a simple model with the inputs and six output layers(one for each age group).
class my_model(nn.Module):
    def __init__(self,n_in=56201,n_hidden=10,n_out=6):
        super(my_model,self).__init__()
        self.n_in  = n_in
        self.n_out = n_out
         
        self.linearlinear = nn.Sequential(
            nn.Linear(self.n_in,self.n_out,bias=True),   # Hidden layer.
            )
        self.logprob = nn.LogSoftmax(dim=1)                 # -Log(Softmax probability).
    
    def forward(self,x):
        x = self.linearlinear(x)
        x = self.logprob(x)
        return x

In [290]:
# Now, we create the mode, the loss function or criterium and the optimizer 
# that we are going to use to minimize the loss.

# Model.
model = my_model()

# Negative log likelihood loss.
criterium = nn.NLLLoss()

# Adam optimizer with learning rate 0.1 and L2 regularization with weight 1e-4.
optimizer = torch.optim.Adam(model.parameters(),lr=0.1,weight_decay=1e-4)

In [291]:
# Taining.
model.double()
accuracy=0
for epoch in range(3):
    truecount=0
    totalcount=0
    
    for k, (data, target) in enumerate(my_loader):
        model.zero_grad()
        log_p = model(data)
        loss = criterium(log_p,target)
        loss.backward()
        #print(target)#, torch.max(torch.exp(log_p),1))
        #print(type(torch.max(torch.exp(log_p),1)))
        #print(torch.max(torch.exp(log_p),1)[1])
        totalcount+=1
        if(target == torch.max(torch.exp(log_p),1)[1]):
            truecount+=1
        

        # Print loss every 10 iterations.
        #if k%10==0:
        #    print('Loss {:.4f} at iter {:d}'.format(loss.item(),k))

        # Model weight modification based on the optimizer. 
        optimizer.step()
    #output = (log_p>0.5).float()
    #correct = (output == labels).float().sum()
    #print("Epoch {}/{}, Loss: {:.3f}, Accuracy: {:.3f}".format(epoch+1,num_epochs, loss.data[0], correct/x.shape[0]))
    if ((truecount/totalcount)*100)>accuracy:
        accuracy = (truecount/totalcount)*100
    print(truecount/totalcount)
#print(log_p)
print('Accuracy for ',TISSUE,' tissue: ',accuracy)
   

0.3524590163934426
0.38524590163934425
0.4426229508196721
Accuracy for  Liver  tissue:  44.26229508196721


In [86]:
colors = ['r','b','g','y']
points = data.numpy()

# Ground truth.
target = target.numpy()
for k in range(4):
    select = target[:,0]==k
    p = points[select,:]
    plt.scatter(p[:,0],p[:,1],facecolors=colors[k])

# Predictions.
pred = pred.exp().detach()     # exp of the log prob = probability.
_, index = torch.max(pred,1)   # index of the class with maximum probability.
pred = pred.numpy()
index = index.numpy()
for k in range(4):
    select = index==k
    p = points[select,:]
    plt.scatter(p[:,0],p[:,1],s=60,marker='s',edgecolors=colors[k],facecolors='none')

plt.show()

IndexError: too many indices for array

In [213]:
#Accuracy for  Liver  tissue:  45.08196721311475
#Accuracy for  Lung  tissue:  36.577181208053695
#Accuracy for  Stomach  tissue:  39.010989010989015
#Accuracy for  Pancreas  tissue:  34.68208092485549
#Accuracy for  Thyroid  tissue:  39.1025641025641
#Accuracy for  Spleen  tissue:  37.16814159292036
#Accuracy for  Vagina  tissue:  38.75
#Accuracy for  Uterus  tissue:  44.15584415584416



In [246]:
import pandas as pd
from pathlib import Path
from os import listdir
import sklearn.model_selection
import sklearn.feature_selection
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as data

data_dir=Path("data")
tissue_dir=Path("tissue-specific")
manifest={"data":"All_Tissue_Site_Details.combined.reads.gct",
              "sample_meta":"GTEx_v7_Annotations_SampleAttributesDS.txt",
              "subject_meta":"GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
               "merged_meta":"merged_meta.tsv"}
meta=pd.read_csv(data_dir/manifest['merged_meta'],sep="\t",dtype={'SMUBRID':object,'SEX':object,'DTHHRDY':object})
meta=meta[~(meta['AGE'].isnull())] # removes all samples without age
#meta=meta[~(np.isnan(meta['AGE']))]
#meta.iloc[0:3,:]
counts=pd.DataFrame(meta['SMTS'].value_counts())
df=meta[meta['SMTS'].isin(counts[counts['SMTS']>200].index)]
df=pd.crosstab(index=df['SMTS'],columns=df['AGE'])
tissues = df.index.values
print(tissues)


['Adipose Tissue' 'Blood' 'Blood Vessel' 'Brain' 'Breast' 'Colon'
 'Esophagus' 'Heart' 'Lung' 'Muscle' 'Nerve' 'Pancreas' 'Skin' 'Stomach'
 'Testis' 'Thyroid']


In [247]:
class my_points():
    def __init__(self):
        self.data = cpm_new 
        self.target = y_train_new.as_matrix() 
        self.n_samples = self.data.shape[0]
    
    def __len__(self):   
        return self.n_samples
    
    def __getitem__(self, index):
        return self.data[index], self.target[index]

In [248]:
# We build a simple model with the inputs and six output layers(one for each age group).
class my_model(nn.Module):
    def __init__(self,n_in=56201,n_hidden=10,n_out=6):
        super(my_model,self).__init__()
        self.n_in  = n_in
        self.n_out = n_out
         
        self.linearlinear = nn.Sequential(
            nn.Linear(self.n_in,self.n_out,bias=True),   # Hidden layer.
            )
        self.logprob = nn.LogSoftmax(dim=1)                 # -Log(Softmax probability).
    
    def forward(self,x):
        x = self.linearlinear(x)
        x = self.logprob(x)
        return x

In [294]:

infiles=listdir(data_dir/tissue_dir)
tissueList = ['Blood','Blood Vessel','Breast','Colon',
 'Esophagus','Heart','Lung','Muscle','Nerve','Pancreas','Skin','Stomach','Testis','Thyroid']
for TISSUE in tissueList:
    print(TISSUE)
    TISSUE_files=[f for f in infiles if  TISSUE in f]
    cpm=pd.read_csv(data_dir/tissue_dir/TISSUE_files[1],sep="\t",index_col=0)
    lcpm=pd.read_csv(data_dir/tissue_dir/TISSUE_files[0],sep="\t",index_col=0)
    tissue_meta=meta[meta['SMTS']==TISSUE]
    print(meta['AGE'].unique())
    print(tissue_meta.shape[0])
    print(lcpm.shape)
    if tissue_meta.shape[0] == lcpm.shape[0]:
        
        cpm_train, cpm_test, y_train, y_test = \
                sklearn.model_selection.train_test_split(lcpm, tissue_meta['AGE'], test_size=.3, random_state=1234) # random state guarantees that the same split is made for a given tissue.
        cpm_new = cpm_train.as_matrix()
        cpm_new = np.delete(cpm_new,0,axis=1)
        y_train_new = y_train.map({'20-29':0,'30-39':1,'40-49':2,'50-59':3, '60-69':4,
               '70-79':5})
        my_data = my_points()
        batch_size = 1
        import torch.utils.data as data
        my_loader = data.DataLoader(my_data,batch_size=batch_size,num_workers=0)
        # Now, we create the mode, the loss function or criterium and the optimizer 
        # that we are going to use to minimize the loss.

        # Model.
        model = my_model()

        # Negative log likelihood loss.
        criterium = nn.NLLLoss()

        # Adam optimizer with learning rate 0.1 and L2 regularization with weight 1e-4.
        optimizer = torch.optim.Adam(model.parameters(),lr=0.1,weight_decay=1e-4)
        # Training.
        model.double()
        accuracy=0
        for epoch in range(3):
            truecount=0
            totalcount=0

            for k, (data, target) in enumerate(my_loader):
                model.zero_grad()
                log_p = model(data)
                loss = criterium(log_p,target)
                loss.backward()
                totalcount+=1
                if(target == torch.max(torch.exp(log_p),1)[1]):
                    truecount+=1
                # Model weight modification based on the optimizer. 
                optimizer.step()
            if ((truecount/totalcount)*100)>accuracy:
                accuracy = (truecount/totalcount)*100
            print(truecount/totalcount)

        print('Accuracy for ',TISSUE,' tissue: ',accuracy)

Blood
['60-69' '50-59' '40-49' '20-29' '30-39' '70-79']
537
(913, 56202)
Blood Vessel
['60-69' '50-59' '40-49' '20-29' '30-39' '70-79']
913
(913, 56202)


  """


0.2676056338028169
0.3646322378716745
0.3646322378716745
Accuracy for  Blood Vessel  tissue:  36.46322378716745
Breast
['60-69' '50-59' '40-49' '20-29' '30-39' '70-79']
290
(290, 56202)


RuntimeError: Expected object of scalar type Double but got scalar type Long for argument #4 'mat1'