### RNA-SEQ Dataset

- Generating a dataset that combines expression profiles (DESeq analysis between the fat and slim organ conditions) with GTEx data
- Standardization of the expression profiles and a simple train/test split
- Very simple Neural Network and Random Forest working examples with tf.keras and sklearn

In [None]:
import pandas as pd
import numpy as np

In [None]:
## Set the correct working directory.
## The data files read in the following cells are opened with paths relative to it.
%cd ~/hackathon/data

In [None]:
# Pancreas sample IDs: one entry per line of the ID file, with trailing
# whitespace (including the newline) stripped. The `with` block guarantees the
# file handle is closed once the list has been built.
with open('pancreas/subjID.pancreas.id') as id_file:
    id_list_pancreas = [line.rstrip() for line in id_file]

# Raw dataset from which the per-organ tables are selected.
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted file.
df_raw = pd.read_pickle('rna_seq_dataset.pkl')

In [None]:
# Columns removed from the per-organ sample tables before modelling:
# the two fat-percentage columns (used separately as regression targets)
# followed by the liver_* and pancreas_* annotation columns.
drop_that_cols = [
    # fat-percentage columns
    'Fat.Percentage_liver',
    'Fat.Percentage_pancreas',
    # liver annotations
    'liver_clean_specimens',
    'liver_hyalinization',
    'liver_hyperplasia',
    'liver_infarction',
    'liver_ischemic_changes',
    'liver_no_abnormalities',
    'liver_pigment',
    'liver_scarring',
    'liver_sclerotic',
    # pancreas annotations
    'pancreas_calcification',
    'pancreas_congestion',
    'pancreas_cyst',
    'pancreas_desquamation',
    'pancreas_diabetic',
    'pancreas_hemorrhage',
    'pancreas_inflammation',
    'pancreas_metaplasia',
    'pancreas_necrosis',
    'pancreas_no_abnormalities',
    'pancreas_nodularity',
    'pancreas_scarring',
    'pancreas_sclerotic',
]

In [None]:
## Pancreas
# Select the rows whose index labels are the pancreas sample IDs.
# `.loc` replaces the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
# NOTE(review): assumes df_raw is indexed by these ID labels; unlike `.ix`,
# `.loc` raises KeyError when an ID is missing from the index -- confirm.
df_clean_pancreas = df_raw.loc[id_list_pancreas]

# Regression target: the pancreas fat-percentage column divided by 100
# (presumably percent -> fraction). Double brackets keep it a one-column DataFrame.
target_pancreas = df_clean_pancreas[['Fat.Percentage_pancreas']] / 100

# Remove the fat-percentage and annotation columns from the feature table.
df_clean_pancreas = df_clean_pancreas.drop(drop_that_cols, axis=1)

# Space-separated pancreas TPM table.
df_tpm_pancreas = pd.read_csv('pancreas/tpm_matrix/tpm_matrix.pancreas.RNA.traspose.txt', sep=' ')

In [None]:
# Give the TPM rows the same index as df_clean_pancreas, then discard the
# 'Name' column.
# NOTE(review): the index is assigned positionally, so this assumes the TPM
# rows are in the same order as df_clean_pancreas -- TODO confirm.
df_tpm_pancreas = (
    df_tpm_pancreas
    .set_index(df_clean_pancreas.index)
    .drop(labels=['Name'], axis=1)
)

In [None]:
## Liver
# Liver sample IDs: one entry per line of the ID file, with trailing
# whitespace (including the newline) stripped. The `with` block guarantees the
# file handle is closed once the list has been built.
with open('liver/subjID.liver.id') as id_file:
    id_list_liver = [line.rstrip() for line in id_file]

In [None]:
# Select the rows whose index labels are the liver sample IDs.
# `.loc` replaces the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
# NOTE(review): assumes df_raw is indexed by these ID labels; unlike `.ix`,
# `.loc` raises KeyError when an ID is missing from the index -- confirm.
df_clean_liver = df_raw.loc[id_list_liver]

# Regression target: the liver fat-percentage column divided by 100
# (presumably percent -> fraction). Double brackets keep it a one-column DataFrame.
target_liver = df_clean_liver[['Fat.Percentage_liver']] / 100

# Remove the fat-percentage and annotation columns from the feature table.
df_clean_liver = df_clean_liver.drop(drop_that_cols, axis=1)

In [None]:
# Load the space-separated liver TPM table.
df_tpm_liver = pd.read_csv(
    'liver/tpm_matrix/tpm_matrix.liver.RNA.traspose.txt',
    sep=' ',
)

In [None]:
# Give the TPM rows the same index as df_clean_liver, then discard the
# 'Name' column.
# NOTE(review): the index is assigned positionally, so this assumes the TPM
# rows are in the same order as df_clean_liver -- TODO confirm.
df_tpm_liver = (
    df_tpm_liver
    .set_index(df_clean_liver.index)
    .drop(labels=['Name'], axis=1)
)

### Feature standardization

In [None]:
from sklearn.preprocessing import StandardScaler

## Pancreas
# Fit a StandardScaler on the complete pancreas TPM table.
# NOTE(review): the scaler is fitted on every sample, including the ones that
# are later held out as the test set by the train/test split.
scaler = StandardScaler()
data = df_tpm_pancreas
scaler.fit(X=data)

In [None]:
# Standardized pancreas expression values, labelled with the same row index as
# df_clean_pancreas. Column labels are the default integer range because
# transform() returns a plain array.
pancreas_scaled = pd.DataFrame(
    scaler.transform(data),
    index=df_clean_pancreas.index,
)

In [None]:
## Liver
# Fit a fresh StandardScaler on the complete liver TPM table. `data` and
# `scaler` are rebound here, replacing the pancreas objects of the same names.
# NOTE(review): the scaler is fitted on every sample, not only on a training subset.
scaler = StandardScaler()
data = df_tpm_liver
scaler.fit(X=data)

In [None]:
# Standardized liver expression values, labelled with the same row index as
# df_clean_liver. Column labels are the default integer range because
# transform() returns a plain array.
liver_scaled = pd.DataFrame(
    scaler.transform(data),
    index=df_clean_liver.index,
)

### Models

#### Format and split the data

In [None]:
# Join the remaining sample columns and the standardized expression values
# side by side (aligned on the row index). ignore_index=True along axis=1
# discards the column names and renumbers the columns 0..n-1.
liver = pd.concat(
    [df_clean_liver, liver_scaled],
    axis=1,
    ignore_index=True,
)

In [None]:
# Join the remaining sample columns and the standardized expression values
# side by side (aligned on the row index). ignore_index=True along axis=1
# discards the column names and renumbers the columns 0..n-1.
pancreas = pd.concat(
    [df_clean_pancreas, pancreas_scaled],
    axis=1,
    ignore_index=True,
)

In [None]:
def split(dataset, target, test_size, random_state=42):
    """Split features and target into train and test subsets.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    dataset : array-like
        Feature table; passed as the first array to ``train_test_split``.
    target : array-like
        Target values; passed as the second array to ``train_test_split``.
    test_size : float or int
        Forwarded unchanged as ``test_size``.
    random_state : int, optional
        Seed forwarded as ``random_state``. Defaults to 42, the value that was
        previously hard-coded, so existing calls give the same split.

    Returns
    -------
    list
        ``[dataset_train, dataset_test, target_train, target_test]``.
    """
    # Function-local import, kept as in the original cell.
    from sklearn.model_selection import train_test_split

    return train_test_split(
        dataset,
        target,
        test_size=test_size,
        random_state=random_state,
    )

#### Simple toy NN

In [None]:
import keras
from keras.layers import Input, Dense, Activation
from keras import Model

In [None]:
def network(training_data):
    """Build and compile a small fully connected regression network.

    Architecture: Input -> Dense(128, relu) -> Dense(16, relu) -> Dense(1).

    Parameters
    ----------
    training_data : object with a ``shape`` attribute
        Only ``training_data.shape[1]`` is read; it sets the width of the
        input layer.

    Returns
    -------
    keras.Model
        Model compiled with an Adam optimizer (learning rate 1e-4, AMSGrad
        enabled), mean-squared-error loss and the MAE metric.
    """
    n_features = training_data.shape[1]

    input_layer = Input(shape=(n_features,))
    hidden = Dense(128, activation='relu')(input_layer)
    hidden = Dense(16, activation='relu')(hidden)
    # No activation is given, so the single output unit is linear.
    output_layer = Dense(1)(hidden)
    model = Model(input_layer, output_layer)

    # Hand the configured optimizer instance to compile(). The earlier code
    # built this object but then passed the string 'Adam', so the default
    # optimizer settings were used and the custom learning rate was ignored.
    # `amsgrad` is a boolean flag; the earlier value 0.8 was merely truthy.
    optimizer = keras.optimizers.Adam(learning_rate=0.0001, amsgrad=True)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])
    return model

In [None]:
## Example with Pancreas Dataset

# network() only reads pancreas.shape[1] to size the input layer.
regressor = network(pancreas)
# Hold out 33% of the samples as the test set.
X_train, X_test, y_train, y_test = split(pancreas, target_pancreas, 0.33)

# validation_data is passed as a tuple (x_val, y_val), the form documented for
# Model.fit, instead of a list.
History = regressor.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=1,
    epochs=44,
)

#### Simple toy Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline trained on the same split as the neural network.
# Note that this rebinds `regressor`, replacing the Keras model.
regressor = RandomForestRegressor(n_estimators=10000, random_state=42)

# y_train is a one-column DataFrame (the target was selected with double
# brackets). scikit-learn expects a 1-D target of shape (n_samples,) and
# otherwise emits a DataConversionWarning, so flatten it explicitly.
regressor.fit(X_train, y_train.values.ravel())