# RNA and DNA Model

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('merged_data.csv', header=0, index_col=0)
df.head()

Unnamed: 0,Autism,CDK5RAP1,TMEM246,PKD1,EDEM1,LSM11,COL26A1,CPSF6,DCTN5,TIMM9,...,chrX@149013727,chrX@151303393,chrX@151899760,chrX@152815089,chrX@153174819,chrX@153174867,chrX@153668172,chrX@153994596,chrX@154456747,chrY@21154466
1001-1,1.0,4.102123,5.449758,2.754897,4.421296,2.095513,5.256877,8.93742,9.332593,12.725592,...,0,0,0,0,0,0,0,0,0,1
1001-2,1.0,4.694169,5.454356,3.805587,4.818231,2.024392,5.046285,10.638089,8.801531,12.299218,...,0,0,0,0,0,0,0,0,0,1
1001-3,1.0,2.812362,1.189077,3.88632,6.84351,2.03651,1.080013,8.767987,8.341488,15.941539,...,0,0,0,0,0,0,0,0,0,1
1401-1,1.0,4.674325,5.576149,6.349399,3.780281,4.246606,5.585305,12.661424,10.581923,13.798437,...,1,0,0,0,0,0,0,0,0,1
1401-2,1.0,7.434831,5.90921,6.793024,5.334093,4.24192,5.091821,13.879991,11.368513,13.393159,...,1,0,0,0,0,0,0,0,0,1


In [12]:
y = df.Autism
y = y.fillna(0.0).astype(int)
y

1001-1       1
1001-2       1
1001-3       1
1401-1       1
1401-2       1
            ..
GM25256-3    0
GM25256-3    0
PGP1-1       0
PGP1-2       0
PGP1-3       0
Name: Autism, Length: 75, dtype: int32

In [13]:
x = df.iloc[:,1:]
x

Unnamed: 0,CDK5RAP1,TMEM246,PKD1,EDEM1,LSM11,COL26A1,CPSF6,DCTN5,TIMM9,BMPR2,...,chrX@149013727,chrX@151303393,chrX@151899760,chrX@152815089,chrX@153174819,chrX@153174867,chrX@153668172,chrX@153994596,chrX@154456747,chrY@21154466
1001-1,4.102123,5.449758,2.754897,4.421296,2.095513,5.256877,8.937420,9.332593,12.725592,7.022345,...,0,0,0,0,0,0,0,0,0,1
1001-2,4.694169,5.454356,3.805587,4.818231,2.024392,5.046285,10.638089,8.801531,12.299218,6.732810,...,0,0,0,0,0,0,0,0,0,1
1001-3,2.812362,1.189077,3.886320,6.843510,2.036510,1.080013,8.767987,8.341488,15.941539,10.167122,...,0,0,0,0,0,0,0,0,0,1
1401-1,4.674325,5.576149,6.349399,3.780281,4.246606,5.585305,12.661424,10.581923,13.798437,6.693848,...,1,0,0,0,0,0,0,0,0,1
1401-2,7.434831,5.909210,6.793024,5.334093,4.241920,5.091821,13.879991,11.368513,13.393159,8.375181,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GM25256-3,2.650833,6.657273,7.137243,4.336089,3.736166,5.295432,11.733303,9.982328,13.858887,7.570179,...,1,0,0,0,0,0,0,0,0,1
GM25256-3,2.650833,6.657273,7.137243,4.336089,3.736166,5.295432,11.733303,9.982328,13.858887,7.570179,...,1,0,0,0,0,0,0,0,0,0
PGP1-1,5.561823,9.167399,4.762467,4.259699,3.333153,5.245319,15.288924,10.670947,12.501312,7.806645,...,1,0,0,0,0,0,0,0,0,0
PGP1-2,2.050032,6.121957,3.086031,3.888580,3.064389,4.530108,14.225061,11.109630,14.227982,7.346219,...,1,0,0,0,0,0,0,0,0,0


In [18]:
from tensorflow import keras
from keras import layers
from tensorflow.keras.optimizers import Adam

#create early stop
early_stop = keras.callbacks.EarlyStopping(monitor = 'loss', min_delta = 0.001, patience=5, verbose=1)

#create model
model = keras.Sequential([
    layers.Dense(units=100, activation='relu', input_shape=[16776]),
    layers.Dense(units=1, activation='sigmoid')
])

#compile model
opt = Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='Poisson',
    metrics=['accuracy']
)

In [19]:
#import cross fold validation
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

mm = StandardScaler()

k = 5 #number of folds

#initialize kfold
kf = KFold(n_splits=k, random_state=1, shuffle=True)

#create empty list to hold scores
acc_score = []

#split training data
for train_index,test_index in kf.split(x):
    x_train, x_test = x.iloc[train_index,:], x.iloc[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    
    x_train_scaled = mm.fit_transform(x_train)
    x_test_scaled = mm.fit_transform(x_test)
    
    #fit model with training data
    history = model.fit(x_train, y_train, validation_data=(x_test,y_test),
                        batch_size = 20, epochs=100, shuffle=True, callbacks=[early_stop])
    pred = model.predict(x_test)
    
    #record accuracy of each fold
    acc = accuracy_score(y_test,pred.round())
    acc_score.append(acc)

#Print accuracy of the model
avg_score = sum(acc_score)/k
print('Accuracy of each fold = {}'.format(acc_score))
print('Avg accuracy = {}'.format(avg_score))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').