In [1]:
import pandas as pd
import numpy as np
import config

In [4]:
import functions
# Load the feature table X and the labels y through the project-local helper.
# NOTE(review): `boolean=True` presumably binarises abundances into
# presence/absence (the displayed frame holds True/False values) — confirm
# against functions.load_taxonomy.
X, y = functions.load_taxonomy(boolean=True) 
# Bare last expression: display the feature table.
X

Unnamed: 0_level_0,Unnamed: 1_level_0,s__Abiotrophia_defectiva,s__Acidaminococcus_fermentans,s__Acidaminococcus_intestini,s__Actinomyces_graevenitzii,s__Actinomyces_odontolyticus,s__Actinomyces_oris,s__Actinomyces_turicensis,s__Actinomyces_viscosus,s__Adlercreutzia_equolifaciens,s__Aggregatibacter_segnis,...,s__Subdoligranulum_variabile,s__Succinatimonas_hippei,s__Sutterella_wadsworthensis,s__Turicibacter_sanguinis,s__Varibaculum_cambriense,s__Veillonella_atypica,s__Veillonella_dispar,s__Veillonella_parvula,s__Weissella_cibaria,s__Weissella_confusa
Study Accession,Sample Accession or Sample ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
PRJDB3601,SAMD00036192,False,False,False,True,True,False,False,False,False,False,...,False,False,True,False,False,True,True,True,False,False
PRJDB3601,SAMD00036193,False,False,False,True,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,False
PRJDB3601,SAMD00036194,False,False,False,False,False,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,False
PRJDB3601,SAMD00036197,False,False,False,True,True,False,False,True,True,False,...,False,False,True,False,False,True,True,True,False,False
PRJDB3601,SAMD00036204,False,False,False,True,True,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N5,SRR5279310,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
N5,SRR5279311,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
N5,SRR5279312,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
N5,SRR5279313,False,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,True,True,True,False,False


In [5]:
class Linear:
    """Fully connected layer: ``output = w . X + b`` with samples stored as columns."""

    def __init__(self, input_size, output_size):
        """
        Draw the parameters uniformly from [0, 1) and shrink them by ``input_size``.
        w = (n_o, n_i), b = (n_o, 1). The weights are drawn before the biases.
        """
        weight_shape = (output_size, input_size)
        bias_shape = (output_size, 1)
        self.w = np.random.random_sample(weight_shape) / input_size
        self.b = np.random.random_sample(bias_shape) / input_size

    def forward(self, X):
        """
        X = (n_i, m)
        Caches X for the backward pass and returns the (n_o, m) pre-activation.
        """
        self.input = X
        projected = np.dot(self.w, X)
        # b is (n_o, 1) and broadcasts across the m columns.
        return projected + self.b

    def backward(self, grad):
        """
        grad = (n_o, m): derivative of the cost wrt this layer's output.
        Stores the parameter gradients on the instance and returns the (n_i, m)
        derivative of the cost wrt this layer's input.
        """
        cached_input_t = self.input.T
        # (n_o, m) . (m, n_i) -> (n_o, n_i)
        self.weight_grad = np.dot(grad, cached_input_t)
        # Summed over the batch axis -> (n_o, 1)
        self.bias_grad = np.sum(grad, axis=1, keepdims=True)
        # (n_i, n_o) . (n_o, m) -> (n_i, m)
        return np.dot(self.w.T, grad)

    def step(self, alpha):
        """
        One gradient-descent update: move each parameter against its stored
        gradient, scaled by the learning rate ``alpha``.
        """
        weight_delta = alpha * self.weight_grad
        bias_delta = alpha * self.bias_grad
        self.w = self.w - weight_delta
        self.b = self.b - bias_delta

In [6]:
class Sigmoid:
    """Element-wise logistic activation layer; it has no trainable parameters."""

    def __init__(self):
        """Nothing to initialise."""

    def forward(self, z):
        """Apply 1 / (1 + exp(-z)) element-wise and cache the result for backward()."""
        # (n_i, m)
        denominator = 1 + np.exp(-1 * z)
        self.output = 1 / denominator
        return self.output

    def backward(self, grad):
        """
        Chain rule through the activation: the local derivative is
        output * (1 - output), multiplied element-wise by the incoming grad.
        """
        # (n_i, m)
        local_slope = self.output * (1 - self.output)
        return local_slope * grad

    def step(self, alpha):
        """No parameters to update; present so every layer exposes step()."""
        return None

In [7]:
class MSE:
    """Squared-error loss, evaluated element-wise (no reduction over samples)."""

    def __init__(self):
        """Stateless; nothing to initialise."""

    def forward(self, y, y_hat):
        """Return the element-wise squared difference between y and y_hat."""
        residual = y - y_hat
        return np.square(residual)

    def backward(self, y, y_hat):
        """
            Derivative of the squared error wrt y_hat,
            element-wise: -2 * (y - y_hat)
        """
        residual = y - y_hat
        return -2 * residual

In [8]:
class Net:
    """
    Sequential network trained by mini-batch gradient descent on a squared-error loss.

    ``layers`` is an ordered list of objects exposing ``forward(x)``,
    ``backward(grad)`` and ``step(alpha)``. Data is laid out with one sample
    per column: X = (n_features, m).
    """

    def __init__(self, layers):
        self.layers = layers
        self.mse = MSE()

    @staticmethod
    def _label_row(y):
        """Return the labels as a 1-D array; accepts shape (m,) or (1, m)."""
        return np.atleast_2d(y)[0, :]

    def forward(self, X):
        """Feed X through every layer in order and return the final output."""
        y_hat = X
        for layer in self.layers:
            y_hat = layer.forward(y_hat)
        return y_hat

    def backward(self, grad):
        """
            Grad is partial derivative of cost function wrt to y_hat.
            Propagates it through the layers in reverse order; each layer
            stores its own parameter gradients.
        """
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def step(self, alpha):
        """Ask every layer to apply one update with learning rate ``alpha``."""
        for layer in self.layers:
            layer.step(alpha)

    def fit(self, X, y, epochs=30, mini_batch_size=10, alpha=0.03, test_data=None):
        """
        Train the network.

        X = (n_features, m); y holds one label per sample, shape (1, m) or (m,).
        Labels are one-hot encoded with one row per distinct label value, in
        the column order produced by ``pd.get_dummies``. When ``test_data`` is
        an ``(X_test, y_test)`` pair, accuracy on it is printed after every
        epoch.
        """
        # Cast to float so the loss arithmetic does not depend on the dummy
        # dtype, which differs between pandas versions (uint8 vs bool).
        y = pd.get_dummies(self._label_row(y)).T.values.astype(float)
        for epoch in range(epochs):
            for X_batch, y_batch in self.get_minibatches(X, y, mini_batch_size):
                y_hat = self.forward(X_batch)
                grad = self.mse.backward(y_batch, y_hat)
                self.backward(grad)
                self.step(alpha)
            print("End epoch", str(epoch), )
            if test_data is not None:
                print("Accuracy:", self.score(test_data[0], test_data[1]))

    def predict(self, X):
        """Return, for each column of X, the index of the largest output unit."""
        y_hat = self.forward(X)
        predictions = np.argmax(y_hat, axis=0)
        return predictions

    def score(self, X, y):
        """
        Returns accuracy: the fraction of samples whose predicted index equals
        the label. y may have shape (1, m) or (m,).
        """
        predictions = self.predict(X)
        # Flatten the labels first so the comparison is always (m,) vs (m,);
        # comparing a 1-D y directly and summing over axis 0 would yield a
        # count of hits rather than a fraction.
        accuracy = np.mean(self._label_row(y) == predictions)
        return accuracy

    def get_minibatches(self, X, y, mini_batch_size):
        """
        Split the m samples (columns) of X and y into shuffled mini-batches.

        Every sample appears in exactly one batch; the final batch is smaller
        when m is not a multiple of ``mini_batch_size``. Returns a list of
        ``(X_batch, y_batch)`` tuples whose columns stay aligned.
        """
        m = X.shape[1]
        # A fresh random order on every call, so each epoch sees new batches.
        perm = np.random.permutation(m)
        mini_batches = []
        for start_idx in range(0, m, mini_batch_size):
            # Slicing past the end is clamped, so the last sample is kept and
            # no empty batch is ever produced.
            batch_idx = perm[start_idx:start_idx + mini_batch_size]
            mini_batches.append((X[:, batch_idx], y[:, batch_idx]))
        return mini_batches

In [79]:
# Column labels of X are species identifiers such as "s__Abiotrophia_defectiva".
species = X.columns.tolist()
# Splitting on "_" gives ['s', '', '<Genus>', ...]; keep the sorted distinct genus tokens.
genus = np.unique([label.split("_")[2] for label in species])

In [80]:
# Collapse the species-level table into one column per genus by summing, per
# sample, the columns of all species that belong to that genus.
# Membership is decided by exact equality on the genus token (the third
# "_"-separated field of the species label). A substring test would also pick
# up labels that merely contain the genus name somewhere (e.g. "Enterobacter"
# inside "Enterobacteriaceae") and count those species under two genera.
dfs = []
for g in genus:
    species_with_g = [s for s in species if s.split("_")[2] == g]
    df = pd.DataFrame(X[species_with_g].sum(axis=1))
    df.columns = [g]
    dfs.append(df)
genus_df = pd.concat(dfs, axis=1)
genus_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Abiotrophia,Acidaminococcus,Actinomyces,Adlercreutzia,Aggregatibacter,Akkermansia,Alistipes,Alloscardovia,Anaerococcus,Anaerofustis,...,Staphylococcus,Streptococcus,Subdoligranulum,Succinatimonas,Sutterella,Turicibacter,Varibaculum,Veillonella,Weissella,candidate
Study Accession,Sample Accession or Sample ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
PRJDB3601,SAMD00036192,0,0,2,0,0,0,1,0,0,0,...,0,9,0,0,1,0,0,3,0,0
PRJDB3601,SAMD00036193,0,0,1,1,0,1,2,0,0,0,...,0,4,0,0,1,0,0,0,0,0
PRJDB3601,SAMD00036194,0,0,0,1,0,1,2,0,0,0,...,0,7,0,0,1,0,0,0,0,0
PRJDB3601,SAMD00036197,0,0,3,1,0,0,0,0,0,0,...,0,10,0,0,1,0,0,3,0,0
PRJDB3601,SAMD00036204,0,0,3,1,0,1,2,0,0,0,...,0,3,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N5,SRR5279310,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N5,SRR5279311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N5,SRR5279312,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
N5,SRR5279313,0,0,1,0,0,0,4,0,0,0,...,0,7,0,0,1,0,0,3,0,0


In [29]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; random_state pins the shuffle so
# the split is repeatable. Note that this splits the species-level table X,
# not the genus-level genus_df built above.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [30]:
# Sanity check: transposed labels form a single row, the (1, m) layout that
# Net.fit and Net.score index into.
y_train.T.shape

(1, 4020)

In [33]:
# Seed NumPy's global RNG so the random weight initialisation in Linear (and
# any other np.random use during training) is repeatable across kernel restarts.
np.random.seed(42)

# The network expects samples as columns, hence the transposes below; the
# input width is therefore the number of columns of X_train. Deriving it keeps
# this cell valid if the feature table changes.
n_features = X_train.shape[1]

# One hidden layer of 30 sigmoid units, then one output unit per one-hot label
# row built inside Net.fit (two here).
net = Net([
    Linear(n_features, 30),
    Sigmoid(),
    Linear(30, 2),
    Sigmoid()
])
# Accuracy on the held-out split is printed after every epoch.
net.fit(X_train.T.values, y_train.T.values, test_data=(X_test.T.values, y_test.T.values), epochs=60)

End epoch 0
Accuracy: 0.6978131212723658
End epoch 1
Accuracy: 0.731610337972167
End epoch 2
Accuracy: 0.742544731610338
End epoch 3
Accuracy: 0.7495029821073559
End epoch 4
Accuracy: 0.7534791252485089
End epoch 5
Accuracy: 0.7584493041749503
End epoch 6
Accuracy: 0.757455268389662
End epoch 7
Accuracy: 0.7564612326043738
End epoch 8
Accuracy: 0.757455268389662
End epoch 9
Accuracy: 0.7624254473161034
End epoch 10
Accuracy: 0.7624254473161034
End epoch 11
Accuracy: 0.76441351888668
End epoch 12
Accuracy: 0.7594433399602386
End epoch 13
Accuracy: 0.7544731610337972
End epoch 14
Accuracy: 0.7534791252485089
End epoch 15
Accuracy: 0.7514910536779325
End epoch 16
Accuracy: 0.7514910536779325
End epoch 17
Accuracy: 0.7504970178926441
End epoch 18
Accuracy: 0.7495029821073559
End epoch 19
Accuracy: 0.7495029821073559
End epoch 20
Accuracy: 0.7504970178926441
End epoch 21
Accuracy: 0.7475149105367793
End epoch 22
Accuracy: 0.7475149105367793
End epoch 23
Accuracy: 0.7435387673956262
End epoc