In [4]:
import time
import os
from tqdm import trange
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [5]:
def process_frequencies(dirpath):
    """Read every ``.freq`` file under ``dirpath`` and build the model input.

    The directory tree is walked recursively.  Each ``.freq`` file is read as
    a tab-separated table indexed by its ``position`` column.  The per-file
    tables are combined so that each row of the intermediate frame is one
    position and the columns are the per-file value columns laid side by
    side; a position missing from a file is filled with 0.  That frame is
    then handed to ``preprocess_df``.

    Parameters
    ----------
    dirpath : str
        Root directory to search for ``.freq`` files.

    Returns
    -------
    X_train : pandas.DataFrame
        Whatever ``preprocess_df`` returns for the combined frame.
    name_files : list of str
        Base names of the files that were read, in the order they were read.
        File names are visited in sorted order within each directory so the
        result does not depend on the file system's listing order.
    """
    name_files = []
    per_file_frames = []
    print("Reading files...")
    # Use fresh loop names so the `dirpath` argument is not rebound mid-walk.
    for root, _dirnames, filenames in os.walk(dirpath):
        for filename in sorted(filenames):
            if not filename.endswith(".freq"):
                continue
            path_freq_file = os.path.join(root, filename)
            # `filename` already is the base name; no separator splitting needed.
            name_files.append(filename)
            df_cpgs_file = pd.read_table(path_freq_file, sep="\t", index_col="position")
            per_file_frames.append(df_cpgs_file.T)

    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append no longer exists in pandas 2.x and was quadratic).
    if per_file_frames:
        df_all = pd.concat(per_file_frames, sort=False)
    else:
        # No input files: keep the historical behaviour of passing an empty frame on.
        df_all = pd.DataFrame()
    df_all = df_all.fillna(0).T

    print("Preprocessing dataframe...")
    print(len(name_files))
    X_train = preprocess_df(df_all, name_files)
    print("Preprocessing Done!")

    return X_train, name_files

In [6]:
def transform_df(df_pos):
    """Normalise each row to fractions of its total, then min-max scale columns.

    Step 1: every row is divided by its own sum, so the values in a row
    become fractions of that row's total.  A row whose sum is zero (or NaN)
    produces NaN, which is replaced by 0.

    Step 2: the resulting table is passed through scikit-learn's
    ``MinMaxScaler`` (constructed with its default settings), which rescales
    each column using that column's minimum and maximum over the rows.

    Parameters
    ----------
    df_pos : pandas.DataFrame
        Numeric table, one row per sample.  Any number of columns is accepted.

    Returns
    -------
    pandas.DataFrame
        Scaled values carrying the same index and columns as ``df_pos``.
    """
    # skipna=False mirrors the builtin sum(): one NaN makes the whole row NaN,
    # and the row is then zeroed by fillna below.
    row_totals = df_pos.sum(axis=1, skipna=False)
    proportions = df_pos.div(row_totals, axis=0).fillna(0)

    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(proportions.to_numpy())

    return pd.DataFrame(scaled_values, columns=df_pos.columns, index=df_pos.index)

In [7]:
def preprocess_df(df_all, name_files, base_columns=("A", "T", "G", "C", "+", "-")):
    """Scale the per-file values of every row of ``df_all`` and join the results.

    ``df_all`` holds one row per position; its column labels are the
    ``base_columns`` labels, repeated once per input file.  For each row a
    small table is built with one row per file (labelled by ``name_files``)
    and one column per entry of ``base_columns``; that table is passed
    through ``transform_df``.  The scaled tables of all rows are finally
    placed side by side.

    Parameters
    ----------
    df_all : pandas.DataFrame
        One row per position; each label in ``base_columns`` must appear once
        per file among the columns.
    name_files : list of str
        Row labels of the result, one per input file, in the same order as
        the files' column groups in ``df_all``.
    base_columns : sequence of str, optional
        Column labels to extract, in output order.

    Returns
    -------
    pandas.DataFrame
        ``len(name_files)`` rows and ``len(df_all) * len(base_columns)``
        columns; an empty frame when ``df_all`` has no rows.

    Raises
    ------
    KeyError
        If a label of ``base_columns`` is missing from ``df_all``.
    ValueError
        If the number of values found for a label differs from
        ``len(name_files)``.
    """
    base_columns = list(base_columns)
    scaled_blocks = []

    for index in range(len(df_all)):
        position_row = df_all.iloc[index]
        # `.loc[[label]]` always yields a Series, even when the label occurs
        # only once (single input file); plain `row[label]` would give a
        # scalar in that case and break DataFrame construction.
        df_tmp = pd.DataFrame(
            {label: position_row.loc[[label]].to_numpy() for label in base_columns},
            index=name_files,
            columns=base_columns,
        )
        scaled_blocks.append(transform_df(df_tmp))

    if not scaled_blocks:
        return pd.DataFrame()

    # Join once at the end rather than re-copying the growing frame each pass.
    return pd.concat(scaled_blocks, axis=1)

In [8]:
def predict(X_test, dirpath):
    """Average the predictions of every ``.model`` file found under ``dirpath``.

    NOTE: a later cell of this notebook defines ``predict`` again; when the
    cells are run in order, that later definition is the one that gets called.

    The directory tree is walked recursively, each ``.model`` file is loaded
    with ``tf.keras.models.load_model`` and asked for its probabilities on
    ``X_test``.  A running counter is printed as each model is processed.

    Parameters
    ----------
    X_test : array-like
        Input passed unchanged to each model's ``predict``.
    dirpath : str
        Root directory to search for ``.model`` files.

    Returns
    -------
    y_pred : numpy.ndarray
        Class labels derived from the averaged probabilities: the arg-max
        over the last axis when there is more than one output column,
        otherwise a 0/1 threshold at 0.5.
    y_probs : numpy.ndarray
        Element-wise mean of the models' predicted probabilities.

    Raises
    ------
    FileNotFoundError
        If no ``.model`` file exists under ``dirpath``.
    """
    load_model = tf.keras.models.load_model
    prob_sum = None
    nets = 0

    for root, _dirnames, filenames in os.walk(dirpath):
        for filename in sorted(f for f in filenames if f.endswith(".model")):
            nets += 1
            print(nets)
            model = load_model(os.path.join(root, filename))
            # One forward pass per model; labels are derived from the
            # probabilities afterwards (Sequential.predict_classes is gone
            # from current Keras releases).
            probs = np.asarray(model.predict(X_test))
            prob_sum = probs if prob_sum is None else prob_sum + probs

    if nets == 0:
        raise FileNotFoundError("No '.model' files found under {!r}".format(dirpath))

    y_probs = prob_sum / nets
    if y_probs.ndim > 1 and y_probs.shape[-1] > 1:
        y_pred = np.argmax(y_probs, axis=-1)
    else:
        y_pred = (y_probs > 0.5).astype("int32")

    return y_pred, y_probs

In [9]:
def predict(X_test, dirpath):
    """Average the predictions of every ``.model`` file found under ``dirpath``.

    The directory tree is walked recursively to collect the paths of all
    ``.model`` files.  Each collected model is then loaded with
    ``tf.keras.models.load_model`` and asked for its probabilities on
    ``X_test`` while a ``trange`` progress bar tracks the loop.

    Parameters
    ----------
    X_test : array-like
        Input passed unchanged to each model's ``predict``.
    dirpath : str
        Root directory to search for ``.model`` files.

    Returns
    -------
    y_pred : numpy.ndarray
        Class labels derived from the averaged probabilities: the arg-max
        over the last axis when there is more than one output column,
        otherwise a 0/1 threshold at 0.5.
    y_probs : numpy.ndarray
        Element-wise mean of the models' predicted probabilities.

    Raises
    ------
    FileNotFoundError
        If no ``.model`` file exists under ``dirpath``.
    """
    # Collect the paths up front so the progress bar knows the total; sorting
    # makes the processing order independent of the file system listing.
    list_models = sorted(
        os.path.join(root, filename)
        for root, _dirnames, filenames in os.walk(dirpath)
        for filename in filenames
        if filename.endswith(".model")
    )
    if not list_models:
        raise FileNotFoundError("No '.model' files found under {!r}".format(dirpath))

    load_model = tf.keras.models.load_model
    prob_sum = None

    for i in trange(len(list_models)):
        # Load the i-th collected path.  Rebuilding the path from the walk's
        # leftover loop variables would reload the last file found every time.
        model = load_model(list_models[i])
        # One forward pass per model; labels are derived from the
        # probabilities afterwards (Sequential.predict_classes is gone from
        # current Keras releases).
        probs = np.asarray(model.predict(X_test))
        prob_sum = probs if prob_sum is None else prob_sum + probs

    y_probs = prob_sum / len(list_models)
    if y_probs.ndim > 1 and y_probs.shape[-1] > 1:
        y_pred = np.argmax(y_probs, axis=-1)
    else:
        y_pred = (y_probs > 0.5).astype("int32")

    return y_pred, y_probs

### Preprocessing

In [10]:
# Root directory that is searched recursively for the ".freq" input files.
freq_dirpath = "freq/" # Tissue ID
# Root directory that is searched recursively for the saved ".model" files.
model_dirpath = "Model/"

In [11]:
# Read every ".freq" file under freq_dirpath; returns the preprocessed feature
# table together with the list of file names that were read.
X_test, name_files_test = process_frequencies(freq_dirpath)

Reading files...
Preprocessing dataframe...
9
Preprocessing Done!


### Prediction

In [12]:
# Run the saved models on the feature matrix; y_probs is the summed model
# output divided by the number of model runs.
y_pred, y_probs = predict(X_test.values, model_dirpath)

100%|██████████| 50/50 [04:19<00:00,  9.69s/it]


## Save Predictions and probabilities

In [13]:
# One row per input file, one probability column per class label, written to CSV.
class_labels = ["Skin", "Oral","Vagina"]
df_probs = pd.DataFrame(y_probs, index=name_files_test, columns=class_labels)
df_probs.to_csv("predictions.csv")