In [13]:
from sklearn import preprocessing

import pandas as pd
import numpy as np
import os

In [14]:
# Root folder of the dataset; the loaders below concatenate a sub-folder name
# directly onto it, so the trailing slash is required.
data_path = "../dataset/data/"
# Histone-modification names. The second entry used to read "H3K26me3", which
# is not a histone mark (H3 residue 26 is an arginine); the intended mark is
# H3K36me3.
# NOTE(review): presumably these map, in order, to the HM1..HM5 columns - confirm.
HM_names = ["H3K27me3", "H3K36me3", "H3K4me1", "H3K4me3", "H3K9me3"]
# Column names applied to the header-less CSV files when they are read.
columns_names = ["geneID", "binID", "HM1", "HM2", "HM3", "HM4", "HM5", "label"]

In [15]:
def load_datasets(folder_name):
    """Read the three classification CSV files of one dataset folder.

    Args:
        folder_name: name of the sub-folder of ``data_path`` to read from.

    Returns:
        Tuple ``(train_df, valid_df, test_df)`` of DataFrames whose columns
        are named after ``columns_names`` (the files are read without a header row).
    """

    def read_split(csv_name):
        # Every split lives under <data_path><folder_name>/classification/.
        return pd.read_csv(
            f"{data_path}{folder_name}/classification/{csv_name}",
            header=None,
            names=columns_names,
        )

    # The files are read in the order test, train, valid.
    test_df = read_split("test.csv")
    train_df = read_split("train.csv")
    valid_df = read_split("valid.csv")
    return (train_df, valid_df, test_df)

In [16]:
def create_dataset(df, maxElements):
    """Build one normalized histone-mark matrix and one label per gene.

    Gene ids are discovered by sampling the frame every 100 rows (rows 0, 100,
    200, ...). Each distinct id found this way contributes exactly one entry,
    in order of first appearance.

    Args:
        df: DataFrame with at least the columns "geneID", "HM1".."HM5" and "label".
        maxElements: if truthy, only the first ``maxElements`` sampled rows are
            considered; ``None`` (or 0) means no limit.

    Returns:
        Tuple ``(data, labels)``. ``data`` is a list with, per gene, the result
        of ``preprocessing.normalize`` applied to the 5 rows of HM values (at
        most 100 values per row). ``labels`` is a list with the "label" value
        of each gene's first row.
    """
    bins_per_gene = 100
    hm_columns = ["HM1", "HM2", "HM3", "HM4", "HM5"]

    data = []
    labels = []
    seen_genes = set()

    sample_slots = range(len(df) // bins_per_gene)
    if maxElements:
        sample_slots = sample_slots[:maxElements]

    gene_ids = df["geneID"].to_numpy()
    # Group the rows once, so that fetching a gene's rows costs only the size
    # of that gene instead of a boolean-mask scan of the whole frame per gene.
    rows_by_gene = df.groupby("geneID", sort=False)

    for slot in sample_slots:  # todo: remove this limit to consider the full dataset
        gene_id = gene_ids[slot * bins_per_gene]
        if gene_id in seen_genes:
            continue
        seen_genes.add(gene_id)

        df_gene = rows_by_gene.get_group(gene_id)

        # The label is taken once, from the first row of the gene.
        labels.append(df_gene.iloc[0]["label"])
        # Only the first 100 values are kept: some genes appear more than once
        # in the input data and would otherwise yield more than 100 bins.
        # NOTE(review): a gene with fewer than 100 rows yields a shorter matrix
        # here and only fails later, when the data is reshaped to (n, 5, 100, 1).
        gene_data = [
            df_gene[column].iloc[:bins_per_gene].tolist() for column in hm_columns
        ]
        data.append(preprocessing.normalize(gene_data))
    return (data, labels)


In [17]:
def to_numpy(data, labels):
    """Convert per-gene matrices and labels to numpy arrays.

    Args:
        data: sequence of genes, each a sequence of 5 rows of 100 values.
        labels: sequence with one label per gene.

    Returns:
        Tuple ``(numpy_data, numpy_labels)`` where ``numpy_data`` has shape
        ``(len(data), 5, 100, 1)`` and ``numpy_labels`` is ``np.array(labels)``.

    Raises:
        ValueError: if the total number of values is not ``len(data) * 500``.
    """
    # Start from an empty float64 chunk: it fixes the result dtype and lets
    # np.concatenate handle an empty ``data`` without a special case.
    chunks = [np.array([])]
    for gene in data:
        for bins in gene:
            chunks.append(np.ravel(bins))
    # One concatenation instead of repeated np.append calls, each of which
    # copies everything accumulated so far.
    numpy_data = np.concatenate(chunks).reshape(len(data), 5, 100, 1)
    numpy_labels = np.array(labels)
    return numpy_data, numpy_labels

In [18]:
def normalize(train_data, valid_data, test_data):
    """Standardize the three sets with the training set's mean and standard deviation.

    The mean and the standard deviation are single scalars computed over all
    values of ``train_data``; the same two scalars are applied to the
    validation and test sets. Calling this once per cell type therefore gives
    a per-cell-type (not global) normalization.

    The inputs are not modified: new arrays are returned. Integer arrays are
    accepted and produce floating-point results.

    Args:
        train_data: array used to compute the statistics.
        valid_data: array normalized with the training statistics.
        test_data: array normalized with the training statistics.

    Returns:
        Tuple ``(train_data, valid_data, test_data)`` of normalized arrays.
    """
    mean = np.mean(train_data)
    train_centered = train_data - mean
    valid_centered = valid_data - mean
    test_centered = test_data - mean

    # The standard deviation is taken on the centered training data.
    std = np.std(train_centered)

    return (train_centered / std, valid_centered / std, test_centered / std)

In [29]:
def load(maxFolders=None, maxElements=None):
    """Load every dataset folder under ``data_path`` and stack the results.

    A folder containing exactly one entry is processed from its CSV files with
    ``load_folder``; any other folder is read with ``load_clean_data``.

    Args:
        maxFolders: if truthy, only the first ``maxFolders`` folders (in sorted
            name order) are loaded.
        maxElements: forwarded to ``load_folder``.

    Returns:
        ``((train_data, train_labels), (valid_data, valid_labels),
        (test_data, test_labels))`` where each data array has shape
        ``(n, 5, 100, 1)`` and each labels array has shape ``(n,)``.
    """
    # os.listdir returns entries in arbitrary order: sort them so the result
    # (and the subset picked by maxFolders) is reproducible, and ignore stray
    # files, which are not dataset folders.
    folders = sorted(
        entry
        for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry))
    )
    if maxFolders:
        folders = folders[:maxFolders]

    splits = ("train", "valid", "test")
    # Flattened chunks per split. The empty float64 seed fixes the result dtype
    # and keeps the final concatenation valid when no folder is loaded.
    data_chunks = {split: [np.array([])] for split in splits}
    label_chunks = {split: [np.array([])] for split in splits}

    for folder in folders:
        folder_files = os.listdir(os.path.join(data_path, folder))
        if len(folder_files) == 1:
            loaded = load_folder(folder, maxElements)
        else:
            loaded = load_clean_data(folder)
            print(f"{folder} loaded")

        for split, (split_data, split_labels) in zip(splits, loaded):
            data_chunks[split].append(np.ravel(split_data))
            label_chunks[split].append(np.ravel(split_labels))

    # Concatenate once per split instead of calling np.append per folder, which
    # copies everything accumulated so far on each call.
    stacked = []
    for split in splits:
        split_labels = np.concatenate(label_chunks[split])
        split_data = np.concatenate(data_chunks[split]).reshape(
            len(split_labels), 5, 100, 1
        )
        stacked.append((split_data, split_labels))

    return tuple(stacked)

In [25]:
def load_folder(folder_name, maxElements=None):
    """Load the data contained in one folder.

    Reads the CSV files, builds the per-gene datasets, converts them to numpy
    arrays, normalizes them and saves the result to file.

    Args:
        folder_name: name of the dataset folder to process.
        maxElements: forwarded to ``create_dataset`` for each split.

    Returns:
        ``((train_data, train_labels), (valid_data, valid_labels),
        (test_data, test_labels))``.
    """
    print(folder_name)

    print("\tloading")
    frames = load_datasets(folder_name)  # (train, valid, test)

    print("\tcreating datasets")
    per_split = [create_dataset(frame, maxElements) for frame in frames]

    print("\tto numpy")
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = [
        to_numpy(values, targets) for values, targets in per_split
    ]

    print("\tnormalize")
    train_x, valid_x, test_x = normalize(train_x, valid_x, test_x)

    print("\tsaving data")
    save_data(folder_name, train_x, train_y, valid_x, valid_y, test_x, test_y)

    return ((train_x, train_y), (valid_x, valid_y), (test_x, test_y))


In [26]:
def save_data(folder_name, train_data, train_labels, valid_data, valid_labels, test_data, test_labels):
    """Save the data as .npy files in order to speed up future loading time.

    Each data array is reshaped to ``(n, 5, 100)`` and written to
    ``<split>_data.npy``; each labels array is written unchanged to
    ``<split>_labels.npy``. All files go into ``<data_path><folder_name>/``.
    """
    target_dir = f"{data_path}{folder_name}"

    # All three reshapes happen before anything is written.
    reshaped = [
        (split, values.reshape(len(values), 5, 100))
        for split, values in (
            ("train", train_data),
            ("valid", valid_data),
            ("test", test_data),
        )
    ]

    # Data files first, then label files.
    for split, values in reshaped:
        np.save(f"{target_dir}/{split}_data.npy", values)
    for split, targets in (
        ("train", train_labels),
        ("valid", valid_labels),
        ("test", test_labels),
    ):
        np.save(f"{target_dir}/{split}_labels.npy", targets)
    

In [27]:
def load_clean_data(folder_name):
    """Load into memory the already cleaned data of one folder.

    Reads ``<split>_data.npy`` and ``<split>_labels.npy`` for the train, valid
    and test splits from ``<data_path><folder_name>/``.

    Returns:
        ``((train_data, train_labels), (valid_data, valid_labels),
        (test_data, test_labels))``.
    """
    prefix = f"{data_path}{folder_name}/"
    splits = ("train", "valid", "test")

    # All data files are read first, then all label files.
    features = [np.load(prefix + f"{split}_data.npy") for split in splits]
    targets = [np.load(prefix + f"{split}_labels.npy") for split in splits]

    # Pair each split's data with its labels.
    return tuple(zip(features, targets))

In [28]:
# Build the train / validation / test arrays and labels with load()'s defaults
# (maxFolders=None, maxElements=None).
# NOTE(review): this cell's execution count (28) is lower than that of the cell
# defining load() (29), and the recorded output contains "normalized" /
# "re-saved" lines that the visible load() does not print - the output looks
# like it came from an earlier definition; re-run on a fresh kernel to confirm.
(train_data, train_labels), (valid_data, valid_labels), (test_data, test_labels) = load()

E003 loaded
E003 normalized
E003 re-saved
E004 loaded
E004 normalized
E004 re-saved
E005 loaded
E005 normalized
E005 re-saved
E006 loaded
E006 normalized
E006 re-saved
E007 loaded
E007 normalized
E007 re-saved
E011 loaded
E011 normalized
E011 re-saved
E012 loaded
E012 normalized
E012 re-saved
E013 loaded
E013 normalized
E013 re-saved
E016 loaded
E016 normalized
E016 re-saved
E024 loaded
E024 normalized
E024 re-saved
E027 loaded
E027 normalized
E027 re-saved
E028 loaded
E028 normalized
E028 re-saved
E037 loaded
E037 normalized
E037 re-saved
E038 loaded
E038 normalized
E038 re-saved
E047 loaded
E047 normalized
E047 re-saved
E050 loaded
E050 normalized
E050 re-saved
E053 loaded
E053 normalized
E053 re-saved
E054 loaded
E054 normalized
E054 re-saved
E055 loaded
E055 normalized
E055 re-saved
E056 loaded
E056 normalized
E056 re-saved
E057 loaded
E057 normalized
E057 re-saved
E058 loaded
E058 normalized
E058 re-saved
E059 loaded
E059 normalized
E059 re-saved
E061 loaded
E061 normalized
E061 r