# Processes the data in the Raw folder and stores it in the Data folder

## Libraries

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import time
import csv

---
## Code

### `process_raw()`

Creates both a train and a test dataset in `SmartS_data` or `DropSeq_data` (depending on `section`), each of shape (*n_features*, *n_samples*)

In [2]:
def process_raw(section="SmartS", test_size=0.20, seed=42):
    """Split the raw HCC1806 and MCF7 tables into train and test files.

    For each of the two cell lines the raw table is read from
    ``{section}_raw/`` (``DropSeq_raw_ignore/`` when ``section`` is
    ``"DropSeq"``), its columns are split into a train and a test subset with
    ``train_test_split``, and both subsets are written to ``{section}_data/``
    in the same orientation as the raw table.

    Args:
        section: ``"DropSeq"`` selects the DropSeq folder/file naming; any
            other value uses the ``_SmartS`` naming.
        test_size: Forwarded to ``train_test_split``.
        seed: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        None. Four space-delimited text files are written.
    """
    import os  # local import: only this function needs it

    if section == "DropSeq":
        folder_suffix, name_suffix = "_ignore", ""
    else:
        folder_suffix, name_suffix = "", "_SmartS"

    cell_lines = ("HCC1806", "MCF7")

    # Read every raw table before writing anything, so a missing input file
    # aborts the run without leaving partial output behind.
    # The delimiter is a regular expression (escaped space) and therefore has
    # to be a raw string; the plain "\ " literal is an invalid escape sequence.
    raw_tables = {
        cell_line: pd.read_csv(
            f"{section}_raw{folder_suffix}/{cell_line}{name_suffix}_Filtered_Normalised_3000_Data_train.txt",
            delimiter=r"\ ",
            engine="python",
            index_col=0,
        )
        for cell_line in cell_lines
    }

    # train_test_split partitions rows, so the table is transposed to split
    # its columns, and transposed back before being written.
    splits = {
        cell_line: train_test_split(table.T, test_size=test_size, random_state=seed)
        for cell_line, table in raw_tables.items()
    }

    out_dir = f"{section}_data"
    os.makedirs(out_dir, exist_ok=True)  # create the output folder if missing

    for cell_line in cell_lines:
        train_part, test_part = splits[cell_line]
        for part_name, part in (("train", train_part), ("test", test_part)):
            part.T.to_csv(
                f"{out_dir}/{cell_line}_{section}_Filtered_Normalised_3000_Data_{part_name}.txt",
                sep=" ",
                quoting=csv.QUOTE_NONE,
            )

### `data_split()`

Returns a dictionary holding the Train and Test `pandas.DataFrame`s along with `max dim` and their true labels

In [3]:
def data_split(file='MCF7',section="SmartS"):
    """Load the train and test tables of one cell line from ``{section}_data``.

    Both files are space-delimited with the first column used as index. The
    values are cast to float32, every column name is shortened to
    ``<class>_<cell>`` and rows containing missing values are dropped.

    Args:
        file: File-name prefix, e.g. ``'MCF7'``.
        section: ``"SmartS"`` or ``"DropSeq"``; selects the folder and which
            ``_``-separated token of a column name holds the class.

    Returns:
        dict with the keys ``"train"`` and ``"test"`` (DataFrames),
        ``"max dim"`` (the smaller of the two dimensions of the train frame)
        and ``"y train"`` / ``"y test"`` (lists holding 1 for columns whose
        class token is ``'Norm'`` and 0 otherwise).
    """
    def short_label(column, section=section):
        # Column names are "_"-separated; the class token sits at a
        # section-dependent position, the cell token is second to last.
        tokens = column.split("_")
        class_token = tokens[{"SmartS":-3, "DropSeq":-1}[section]]
        cell_token = tokens[-2]
        # Keep at most the first four characters of the class token.
        return class_token[:4]+"_"+cell_token

    def load(path):
        # Read one table, clean it and derive the 0/1 labels from its columns.
        frame = pd.read_csv(path,delimiter=" ",index_col=0).astype('float32')
        frame = frame.rename(columns=short_label).dropna(axis='rows')
        labels = [int(column.split("_")[0]=='Norm') for column in frame.columns]
        return frame, labels

    train_frame, train_labels = load(
        f"{section}_data/{file}_{section}_Filtered_Normalised_3000_Data_train.txt"
    )
    test_frame, test_labels = load(
        f"{section}_data/{file}_{section}_Filtered_Normalised_3000_Data_test.txt"
    )

    return {
        "train":train_frame,
        "test":test_frame,
        "max dim":min(train_frame.shape),
        "y train":train_labels,
        "y test":test_labels,
    }

### ```data_loader()```

In [4]:
def data_loader(file='MCF7',section="SmartS"):
    """Load one raw table and derive a 0/1 label for each of its columns.

    The table is read from ``{section}_raw/`` (``DropSeq_raw_ignore/`` when
    ``section`` is ``"DropSeq"``) as a space-delimited file with the first
    column used as index. Values are cast to float32, every column name is
    shortened to ``<class>_<cell>`` and rows containing missing values are
    dropped.

    Args:
        file: File-name prefix, e.g. ``'MCF7'``.
        section: ``"SmartS"`` or ``"DropSeq"``; selects the folder, the file
            naming and which ``_``-separated token of a column name holds
            the class.

    Returns:
        Tuple ``(X, y)``: the cleaned DataFrame and a list holding 1 for
        columns whose class token is ``'Norm'`` and 0 otherwise.
    """
    def short_label(column, section=section):
        # Column names are "_"-separated; the class token sits at a
        # section-dependent position, the cell token is second to last.
        tokens = column.split("_")
        class_token = tokens[{"SmartS":-3, "DropSeq":-1}[section]]
        cell_token = tokens[-2]
        # Keep at most the first four characters of the class token.
        return class_token[:4]+"_"+cell_token

    # DropSeq raw files live in an "_ignore" folder and carry no "_SmartS" tag.
    ignore, name = ("_ignore", "") if section == "DropSeq" else ("", "_SmartS")

    raw_path = f"{section}_raw{ignore}/{file}{name}_Filtered_Normalised_3000_Data_train.txt"
    X = (
        pd.read_csv(raw_path,delimiter=" ",index_col=0)
        .astype('float32')
        .rename(columns=short_label)
        .dropna(axis='rows')
    )
    y = [int(column.split("_")[0]=='Norm') for column in X.columns]

    return X, y