In [1]:
import os
import random

import flowio
import numpy as np
import pandas as pd

In [3]:
# Folder that holds the raw input files.
file_path = 'OriginalDATA/FlowRepository_FR-FCM-Z24N_files'
# Names of all entries in that folder, in sorted order.
files = sorted(os.listdir(file_path))

In [5]:
# File names containing "KTR" make up the control group.
controls = list(filter(lambda name: "KTR" in name, files))

# Split by stimulation state. The leading space in " stim" keeps names that
# contain "unstim" out of the stimulated list.
unstim_control = list(filter(lambda name: "unstim" in name, controls))
stim_control = list(filter(lambda name: " stim" in name, controls))

In [7]:
# File names containing "RA" make up the patient group.
ras = list(filter(lambda name: "RA" in name, files))

# Split by stimulation state. The leading space in " stim" keeps names that
# contain "unstim" out of the stimulated list.
unstim_ras = list(filter(lambda name: "unstim" in name, ras))
stim_ras = list(filter(lambda name: " stim" in name, ras))

### TODO
- get a list of the channels that I want with their channel number
- select these channels from the pandas DataFrame
- name the channels appropriately

### Note
- Cannot find functional marker p-Erk

# RA-data description

Mass cytometry (CyTOF)
1. panel of 13 phenotyping and 10 functional markers
2. signaling in unstimulated and TNF-stimulated peripheral blood mononuclear cells
3. from 20 newly diagnosed, untreated RA patients
4. 20 healthy donors.

1. unstim = not stimulated with tumor necrosis factor (TNF); TNF inhibitors effectively repress inflammatory activity in RA
2. stim = stimulated with TNF
3. ungated = manual removal of events representing 

## The RA paper:
The objective of this study was to identify markers in immune cell populations that distinguish RA patients from healthy donors with an emphasis on TNF signaling

We employed mass cytometry (CyTOF) with a panel of 13 phenotyping and 10 functional markers to explore signaling in unstimulated and TNF-stimulated peripheral blood mononuclear cells from 20 newly diagnosed, untreated RA patients and 20 healthy donors.

1. intracellular functional markers
2. surface phenotypic markers 

problem:
- want to collect more cells for each patient
    - don't know what file belongs to each patient
    - don't know which of the data is preprocessed or not
- want to use more markers <b style="color:green"> DONE </b> 
    - don't know how to get the other markers 
    - don't know which markers are phenotypical and which are functional
    

In [143]:
def get_events(file, markers, nr_events):
    """Read one FCS file and return a random subsample of events for chosen markers.

    Parameters
    ----------
    file : str
        Path handed to ``flowio.flowdata.FlowData``.
    markers : iterable of str
        Marker names. A channel is selected when a marker name occurs as a
        substring of the channel's ``PnS`` label, so a short name such as
        ``'CD4'`` also selects a label that contains ``'CD45'``. Markers that
        match no channel are skipped silently.
    nr_events : int
        Number of events (rows) to draw without replacement.

    Returns
    -------
    pandas.DataFrame
        ``nr_events`` rows and one column per selected channel, named after the
        channel's ``PnS`` label. Columns follow the order in which channels
        were first matched (outer loop over ``markers``).

    Raises
    ------
    ValueError
        If ``nr_events`` is larger than the number of events in the file.
    """
    f = flowio.flowdata.FlowData(file)

    # Maps 0-based column position -> PnS label.
    marker_channels = dict()
    for j in markers:
        for k, channel in f.channels.items():
            # PnS is an optional FCS keyword, so a channel may not carry it.
            label = channel.get('PnS')
            if label is not None and j in label:
                # Channel numbers are 1-based while the columns of the
                # reshaped event matrix are 0-based, hence the shift.
                marker_channels[int(k) - 1] = label

    # The event values are stored flat; give them one row per event.
    npy_data = np.reshape(f.events, (-1, f.channel_count))
    n_available = npy_data.shape[0]
    if nr_events > n_available:
        raise ValueError(
            "Requested {} events but {} only contains {}".format(
                nr_events, file, n_available
            )
        )

    # Sampling row indices draws the same positions as sampling the rows
    # themselves, without first copying every row into a Python list.
    picked = random.sample(range(n_available), nr_events)
    df = pd.DataFrame(npy_data[picked])

    selected = df[list(marker_channels)]
    return selected.rename(columns=marker_channels)

In [14]:
# choose files
# choose nr of cells per patient
# concatenate all files into one df
# save the file
# write about the data

In [19]:
def get_patients(group_files, nr_cells, file_path, markers, group_name, id_key):
    """Sample events from every file of one group and stack them into one frame.

    Parameters
    ----------
    group_files : iterable of str
        File names, relative to ``file_path``.
    nr_cells : int
        Number of events to sample from each file (forwarded to ``get_events``).
    file_path : str
        Directory that contains the files.
    markers : list of str
        Marker names forwarded to ``get_events``.
    group_name : str
        Value written to the ``group`` column of every row.
    id_key : str
        Currently unused: the ``id`` column holds the position of the file in
        ``group_files``, not an identifier parsed from the file name. Kept so
        existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The marker columns returned by ``get_events`` plus ``id`` and ``group``.
        Row labels are not reset, so they repeat from file to file.

    Raises
    ------
    ValueError
        If ``group_files`` is empty.
    """
    frames = []
    for i, file in enumerate(group_files):
        df = get_events(os.path.join(file_path, file), markers, nr_cells)
        # assign() returns a new frame, so the sampled frame is not modified.
        frames.append(df.assign(id=i))

    if not frames:
        raise ValueError(
            "No files given for group {!r}; check the file-name filter".format(group_name)
        )

    # A single concat replaces the per-file DataFrame.append, which was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    all_df = pd.concat(frames)
    all_df["group"] = group_name
    return all_df

phenotyping for clustering : cell type

functional for within clustering : what happens IN the cells

In [13]:
# Marker names as listed in the paper.
phenotyping = [
    "CD20", "CD3", "CD4", "CD8a", "CD45RA",
    "CD56", "CD16", "CD14", "CD61", "CD11c",
    "CD123", "HLA-DR", "CD45",
]

functional = [
    "Caspase3", "CD86", "p-p38", "p-Erk", "p-Akt",
    "p-cJun", "p-NFkB", "IkBa", "CD120a", "CD120b",
]

In [21]:
def scale_df(df, cofactor=5):
    """Return a copy of ``df`` with every marker column arcsinh-transformed.

    Each column other than ``id`` and ``group`` is replaced by
    ``arcsinh(value / cofactor)``; ``id`` and ``group`` are carried over
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric marker columns plus ``id`` and ``group`` columns.
    cofactor : float, default 5
        Divisor applied before ``arcsinh``. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Marker columns in sorted order (a consequence of ``Index.difference``),
        followed by ``id`` and ``group``.

    Raises
    ------
    ValueError
        If ``cofactor`` is not strictly positive.
    """
    if cofactor <= 0:
        raise ValueError("cofactor must be positive, got {!r}".format(cofactor))

    marker_cols = df.columns.difference(["id", "group"])
    # The division produces a new frame, so the assignments below never touch `df`.
    df_scaled = np.arcsinh(df[marker_cols] / cofactor)
    df_scaled["id"] = df["id"]
    df_scaled["group"] = df["group"]
    return df_scaled

In [23]:
# Arcsinh-scale the marker columns of `df` and rebind the same name.
# NOTE(review): no earlier cell in this notebook assigns `df`; it presumably
# comes from a removed or out-of-order cell, so confirm before relying on
# Restart & Run All. Re-running this cell scales already scaled values again.
df = scale_df(df)

In [29]:
# Write the scaled frame to CSV, leaving out the row index.
df.to_csv("ModifiedDATA/scaled_ra2.csv", index=False)

In [31]:
# Show the distinct group labels present in `df`.
df.group.unique()

array(['control'], dtype=object)

TODO
- prepare the 40 patients with the same number of events, and find the largest number of events that all of them can provide. (unstim)
    - phenotyping
    - functional
    - phenotyping and functional
- prepare the 40 patients with the same number of events, and find the largest number of events that all of them can provide. (stim for later)
    - phenotyping
    - functional
    - phenotyping and functional
- scale the values

In [54]:
# find the minimum nr of cells in all files

In [59]:
# Smallest event count over all control files; stays at infinity when the
# list is empty.
minimum_control = float("inf")
for file in controls:
    f = flowio.flowdata.FlowData(f"{file_path}/{file}")
    minimum_control = min(minimum_control, f.event_count)

In [60]:
# Smallest event count over all RA files; stays at infinity when the list
# is empty.
minimum_ras = float("inf")
for file in ras:
    f = flowio.flowdata.FlowData(f"{file_path}/{file}")
    minimum_ras = min(minimum_ras, f.event_count)

In [155]:
# Display the current per-file sample size.
# NOTE(review): in document order `nr_cells` is only assigned in later cells,
# so on a fresh kernel this cell fails unless those cells run first.
nr_cells

159736

In [None]:
#nr_cells = 20000
# Largest per-file sample size that every control and RA file can supply.
nr_cells = min(minimum_control, minimum_ras) # this gives 3194720 events per group, 6389440 in total for certain datatype.

# max = 159736
# n = 20 000, this might be too little.
# n = 40 000
# n = 50 000
# n = 75 000, this is the maximum we can get.

# `n` was never assigned anywhere in this notebook, so this cell raised a
# NameError. NOTE(review): binding it to the per-file sample size is an
# assumption based on the candidate values listed above; confirm the
# intended meaning.
n = nr_cells

# Sizes of the two parts produced by each split (80/20 at every level).
data_shape = {"ideal_ddloss" : (n, n),
              "train + dd_test": (n*0.8, n*0.2), 
              "dimred_train + rest": (n*0.2*0.8, n*0.8**2), 
              "model testing" : (n*0.8**2, n*0.8*0.2), 
              "model validation" : (n*0.8**3, n*0.8**2*0.2) # if k-fold is used, then this is not constant
             }


### Notes
- possible issues with such large data set?
    - need large models, takes longer to train
    - takes time with dimensionality reduction
- Advantage:
    - can train better models, and avoid variance in the model
- comparison:
    - MNIST is 21 MB sized file
    - Extended MNIST is 535 MB
    - Kuzushiji MNIST is 31 MB
    - BINARIZED MNIST is 104 MB
- To try, nr_cells per patient
    - 20 000
    - 50 000
    - 100 000
    - 150 000

In [157]:
# Number of events to sample from each file; passed to get_patients below.
nr_cells = 40000

In [None]:
# Build one frame per (group, marker set) from the unstimulated files.
# Every call reads the files again and draws its own random sample, so the
# three marker sets do not contain the same cells.

# Phenotyping markers only.
df_cp = get_patients(unstim_control, nr_cells, file_path, phenotyping, "control", "KTR") 
df_rp = get_patients(unstim_ras, nr_cells, file_path, phenotyping, "diseased", "RA")

# Functional markers only.
df_cf = get_patients(unstim_control, nr_cells, file_path, functional, "control", "KTR")
df_rf = get_patients(unstim_ras, nr_cells, file_path, functional, "diseased", "RA")

# Phenotyping and functional markers together.
df_cpf = get_patients(unstim_control, nr_cells, file_path, phenotyping + functional, "control", "KTR")
# Was assigned to `df_crf`, but the cell that combines the groups reads
# `df_rpf`, which matches the naming of the other five frames.
df_rpf = get_patients(unstim_ras, nr_cells, file_path, phenotyping + functional, "diseased", "RA")


In [None]:
# Stack the control and diseased frames for each marker set.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; as
# before, row labels are kept as they are and not renumbered.
df_p = pd.concat([df_cp, df_rp])
df_f = pd.concat([df_cf, df_rf])
df_pf = pd.concat([df_cpf, df_rpf])

In [None]:
# Arcsinh-scale all three combined frames, rebinding the same names.
# Re-running this cell scales the already scaled values a second time.
df_p, df_f, df_pf = (scale_df(frame) for frame in (df_p, df_f, df_pf))

In [None]:
# Create the output folder if it is missing so the writes below cannot fail on it.
os.makedirs("ModifiedDATA", exist_ok=True)

# The sample size in each file name follows `nr_cells` instead of a hard-coded
# 40000, so the name always matches the data that was sampled.
df_p.to_csv(f"ModifiedDATA/scaled_ra_pheno_{nr_cells}.csv", index=False)
df_f.to_csv(f"ModifiedDATA/scaled_ra_func_{nr_cells}.csv", index=False)
df_pf.to_csv(f"ModifiedDATA/scaled_ra_phenofunc_{nr_cells}.csv", index=False)