# Preprocess the data
This Jupyter notebook preprocesses the data and integrates all of it into a single DataFrame.

In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session. Note that this also
# hides deprecation warnings emitted by the libraries used in this notebook.
warnings.filterwarnings("ignore")

## Load data
* ```cell_label.txt```
* ```data_total.txt```
* ```data.txt```

In [2]:
# Read the three tab-separated input files into DataFrames.
# read_csv with sep='\t' is the explicit spelling of read_table.
cell_label = pd.read_csv('../data/cell_label.txt', sep='\t')
data_all = pd.read_csv('../data/data_total.txt', sep='\t')
data = pd.read_csv('../data/data.txt', sep='\t')

## See the data format
There are three txt files.
* ```cell_label.txt```
* ```data_total.txt```
* ```data.txt```

Each of the three files is loaded into its own DataFrame.

### The cell type is one-hot encoded

In [3]:
# Preview the first five rows of the label table (head() defaults to n=5).
cell_label.head()

Unnamed: 0,Granulocytes,basophils,Plasmacytoid dendritic cells,Non-Classical Monocytes,myeloid dendritic cells,classical monocytes,Natural Killer cells,Effector T killer cells,naive T killer cells,Memory T killer cells,activated T killer cells,Effector T helper cells,Naive T helper cells,Memory T helper cells,activated T helper cells,memory B cells CD19,naive B cells CD19,plasma B cells CD19
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Preview the first five rows of the full expression table (head() defaults to n=5).
data_all.head()

Unnamed: 0,CD235ab,MCEMP1,ICOS,CD96,LCK,CXCR6,CD2,HAPTOGLOBIN,FLT3,STAT4,...,CD123,CD11c,CD14,CD16,CD27,CD45RA,CD3,CD38,HLA-DR,batch
0,0.266458,0.843588,0.0,0.566443,0.578018,0.426957,2.147042,0.608361,0.550156,1.029276,...,0.0,0.88413,0.818624,0.0,1.888936,1.527312,2.728251,0.872396,0.0,c01_IPFH
1,0.890183,1.039873,0.977795,0.76944,0.245717,0.198752,1.95385,0.0,0.0,0.904484,...,0.0,0.674371,0.116559,0.0,1.547359,0.0,2.7769,0.774546,0.0,c01_IPFH
2,1.032193,1.408455,1.35116,1.229337,0.476334,0.750055,1.939668,0.662886,0.915624,1.056118,...,0.814026,0.0,1.245918,0.0,0.190691,1.503655,2.802467,0.0,2.287302,c01_IPFH
3,0.722698,1.851187,0.671645,0.877242,0.0,0.0,0.0,0.779155,0.374503,0.806161,...,0.0,1.621537,1.052678,0.0,0.0,1.291378,1.121688,1.132275,1.631254,c01_IPFH
4,1.126662,1.80431,0.863143,0.495244,0.727941,0.0,0.0,0.755254,0.501104,0.7568,...,0.0,2.055391,1.046785,0.0,0.0,0.0,0.0,0.325935,2.194841,c01_IPFH


In [5]:
# Add an explicit "Unlabel cells" indicator: 1 minus the row-wise sum of the
# cell-type indicator columns, i.e. 1 for rows where no cell type is flagged.
# Any existing 'Unlabel cells' column is excluded from the sum so that
# re-running this cell yields the same result instead of zeroing the flag.
# Assumes each row flags at most one cell type -- TODO confirm against the data.
cell_type_flags = cell_label.drop(columns='Unlabel cells', errors='ignore')
cell_label['Unlabel cells'] = 1 - cell_type_flags.sum(axis=1)

In [6]:
# Collapse the one-hot label table into a single column holding, for each row,
# the name of the column with the largest value (first one wins on ties).
# idxmax returns the column *label*; Series.argmax returns an integer position
# in current pandas, which would store numbers instead of cell-type names.
data_all['celltype'] = cell_label.idxmax(axis=1)

In [7]:
# Check the newly added 'celltype' column on the first five rows (head() defaults to n=5).
data_all.head()

Unnamed: 0,CD235ab,MCEMP1,ICOS,CD96,LCK,CXCR6,CD2,HAPTOGLOBIN,FLT3,STAT4,...,CD11c,CD14,CD16,CD27,CD45RA,CD3,CD38,HLA-DR,batch,celltype
0,0.266458,0.843588,0.0,0.566443,0.578018,0.426957,2.147042,0.608361,0.550156,1.029276,...,0.88413,0.818624,0.0,1.888936,1.527312,2.728251,0.872396,0.0,c01_IPFH,Unlabel cells
1,0.890183,1.039873,0.977795,0.76944,0.245717,0.198752,1.95385,0.0,0.0,0.904484,...,0.674371,0.116559,0.0,1.547359,0.0,2.7769,0.774546,0.0,c01_IPFH,Unlabel cells
2,1.032193,1.408455,1.35116,1.229337,0.476334,0.750055,1.939668,0.662886,0.915624,1.056118,...,0.0,1.245918,0.0,0.190691,1.503655,2.802467,0.0,2.287302,c01_IPFH,Effector T helper cells
3,0.722698,1.851187,0.671645,0.877242,0.0,0.0,0.0,0.779155,0.374503,0.806161,...,1.621537,1.052678,0.0,0.0,1.291378,1.121688,1.132275,1.631254,c01_IPFH,Unlabel cells
4,1.126662,1.80431,0.863143,0.495244,0.727941,0.0,0.0,0.755254,0.501104,0.7568,...,2.055391,1.046785,0.0,0.0,0.0,0.0,0.325935,2.194841,c01_IPFH,Unlabel cells


In [8]:
# Derive the batch type as the text after the last underscore of 'batch'
# (e.g. 'c01_IPFH' -> 'IPFH'). This is computed for every row: the previous
# version only processed data_all[0:2], leaving all other rows as NaN in the
# saved output. The vectorised .str accessor also passes missing values through.
data_all['Batch type'] = data_all['batch'].str.split('_').str[-1]

In [9]:
# Show the first two rows to inspect the new 'Batch type' column.
data_all.iloc[:2]

Unnamed: 0,CD235ab,MCEMP1,ICOS,CD96,LCK,CXCR6,CD2,HAPTOGLOBIN,FLT3,STAT4,...,CD14,CD16,CD27,CD45RA,CD3,CD38,HLA-DR,batch,celltype,Batch type
0,0.266458,0.843588,0.0,0.566443,0.578018,0.426957,2.147042,0.608361,0.550156,1.029276,...,0.818624,0.0,1.888936,1.527312,2.728251,0.872396,0.0,c01_IPFH,Unlabel cells,IPFH
1,0.890183,1.039873,0.977795,0.76944,0.245717,0.198752,1.95385,0.0,0.0,0.904484,...,0.116559,0.0,1.547359,0.0,2.7769,0.774546,0.0,c01_IPFH,Unlabel cells,IPFH


In [10]:
# Write the combined table to CSV without the row index.
# NOTE(review): 'processd' looks like a typo for 'processed'; the file name is
# kept unchanged because other code may read this exact path -- confirm before renaming.
data_all.to_csv(path_or_buf='../processd_data.csv', index=False)