# Data Processing

# Setup

In [17]:
import random,copy,math,time,os,csv,sys
import scipy.io as sio # for loading .mat files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
DATA_DIRECTORY = './../../experimental_data/raw_data/'

## Load raw data

In [19]:
data_raw = []
for filename in os.listdir(DATA_DIRECTORY):
        if filename.endswith(".mat"):
            experiment = filename[:-4]
            print(experiment)
            data_dict = sio.loadmat(DATA_DIRECTORY +filename)
            if 't' in data_dict:
                data_dict["experiment"] = experiment
                if experiment == 'NoGuidedRNA':
                    data_dict["experiment"] = 'Control'
                if '-' in experiment:
                    defect = experiment.split('-')[0]
                    if defect in 'NonRepeatedSequence':
                        data_dict['defect'] = 'NRS'
                    else:
                        data_dict['defect'] = experiment.split('-')[0]
                    data_dict['nuclease'] = experiment.split('-')[1]

    
                if 'AllCells' in data_dict:
                    data_dict['AllCellsBF'] = data_dict['AllCells']
                if 'AllGFPCells' in data_dict:
                    data_dict['AllCellsGFP'] = data_dict['AllGFPCells']
                data_raw.append(data_dict)
                print('  success!')
            else:
                print('  missing data!')


NR-SpCas9
  success!
CTG-Cpf1
  success!
CGG-Cpf1
  success!
NoGuideRNA
  success!
CTG-SpCas9-1
  success!
GAA - Cpf1
  success!
CTG-SpCas9-2
  success!
NR-Cpf1-1
  success!
CGG-SpCas9
  success!
NR-Cpf1-3
  missing data!
NR-Cpf1-2
  success!
GAA-SpCas9-1
  success!
GAA-SpCas9-2
  success!


In [20]:
def make_single_cell_dataframe(dct):
    '''
    convert the dictionary of raw data to a data frame
    '''
    data = pd.DataFrame() # dataframe for single-cell data
    
    
    n_wells = len(dct['AllCellsBF'][0])
    n_points = len(dct['t'][0])
    data['well'] = np.concatenate([np.ones(n_points)*j for j in range(n_wells)])
    data['bf'] = dct['AllCellsBF'].T.reshape(n_wells*n_points)
    
    if 'AllCellsGFP' in dct:
        data['gfp'] = dct['AllCellsGFP'].T.reshape(n_wells*n_points)
    else: # this means there is no defect
        data['gfp'] = np.nan
        
    data['time'] = np.concatenate([dct['t'][0] for j in range(n_wells)])
    data['experiment'] = dct['experiment']
    
    if 'defect' in dct:
        data['defect'] = dct['defect']
    else:
        data['defect'] = 'none'
    if 'nuclease' in dct:
        data['nuclease'] = dct['nuclease']
    else:
        data['nuclease'] = 'none'
    return data

def make_avg_dataframe(dct):
    '''
    convert the dictionary of raw data to a data frame with the average data
    '''
    data = pd.DataFrame()
    
    n_wells = len(dct['AllCellsBF'][0])
    n_points = len(dct['t'][0])
    data['bf'] = np.mean(dct['AllCellsBF'].T,axis=0)
    
    if 'AllCellsGFP' in dct:
        data['gfp'] = np.mean(dct['AllCellsGFP'].T,axis=0)
    else: # this means there is no defect
        data['gfp'] = np.nan
        
    data['time'] = dct['t'][0]
    data['experiment'] = dct['experiment']
    
    if 'defect' in dct:
        data['defect'] = dct['defect']
    else:
        data['defect'] = 'none'
    if 'nuclease' in dct:
        data['nuclease'] = dct['nuclease']
    else:
        data['nuclease'] = 'none'
    return data

In [21]:
data = pd.concat([make_single_cell_dataframe(dct) for dct in data_raw])
data.to_csv('./../../experimental_data/processed_data/single_cell_data.csv')

In [22]:
data = pd.concat([make_avg_dataframe(dct) for dct in data_raw])
data.to_csv('./../../experimental_data/processed_data/avg_data.csv')

In [23]:
data

Unnamed: 0,bf,gfp,time,experiment,defect,nuclease
0,1.312757,0.000000,0,NR-SpCas9,NR,SpCas9
1,1.308642,0.000000,20,NR-SpCas9,NR,SpCas9
2,1.316872,0.000000,40,NR-SpCas9,NR,SpCas9
3,1.325103,0.000000,60,NR-SpCas9,NR,SpCas9
4,1.337449,0.000000,80,NR-SpCas9,NR,SpCas9
...,...,...,...,...,...,...
68,15.979021,8.083916,1360,GAA-SpCas9-2,GAA,SpCas9
69,16.265734,8.195804,1380,GAA-SpCas9-2,GAA,SpCas9
70,16.573427,8.377622,1400,GAA-SpCas9-2,GAA,SpCas9
71,16.972028,8.643357,1420,GAA-SpCas9-2,GAA,SpCas9
