[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gibsonlab/mdsine2_tutorials/blob/main/notebooks/tut_01_preprocess.ipynb)

# Data wrangling
Data organization and formatting are an important part of any workflow. For the current dataset (`./data/raw_tables/`), we will:
1. Remove a subject with incomplete data (a mouse jumped out of its cage)
2. Identify and separate various experimental conditions
    - Healthy subjects
    - Unhealthy subjects
    - Subjects with physical replicate qPCR data
3. Make smaller "toy" datasets to use in the other tutorials

In [None]:
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    !curl -LJO https://github.com/gibsonlab/mdsine2_tutorials/raw/main/data/raw_tables.zip
    !mkdir -p ./data/ && unzip -o raw_tables.zip -d ./data/

    !git clone https://github.com/gerberlab/MDSINE2
    !pip install MDSINE2/.

else:
    !cd ..

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  269k  100  269k    0     0   541k      0 --:--:-- --:--:-- --:--:--  541k
Archive:  raw_tables.zip
   creating: ./data/raw_tables/
  inflating: ./data/raw_tables/perturbations.tsv  
  inflating: ./data/raw_tables/qpcr.tsv  
  inflating: ./data/raw_tables/metadata.tsv  
  inflating: ./data/raw_tables/silva_species.tsv  
  inflating: ./data/raw_tables/counts.tsv  
  inflating: ./data/raw_tables/rdp_species.tsv  
Cloning into 'MDSINE2'...
remote: Enumerating objects: 3973, done.[K
remote: Counting objects: 100% (904/904), done.[K
remote: Compressing objects: 100% (258/258), done.[K
remote: Total 3973 (delta 659), reused 887 (delta 646), pack-reused 3069[K
Receiving objects: 100% (3973/3973), 78.22 MiB | 7.57 MiB/s, done.
Resolving deltas: 100% (26

In [None]:
import pandas as pd
from pathlib import Path
import mdsine2 as md2
from mdsine2.util import make_toy


[DEBUG] Using default logger (stdout, stderr).


In [None]:
# Root folder for everything this notebook reads and writes
data_dir = Path('./data/')

# Input tables, plus one output folder per group of subjects
raw_data_dir = data_dir.joinpath('raw_tables')
healthy_data_dir = data_dir.joinpath('healthy')
replicates_data_dir = data_dir.joinpath('replicates')

# Create both output folders (and any missing parents); existing ones are fine
for ff in (healthy_data_dir, replicates_data_dir):
    ff.mkdir(parents=True, exist_ok=True)


In [None]:
# Load every raw .tsv table into a dict keyed by the file name without its
# extension; the first column of each file becomes the DataFrame index.
sep = '\t'
tsv_files = sorted(raw_data_dir.glob('*.tsv'))

data = {
    tsv_f.stem: pd.read_csv(tsv_f, index_col=0, sep=sep)
    for tsv_f in tsv_files
}


 ### Task 1: Identify and remove subjects that have no associated qPCR data

In [None]:
# IDs present as columns of the counts table / as rows of the qPCR table
subj_counts = set(data['counts'].columns.tolist())
subj_qpcr = set(data['qpcr'].index.tolist())

# IDs that have counts but no qPCR measurement
subj_counts_only = [subj for subj in subj_counts if subj not in subj_qpcr]

# Remove those IDs from the counts columns and from the metadata rows
counts_kept = data['counts'].drop(columns=subj_counts_only)
metadata_kept = data['metadata'].drop(index=subj_counts_only)
data['counts'] = counts_kept
data['metadata'] = metadata_kept


 ### Task 2: Identify replicate data

In [None]:
# Create a function to extract data based on a given tag.

def identify_subjects_by_tag(data, tags, exclude=False):
    """Select the subjects whose ID starts with one of the given tags.

    Parameters
    ----------
    data : dict
        Tables keyed by name. Must contain 'counts' (subject IDs as
        columns), 'metadata' and 'qpcr' (subject IDs as index). Any other
        entries are passed through unchanged.
    tags : str or iterable of str
        ID prefix(es) to match. A single string is treated as one tag.
    exclude : bool, optional
        If False (default), keep the subjects matching a tag, in the column
        order of the counts table. If True, keep the subjects that match no
        tag, in sorted order.

    Returns
    -------
    dict
        A shallow copy of `data` whose 'counts', 'metadata' and 'qpcr'
        entries are restricted to the selected subjects. The dict passed
        in is not modified.

    Raises
    ------
    KeyError
        If a selected subject ID is missing from the 'metadata' or 'qpcr'
        index.
    """
    # A bare string would otherwise be iterated character by character,
    # turning "M2-" into the three one-character prefixes "M", "2" and "-".
    if isinstance(tags, str):
        tags = [tags]
    prefixes = tuple(tags)

    # Shallow copy: rebinding keys below leaves the caller's dict untouched.
    data = data.copy()
    all_subj_ids = data['counts'].columns

    # str.startswith accepts a tuple and is True if any prefix matches
    # (and False for an empty tuple).
    subj_ids = [subj for subj in all_subj_ids if subj.startswith(prefixes)]

    if exclude:
        subj_ids = sorted(set(all_subj_ids) - set(subj_ids))

    data['counts'] = data['counts'][subj_ids]
    data['metadata'] = data['metadata'].loc[subj_ids]
    data['qpcr'] = data['qpcr'].loc[subj_ids]
    return data
    

In [None]:
# Select the replicate subjects: IDs that start with "M2-"
rep_tag = "M2-"
tags = [rep_tag]
replicates = identify_subjects_by_tag(data, tags)

# Save every table of the replicate subset as <name>.tsv in the replicates folder
for key, table in replicates.items():
    out_path = replicates_data_dir / f'{key}.tsv'
    table.to_csv(out_path, sep=sep, index=True, header=True)


In [None]:
# Select the healthy subjects: IDs that start with one of these prefixes.
# Everything else (e.g. the "M2-" replicates) is left out of this subset.
healthy_tags = ['2-', '3-', '4-', '5-']
healthy = identify_subjects_by_tag(data, healthy_tags)

# Save every table of the healthy subset as <name>.tsv in the healthy folder
for key, table in healthy.items():
    out_path = healthy_data_dir / f'{key}.tsv'
    table.to_csv(out_path, sep=sep, index=True, header=True)

 ### Task 3: Make toy datasets

In [None]:
# Build a 15-taxa toy dataset from the tables written to each output folder.
for ff in [healthy_data_dir, replicates_data_dir]:
    # Map table name (file stem) -> path for every .tsv in this folder
    tsv_files = {f.stem: f for f in sorted(ff.glob('*.tsv'))}

    # NOTE(review): `toy_study` is rebound on every iteration and not saved
    # in this cell, so only the last folder's result remains afterwards.
    toy_kwargs = dict(
        metadata_f=tsv_files['metadata'],
        qpcr_f=tsv_files['qpcr'],
        reads_f=tsv_files['counts'],
        taxa_f=tsv_files['rdp_species'],
        perturbations_f=tsv_files['perturbations'],
        n_taxa=15,
    )
    toy_study = make_toy(**toy_kwargs)


[INFO] TaxaSet parsng new taxonomy table. Resetting
[INFO] No `name` found - assuming index is the name
[DEBUG] Reseting perturbations
[INFO] TaxaSet parsng new taxonomy table. Resetting
[INFO] No `name` found - assuming index is the name
[DEBUG] Reseting perturbations
[INFO] TaxaSet parsng new taxonomy table. Resetting
[INFO] No `name` found - assuming index is the name
[DEBUG] Reseting perturbations
[INFO] TaxaSet parsng new taxonomy table. Resetting
[INFO] No `name` found - assuming index is the name
[DEBUG] Reseting perturbations
