In [1]:
# Move the working directory up one level so the relative "data/..." paths used
# below resolve. NOTE: this is not idempotent; re-running the cell moves up again.
cd ..

/Users/flamholz/Documents/workspace/ccm_evolution


In [2]:
import numpy as np
import pandas as pd
from os import path

In [3]:
# A list of dictionaries specifying the data that we have, one entry per run:
#   path - directory of the run; `fit_logratios_good.tab` is read from inside it
#   sets - columns to pull from that file: `key` is the column name as it appears
#          in the file, `cond` is the condition label the column is renamed to
#   note - free-text description of the run (not read by any code in view)
data_spec = [
    dict(path="data/Hnea/barseq/2017_11_03_miseq_barSeq1/",
         sets=[dict(key="setAS2 HCO", cond='5% CO2'),
               dict(key="setAS3 LCO", cond='ambient CO2')],
         note="5% and ambient rep 1"),
    dict(path="data/Hnea/barseq/2017_11_03_miseq_barseq2/",
         sets=[dict(key="setAS2 HCO", cond='5% CO2'),
               dict(key="setAS3 LCO", cond='ambient CO2')],
         note="5% and ambient rep 2"),
    dict(path="data/Hnea/barseq/2018_10_30_miseq_barSeq3",
         sets=[dict(key="setAS3 0-5", cond='0.5% CO2'),
               dict(key="setAS5 10", cond='10% CO2')],
         note="0.5% and 10% CO2"),
    dict(path="data/Hnea/barseq/2018_10_30_miseq_barSeq4",
         sets=[dict(key="setAS4 0-5", cond='0.5% CO2'),
               dict(key="setAS6 10", cond='10% CO2')],
         note="0.5% and 10% CO2"),
    dict(path="data/Hnea/barseq/2018_10_30_miseq_barSeq5",
         sets=[dict(key="setAS7 1-5", cond='1.5% CO2'),
               dict(key="setAS8 5", cond='5% CO2')],
         note="1.5% and 5% CO2"),
    dict(path="data/Hnea/barseq/2018_10_30_miseq_barSeq6",
         sets=[dict(key="setAS10 1-5", cond='1.5% CO2'),
               dict(key="setAS11 5", cond='5% CO2')],
         # Note corrected: it previously read "0.5% and 10% CO2", which did not
         # match the 1.5% / 5% conditions listed in this entry's sets.
         note="1.5% and 5% CO2"),
]

In [4]:
def preprocess_barseq(specs):
    """Merge selected columns from several barseq runs into one table.

    For every entry in ``specs`` the tab-separated file
    ``fit_logratios_good.tab`` inside ``entry['path']`` is read. The columns
    named by the ``key`` of each item in ``entry['sets']`` are kept, indexed by
    ``locusId``, and relabelled with the matching ``cond`` value. The per-run
    tables are then joined column-wise on ``locusId`` with the rows sorted, so
    a locus missing from a run gets NaN in that run's columns.

    Args:
        specs: iterable of dicts with a ``path`` string and a ``sets`` list of
            dicts that each carry ``key`` and ``cond``.

    Returns:
        A DataFrame whose first column is ``locus_id``, followed by one column
        per requested set in the order given. Condition labels are not made
        unique, so a label repeated across runs yields repeated column names.
    """
    def _load_run(spec):
        # Per the original author's note, the "good" file holds the data that
        # the barseq scripts consider to pass their statistical quality filters.
        source_keys = [entry['key'] for entry in spec['sets']]
        cond_labels = [entry['cond'] for entry in spec['sets']]
        table = pd.read_csv(path.join(spec['path'], 'fit_logratios_good.tab'),
                            sep='\t')
        selected = table[source_keys + ['locusId']].set_index('locusId')
        selected.columns = cond_labels
        return selected

    per_run = [_load_run(spec) for spec in specs]
    merged = pd.concat(per_run, axis=1, sort=True).reset_index()
    # The former index is always the first column after reset_index();
    # rename it by position and leave the condition columns untouched.
    merged.columns = ['locus_id'] + list(merged.columns[1:])
    return merged

In [5]:
# Merge every run listed in data_spec into a single table, then preview the
# first rows as the cell output.
all_barseq_df = preprocess_barseq(data_spec)
all_barseq_df.head()

Unnamed: 0,locus_id,5% CO2,ambient CO2,5% CO2.1,ambient CO2.1,0.5% CO2,10% CO2,0.5% CO2.1,10% CO2.1,1.5% CO2,5% CO2.2,1.5% CO2.1,5% CO2.3
0,GFF1190,-0.000402,0.021627,-0.001932,0.076393,-0.062972,-0.044107,-0.097597,-0.114159,0.039218,0.022432,0.003704,-0.027252
1,GFF1209,0.222629,-0.117649,-0.130837,-0.047011,-0.087761,-0.134193,0.15456,-0.038279,0.168402,0.069838,0.22785,-0.037395
2,GFF1357,-0.612701,-0.344187,-0.504753,-0.315189,-1.174064,-0.906337,-1.203053,-0.942894,-1.133942,-1.033053,-1.031073,-1.220323
3,GFF1439,-0.277288,-0.268327,-0.234944,-0.290411,0.117596,-0.007212,0.229604,0.198248,-0.147151,-0.079903,-0.161343,-0.110442
4,GFF1496,0.676934,0.320859,0.515599,0.189416,-0.795918,-0.274587,-0.364849,-0.181126,0.111388,-0.022536,-0.05318,-0.028839


In [6]:
# Save the merged table as CSV; index=False leaves the integer row index out of
# the file. The path is relative to the current working directory.
all_barseq_df.to_csv('data/Hnea/barseq/fit_logratios_all_good.csv', index=False)