In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from glob import glob

In [4]:
import itertools

#### some pandas opts

In [5]:
pd.options.display.max_columns = 100

In [6]:
pd.options.display.max_rows = 500

#### where are the files?

In [7]:
%ls /home/dizak/Pulpit/BIONAS/G148/SNPs_calling/

[0m[01;32m16 documents from consensus.geneious[0m*
[01;32m16 documents from WT-day70 to WT-zero point.geneious[0m*
[34;42mBY-day0[0m/
[34;42mBY-nup133-day70[0m/
[34;42mBY-WT-day70[0m/
[34;42mconsensus[0m/
[34;42mmapped_contigs[0m/
[34;42mmapped_reads[0m/
[01;32mnup mapped to wt cons.geneious[0m*
[34;42mW303-cog7-day42[0m/
[34;42mW303-nup133-day42[0m/
[34;42mWT-day70[0m/
[01;32mWT-day70.xlsx[0m*


In [8]:
W303_cog7_day42_C1_files = glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C1/*csv")

In [9]:
W303_cog7_day42_C2_files = glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C2/*csv")

In [10]:
W303_cog7_day42_C3_files = glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C3/*csv")

#### get non-redundant list of genes in the inputfiles

In [11]:
def find_flat_value(inputfiles_list,
                    col_name = "CDS"):
    """
    Get flat list of desired values from list of CSV files.
    
    Parameters
    -------
    inputfiles_list: list of str
        List of input CSV files.
    col_name: str
        Desired column name in the input CSV file.
    
    Returns
    -------
    list of desired values.
    """
    values_list = []
    for i in inputfiles_list:
        df = pd.read_csv(i)
        if len(df) == 0:
            pass
        elif col_name not in df.columns:
            pass
        else:
            values_list.append(df[col_name].dropna().drop_duplicates().tolist())
    return list(itertools.chain.from_iterable(values_list))

#### get values from input files by key

In [12]:
def find_by_key(inputfiles_list,
                key):
    """
    Get pandas.DataFrame selected by a given key from list of CSV files.
    
    Parameters
    -------
    inputfiles_list: list of str
        List of input CSV files.
    key: str, int, float, bool
        Key used as query against rows in the CSV files.
    value_col: str
        Column name which holds values to be returned.
    
    Returns
    -------
    dict of lists of desired values if pandas.Dataframe not empty
    None if pandas.DataFrame empty
    """
    values_list = []
    for i in inputfiles_list:
        filename = "".join(i.split("/")[-1].split(".")[:-1])
        df = pd.read_csv(i)
        if len(df) == 0:
            pass
        else:
            if isinstance(key, str) == True:
                df_dtype_sel = df.select_dtypes(include=["object"])
            elif isinstance(key, int) == True:
                df_dtype_sel = df.select_dtypes(include=["int"])
            elif isinstance(key, float) == True:
                df_dtype_sel = df.select_dtypes(include=["float"])
            elif isinstance(key, bool) == True:
                df_dtype_sel = df.select_dtypes(include=["bool"])
            else:
                raise ValueError("key must str, int, float or bool dtype")
            for col in df_dtype_sel.columns:
                df_sel = df[df_dtype_sel[col] == key]
                if len(df_sel) > 0:
                    return {"dataframe": df_sel,
                            "filename": filename}

#### get whole set of pandas.DataFrames selections in one dict

In [345]:
def get_dfs_set(key_list,
                files_list,
                vals=["Minimum",
                      "Maximum",
                      "Change"],
                df_key="dataframe",
                index_by_key=True):
    """
    Get desired values in pandas.Dataframe gathered in dict by
    the list of keys.
    """
    out_dict = {}
    for i in key_list:
        key_vals = find_by_key(files_list,
                              key=i)[df_key]
        out_dict[i] = key_vals[vals]
    if index_by_key is True:
        for i in out_dict:
            out_dict[i].index = [i] * len(out_dict[i])
    else:
        pass
    return out_dict

In [440]:
def get_dfs_set(key_list,
                files_list,
                vals=["Minimum",
                      "Maximum",
                      "Change"],
                df_key="dataframe",
                key_index_name="Gene",
                smpl_index_name="Sample",
                row_index_name="Number",
                index_by_key=True,
                smpl_index_val=None,):
    """
    Get desired values in pandas.Dataframe gathered in dict by
    the list of keys.
    """
    out_dict = {}
    for i in key_list:
        key_vals = find_by_key(files_list,
                              key=i)[df_key]
        out_dict[i] = key_vals[vals]
    if index_by_key is True:
        for i in out_dict:
            key_index = [i] * len(out_dict[i])
            if smpl_index_val is not None:
                smpl_index = [smpl_index_val] * len(out_dict[i])
                tpls = list(zip(*[key_index,
                                  smpl_index,
                                  out_dict[i].index]))
                mindex = pd.MultiIndex.from_tuples(tpls,
                                                   names=[key_index_name,
                                                          smpl_index_name,
                                                          row_index_name])
            else:
                tpls = list(zip(*[key_index,
                                  out_dict[i].index]))
                mindex = pd.MultiIndex.from_tuples(tpls,
                                                   names=[key_index_name,
                                                          row_index_name])
            out_dict[i].index = mindex
    return out_dict

In [441]:
W303_cog7_day42_C1_dfs = get_dfs_set(W303_cog7_day42_C1_CDSs,
                                     W303_cog7_day42_C1_files,
                                     smpl_index_val="C1")

In [442]:
dfs_list = [W303_cog7_day42_C1_dfs[i] for i in W303_cog7_day42_C1_dfs.keys()]

In [443]:
merged_dfs = reduce(lambda df1, df2: pd.concat([df1, df2],
                                              axis=0),
                    dfs_list)

In [444]:
C1_merged_dfs = merged_dfs.sort(columns=["Minimum", "Maximum"])

  if __name__ == '__main__':


In [445]:
C1_merged_dfs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Minimum,Maximum,Change
Gene,Sample,Number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
COS7 CDS,C1,35,1933,1933,T -> A
COS7 CDS,C1,34,1946,1946,T -> G
COS7 CDS,C1,33,1948,1948,A -> C
COS7 CDS,C1,32,1979,1979,A -> G
COS7 CDS,C1,31,2009,2009,A -> G
COS7 CDS,C1,30,2011,2011,T -> C
COS7 CDS,C1,29,2016,2016,A -> G
COS7 CDS,C1,28,2019,2019,A -> G
COS7 CDS,C1,27,2025,2025,G -> A
COS7 CDS,C1,26,2028,2029,TA -> CT


In [397]:
W303_cog7_day42_C2_dfs = get_dfs_set(W303_cog7_day42_C2_CDSs, W303_cog7_day42_C2_files)

In [398]:
dfs_list = [W303_cog7_day42_C2_dfs[i] for i in W303_cog7_day42_C2_dfs.keys()]

In [399]:
merged_dfs = reduce(lambda df1, df2: pd.concat([df1, df2],
                                              axis=0),
                    dfs_list)

In [400]:
C2_merged_dfs = merged_dfs.sort(columns=["Minimum", "Maximum"])

  if __name__ == '__main__':


In [407]:
C2_merged_dfs

Unnamed: 0_level_0,Unnamed: 1_level_0,Minimum,Maximum,Change
Gene,Number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
COS7 CDS,42,1933,1933,T -> A
COS7 CDS,41,1946,1946,T -> G
COS7 CDS,40,1948,1948,A -> C
COS7 CDS,39,1979,1979,A -> G
COS7 CDS,38,2009,2009,A -> G
COS7 CDS,37,2011,2011,T -> C
COS7 CDS,36,2016,2016,A -> G
COS7 CDS,35,2019,2019,A -> G
COS7 CDS,34,2025,2025,G -> A
COS7 CDS,33,2028,2029,TA -> CT


In [405]:
pd.merge(left=C1_merged_dfs, right=C2_merged_dfs)

Unnamed: 0,Minimum,Maximum,Change
0,1933,1933,T -> A
1,1946,1946,T -> G
2,1948,1948,A -> C
3,1979,1979,A -> G
4,2009,2009,A -> G
5,2011,2011,T -> C
6,2016,2016,A -> G
7,2019,2019,A -> G
8,2025,2025,G -> A
9,2028,2029,TA -> CT


#### merge any given number of dfs

In [68]:
def merge_dfs(dfs,
              sort_cols=["Minimum",
                         "Maximum"],
              reconstr_index=True):
    """
    Merge any number of pandas.DataFrame into one.
    Indexes must be identical in all the pandas.DataFrames.
    
    Parameters
    -------
    dfs: list
        list of pandas.DataFrames to merge.
    sort_cols: list, None
        list of col names to sort the final
        pandas.DataFrame by. No sorting if None.
    """
    for x in [set(i.index) for i in dfs]:
        assert len(x) == 1, "Indices are not homogenic."
    new_index = list(x)
    df = reduce(lambda df1, df2: pd.merge(left=df1,
                                          right=df2,
                                          how="outer"),
                dfs)
    if sort_cols is not None:
        df.sort(columns=sort_cols)
    if reconstr_index is True:
        df.index = len(df) * new_index
    return df

#### let's find out which CDS are present in all the files one by one

In [15]:
W303_cog7_day42_C1_CDSs = find_flat_value(W303_cog7_day42_C1_files)
W303_cog7_day42_C2_CDSs = find_flat_value(W303_cog7_day42_C2_files)
W303_cog7_day42_C3_CDSs = find_flat_value(W303_cog7_day42_C3_files)

#### let's gather some info about each of the CDS from each sample

In [16]:
W303_cog7_day42_C1_dfs = get_dfs_set(W303_cog7_day42_C1_CDSs, W303_cog7_day42_C1_files)
W303_cog7_day42_C2_dfs = get_dfs_set(W303_cog7_day42_C2_CDSs, W303_cog7_day42_C2_files)
W303_cog7_day42_C3_dfs = get_dfs_set(W303_cog7_day42_C3_CDSs, W303_cog7_day42_C3_files)

#### which CDSs are common for all 3?

In [39]:
W303_cog7_day42_comm_CDS = reduce(lambda x, y: [i for i in x if i in y],
                                  [W303_cog7_day42_C1_CDSs,
                                   W303_cog7_day42_C2_CDSs,
                                   W303_cog7_day42_C3_CDSs])

In [69]:
out_dict = {}
for i in W303_cog7_day42_comm_CDS:
    out_dict[i] = merge_dfs([W303_cog7_day42_C1_dfs[i],
                             W303_cog7_day42_C2_dfs[i],
                             W303_cog7_day42_C3_dfs[i]])



In [275]:
df = W303_cog7_day42_C1_dfs["FLO1 CDS"]

In [276]:
df.index = range(len(df))

In [277]:
sample = len(W303_cog7_day42_C1_dfs["FLO1 CDS"]) * ["C1"]

In [278]:
gene = len(W303_cog7_day42_C1_dfs["FLO1 CDS"]) * ["FLO1 CDS"]

In [279]:
tpls = list(zip(*[sample, gene, range(len(df))]))

In [280]:
df.index = pd.MultiIndex.from_tuples(tpls, names=["sample", "gene", "number"])

In [281]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Minimum,Maximum,Change
sample,gene,number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1,FLO1 CDS,0,204538,204538,A -> G
C1,FLO1 CDS,1,204529,204529,G -> A
C1,FLO1 CDS,2,204459,204460,WA -> TT
C1,FLO1 CDS,3,204456,204456,A -> T
C1,FLO1 CDS,4,204453,204454,CG -> TT
C1,FLO1 CDS,5,204449,204451,CAC -> GTT
C1,FLO1 CDS,6,204443,204448,TSSCCA -> GAAACC
C1,FLO1 CDS,7,204435,204435,G -> C
C1,FLO1 CDS,8,204431,204433,ACM -> CAA
C1,FLO1 CDS,9,204428,204430,BHB -> CGGT


In [282]:
df2 = W303_cog7_day42_C2_dfs["FLO1 CDS"]

In [283]:
df2.index = range(len(df2))

In [284]:
sample = len(W303_cog7_day42_C2_dfs["FLO1 CDS"]) * ["C2"]

In [285]:
gene = len(W303_cog7_day42_C2_dfs["FLO1 CDS"]) * ["FLO1 CDS"]

In [286]:
tpls = list(zip(*[sample, gene, range(len(df2))]))

In [287]:
df2.index = pd.MultiIndex.from_tuples(tpls, names=["sample", "gene", "number"])

In [288]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Minimum,Maximum,Change
sample,gene,number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C2,FLO1 CDS,0,204538,204538,A -> G
C2,FLO1 CDS,1,204529,204529,G -> A
C2,FLO1 CDS,2,204459,204460,WA -> TT
C2,FLO1 CDS,3,204456,204456,A -> T
C2,FLO1 CDS,4,204453,204454,CG -> TT
C2,FLO1 CDS,5,204439,204441,GWG -> ACT
C2,FLO1 CDS,6,204434,204436,CGA -> AAC
C2,FLO1 CDS,7,204431,204433,ACM -> GTC
C2,FLO1 CDS,8,204428,204430,BHB -> ACG
C2,FLO1 CDS,9,204416,204418,MMY -> TCA


In [289]:
df2.T

sample,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2,C2
gene,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS,FLO1 CDS
number,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
Minimum,204538,204529,204459,204456,204453,204439,204434,204431,204428,204416,204413,204410,204407,204404,204402,204398,204396,204394,204388,203986,203977,203713,203705,203696,203679,203677
Maximum,204538,204529,204460,204456,204454,204441,204436,204433,204430,204418,204414,204412,204408,204404,204402,204400,204396,204394,204388,203986,203977,203713,203705,203698,203680,203677
Change,A -> G,G -> A,WA -> TT,A -> T,CG -> TT,GWG -> ACT,CGA -> AAC,ACM -> GTC,BHB -> ACG,MMY -> TCA,AM -> CC,CAY -> ATA,CH -> AT,C -> A,K -> G,CAY -> GCT,A -> C,C -> A,C -> A,A -> T,C -> T,A -> T,G -> A,ATC -> GTT,TG -> AA,C -> T


In [290]:
pd.merge(left=df, right=df2)

Unnamed: 0,Minimum,Maximum,Change
0,204538,204538,A -> G
1,204529,204529,G -> A
2,204459,204460,WA -> TT
3,204456,204456,A -> T
4,204453,204454,CG -> TT
5,204407,204408,CH -> AT
6,204404,204404,C -> A
7,204402,204402,K -> G
8,204398,204400,CAY -> GCT
9,204396,204396,A -> C


In [291]:
df_c = pd.concat([df, df2]) 

In [292]:
pd.concat([df_c.xs("C1"), df_c.xs("C2")], axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Minimum,Maximum,Change,Minimum,Maximum,Change
gene,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
FLO1 CDS,0,204538.0,204538.0,A -> G,204538,204538,A -> G
FLO1 CDS,1,204529.0,204529.0,G -> A,204529,204529,G -> A
FLO1 CDS,2,204459.0,204460.0,WA -> TT,204459,204460,WA -> TT
FLO1 CDS,3,204456.0,204456.0,A -> T,204456,204456,A -> T
FLO1 CDS,4,204453.0,204454.0,CG -> TT,204453,204454,CG -> TT
FLO1 CDS,5,204449.0,204451.0,CAC -> GTT,204439,204441,GWG -> ACT
FLO1 CDS,6,204443.0,204448.0,TSSCCA -> GAAACC,204434,204436,CGA -> AAC
FLO1 CDS,7,204435.0,204435.0,G -> C,204431,204433,ACM -> GTC
FLO1 CDS,8,204431.0,204433.0,ACM -> CAA,204428,204430,BHB -> ACG
FLO1 CDS,9,204428.0,204430.0,BHB -> CGGT,204416,204418,MMY -> TCA


In [302]:
pd.merge(left=df, right=df2, on=["Change", "Minimum", "Maximum"], right_index=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Minimum,Maximum,Change
sample,gene,number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C1,FLO1 CDS,0,204538,204538,A -> G
C1,FLO1 CDS,1,204529,204529,G -> A
C1,FLO1 CDS,2,204459,204460,WA -> TT
C1,FLO1 CDS,3,204456,204456,A -> T
C1,FLO1 CDS,4,204453,204454,CG -> TT
C1,FLO1 CDS,13,204407,204408,CH -> AT
C1,FLO1 CDS,14,204404,204404,C -> A
C1,FLO1 CDS,15,204402,204402,K -> G
C1,FLO1 CDS,16,204398,204400,CAY -> GCT
C1,FLO1 CDS,17,204396,204396,A -> C


In [309]:
df["Sample"] = pd.Series(len(df) * ["C1"])

In [317]:
df.drop("Sample", inplace=True, axis=1)

In [342]:
for i in df_c.index.levels:
    print list(i)

['C1', 'C2']
['FLO1 CDS']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


In [344]:
df.xs("C1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Minimum,Maximum,Change
gene,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FLO1 CDS,0,204538,204538,A -> G
FLO1 CDS,1,204529,204529,G -> A
FLO1 CDS,2,204459,204460,WA -> TT
FLO1 CDS,3,204456,204456,A -> T
FLO1 CDS,4,204453,204454,CG -> TT
FLO1 CDS,5,204449,204451,CAC -> GTT
FLO1 CDS,6,204443,204448,TSSCCA -> GAAACC
FLO1 CDS,7,204435,204435,G -> C
FLO1 CDS,8,204431,204433,ACM -> CAA
FLO1 CDS,9,204428,204430,BHB -> CGGT


In [268]:
df1 = W303_cog7_day42_C1_dfs["FLO1 CDS"]
df2 = W303_cog7_day42_C2_dfs["FLO1 CDS"]
df3 = W303_cog7_day42_C3_dfs["FLO1 CDS"]

In [269]:
for i in (df1, df2, df3):
    i.index = range(len(i))

In [271]:
df1.index, df2.index, df3.index

(Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
             17, 18, 19, 20, 21, 22, 23],
            dtype='int64'),
 Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
             17, 18, 19, 20, 21, 22, 23, 24, 25],
            dtype='int64'),
 Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
             17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
            dtype='int64'))

In [274]:
pd.concat([df1, pd.Series(["C1"] * len(df1))], axis=1)

Unnamed: 0,Minimum,Maximum,Change,0
0,204538,204538,A -> G,C1
1,204529,204529,G -> A,C1
2,204459,204460,WA -> TT,C1
3,204456,204456,A -> T,C1
4,204453,204454,CG -> TT,C1
5,204449,204451,CAC -> GTT,C1
6,204443,204448,TSSCCA -> GAAACC,C1
7,204435,204435,G -> C,C1
8,204431,204433,ACM -> CAA,C1
9,204428,204430,BHB -> CGGT,C1
