In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from glob import glob

In [4]:
import itertools
import os
from functools import reduce

#### some pandas display options

In [5]:
# Show up to 100 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 100)

In [6]:
# Show up to 500 rows when a DataFrame is rendered.
pd.set_option("display.max_rows", 500)

#### where are the files?

In [7]:
%ls /home/dizak/Pulpit/BIONAS/G148/SNPs_calling/

[0m[01;32m16 documents from consensus.geneious[0m*
[01;32m16 documents from WT-day70 to WT-zero point.geneious[0m*
[34;42mBY-day0[0m/
[34;42mBY-nup133-day70[0m/
[34;42mBY-WT-day70[0m/
[01;32mC2_merged_dfs.csv[0m*
[34;42mconsensus[0m/
[34;42mmapped_contigs[0m/
[34;42mmapped_reads[0m/
[01;32mnup mapped to wt cons.geneious[0m*
[34;42mW303-cog7-day42[0m/
[34;42mW303-nup133-day42[0m/
[34;42mWT-day70[0m/
[01;32mWT-day70.xlsx[0m*


In [8]:
# Labels used as the "Sample" index level; one per C1/C2/C3 input directory.
sampling_levels = [
    "C1",
    "C2",
    "C3",
]

In [9]:
# glob() yields paths in arbitrary (filesystem) order and find_by_key() keeps
# only the first file that matches, so sort to make re-runs reproducible.
strain_gene_day_N1_files = sorted(
    glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C1/*csv")
)

In [10]:
# glob() yields paths in arbitrary (filesystem) order and find_by_key() keeps
# only the first file that matches, so sort to make re-runs reproducible.
strain_gene_day_N2_files = sorted(
    glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C2/*csv")
)

In [11]:
# glob() yields paths in arbitrary (filesystem) order and find_by_key() keeps
# only the first file that matches, so sort to make re-runs reproducible.
strain_gene_day_N3_files = sorted(
    glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C3/*csv")
)

#### get a flat list of genes from the input files (non-redundant within each file)

In [12]:
def find_flat_value(inputfiles_list,
                    col_name = "CDS"):
    """
    Collect the values of one column across several CSV files.

    Each file is read with pandas. A file that yields no rows, or that
    lacks the requested column, contributes nothing. From every other
    file the non-null values of the column are taken, de-duplicated
    within that file only, and appended in file order.

    Parameters
    -------
    inputfiles_list: list of str
        Paths of the input CSV files.
    col_name: str
        Name of the column to harvest from each CSV file.

    Returns
    -------
    Flat list of the collected values. A value present in several
    files appears once per file.
    """
    def _column_values(path):
        # Values contributed by a single file (possibly none).
        frame = pd.read_csv(path)
        if len(frame) == 0 or col_name not in frame.columns:
            return []
        return frame[col_name].dropna().drop_duplicates().tolist()

    flat_values = []
    for path in inputfiles_list:
        flat_values.extend(_column_values(path))
    return flat_values

#### get values from input files by key

In [13]:
def find_by_key(inputfiles_list,
                key):
    """
    Find the first CSV file containing rows that match a given key.

    The files are read in the order given. In each non-empty file only
    the columns whose dtype corresponds to the Python type of the key
    are searched, column by column. The search stops at the first column
    of the first file in which at least one cell equals the key.

    Parameters
    -------
    inputfiles_list: list of str
        List of input CSV files.
    key: str, int, float, bool
        Key used as query against rows in the CSV files.

    Returns
    -------
    dict with two entries if a match is found:
        "dataframe": pandas.DataFrame holding the matching rows,
        "filename": file name without directory and extension.
    None if no file contains the key.

    Raises
    -------
    ValueError
        If key is not a str, int, float or bool.
    """
    # The dtype family depends only on the key, so resolve it once up front.
    # bool has to be tested before int: bool is a subclass of int, hence an
    # int test placed first would capture True/False and search the integer
    # columns instead of the boolean ones.
    if isinstance(key, str):
        dtype_family = "object"
    elif isinstance(key, bool):
        dtype_family = "bool"
    elif isinstance(key, int):
        dtype_family = "int"
    elif isinstance(key, float):
        dtype_family = "float"
    else:
        raise ValueError("key must str, int, float or bool dtype")

    for path in inputfiles_list:
        df = pd.read_csv(path)
        if len(df) == 0:
            continue
        candidates = df.select_dtypes(include=[dtype_family])
        for col in candidates.columns:
            df_sel = df[candidates[col] == key]
            if len(df_sel) > 0:
                # splitext keeps dots inside the stem ("a.b.csv" -> "a.b")
                # and leaves extension-less names intact.
                filename = os.path.splitext(os.path.basename(path))[0]
                return {"dataframe": df_sel,
                        "filename": filename}
    return None

#### get whole set of pandas.DataFrames selections in one dict

In [14]:
def get_dfs_set(key_list,
                files_list,
                vals=["Minimum",
                      "Maximum",
                      "Change"],
                df_key="dataframe",
                key_index_name="Gene",
                smpl_index_name="Sample",
                row_index_name="Number",
                index_by_key=True,
                smpl_index_val=None,):
    """
    Get desired values in pandas.Dataframe gathered in dict by
    the list of keys.

    For every key, find_by_key is asked for the first file containing it
    and the columns listed in vals are kept from the matching rows.
    Keys for which find_by_key returns None (no file contains them) are
    left out of the result.

    Parameters
    -------
    key_list: list, tuple
        List of keys by which pandas.Dataframes are
        initially selected.
    files_list: list, tuple
        List of input CSV files.
    vals: list of str, default: ["Minimum",
                                 "Maximum",
                                 "Change"]
        Columns names holding data in pandas.DataFrames
        selected.
    df_key: str, default: <"dataframe">
        Key under which find_by_key stores the selected
        pandas.DataFrame in its result dict.
    key_index_name: str, default: <"Gene">
        Name for index of selection key.
    smpl_index_name: str, default: <"Sample">
        Name for index of sample.
    row_index_name: str, default: <"Number">
        Name for numeric row index.
    index_by_key: bool, default: True
        Enables multiindexing.
    smpl_index_val: str, default: <None>
        Adds sample name to multiindexing if not <None>

    Returns
    -------
    dict mapping each key that was found to its pandas.DataFrame.
    With index_by_key enabled each frame is indexed by
    (key, [sample,] original row label).
    """
    # Private copy so the shared default list is never handed to pandas.
    columns = list(vals)
    out_dict = {}
    for key in key_list:
        found = find_by_key(files_list,
                            key=key)
        if found is None:
            # No file holds this key; subscripting None would raise TypeError.
            continue
        # Selecting with a list of columns yields a new DataFrame, so
        # re-indexing it below does not touch the frame find_by_key returned.
        out_dict[key] = found[df_key][columns]
    if index_by_key:
        for key, frame in out_dict.items():
            levels = [[key] * len(frame)]
            names = [key_index_name]
            if smpl_index_val is not None:
                levels.append([smpl_index_val] * len(frame))
                names.append(smpl_index_name)
            # The original row labels always form the innermost level.
            levels.append(frame.index)
            names.append(row_index_name)
            frame.index = pd.MultiIndex.from_tuples(list(zip(*levels)),
                                                    names=names)
    return out_dict

#### merge any given number of dfs

In [15]:
def merge_dfs(dfs,
              sort_cols=["Minimum",
                         "Maximum"],
              reconstr_index=True):
    """
    Merge any number of pandas.DataFrame into one.
    Indexes must be identical in all the pandas.DataFrames: every frame
    must carry exactly one distinct index label and that label must be
    the same in all frames.

    The frames are outer-merged from left to right on their common
    columns. The input frames are not modified.

    Parameters
    -------
    dfs: list
        list of pandas.DataFrames to merge.
    sort_cols: list, None
        list of col names to sort the final
        pandas.DataFrame by. No sorting if None.
    reconstr_index: bool, default: True
        If true, the shared index label is written back on every row
        of the merged pandas.DataFrame.

    Returns
    -------
    The merged pandas.DataFrame.

    Raises
    -------
    ValueError
        If dfs is empty.
    AssertionError
        If the indexes are not homogeneous.
    """
    frames = list(dfs)
    if not frames:
        raise ValueError("dfs must contain at least one pandas.DataFrame")

    shared_labels = None
    for frame in frames:
        labels = set(frame.index)
        assert len(labels) == 1, "Indices are not homogenic."
        if shared_labels is None:
            shared_labels = labels
        # Also compare across frames, as the documented contract requires.
        assert labels == shared_labels, "Indices are not homogenic."
    new_index = list(shared_labels)

    # Copy so that a single-frame call does not re-index the caller's frame.
    merged = frames[0].copy()
    for frame in frames[1:]:
        merged = pd.merge(left=merged,
                          right=frame,
                          how="outer")
    if sort_cols is not None:
        # sort_values returns a new frame; the result has to be kept.
        merged = merged.sort_values(by=list(sort_cols))
    if reconstr_index:
        merged.index = len(merged) * new_index
    return merged

#### let's find out which CDS are present in all the files one by one

In [16]:
# One flat list of CDS names per sample directory (default column "CDS").
strain_gene_day_N1_CDSs = find_flat_value(inputfiles_list=strain_gene_day_N1_files)
strain_gene_day_N2_CDSs = find_flat_value(inputfiles_list=strain_gene_day_N2_files)
strain_gene_day_N3_CDSs = find_flat_value(inputfiles_list=strain_gene_day_N3_files)

#### let's gather some info about each of the CDS from each sample

In [17]:
# Per sample: dict of CDS name -> selected rows, labelled with the sample name.
strain_gene_day_N1_dfs = get_dfs_set(
    key_list=strain_gene_day_N1_CDSs,
    files_list=strain_gene_day_N1_files,
    smpl_index_val=sampling_levels[0],
)
strain_gene_day_N2_dfs = get_dfs_set(
    key_list=strain_gene_day_N2_CDSs,
    files_list=strain_gene_day_N2_files,
    smpl_index_val=sampling_levels[1],
)
strain_gene_day_N3_dfs = get_dfs_set(
    key_list=strain_gene_day_N3_CDSs,
    files_list=strain_gene_day_N3_files,
    smpl_index_val=sampling_levels[2],
)

#### let's unwind them all from this dict into lists

In [18]:
# Keep only the DataFrames; the dict keys are already in each frame's index.
N1_dfs = list(strain_gene_day_N1_dfs.values())
N2_dfs = list(strain_gene_day_N2_dfs.values())
N3_dfs = list(strain_gene_day_N3_dfs.values())

#### let's merge them to see changes between samples.
#### remember: the merge_dfs function cannot be used here, since the MultiIndex is NOT homogeneous

In [19]:
# A single concat stacks all frames at once; folding pairwise with reduce
# re-copies the accumulated rows at every step and needs functools on Python 3.
N1_df = pd.concat(N1_dfs)

In [20]:
# A single concat stacks all frames at once; folding pairwise with reduce
# re-copies the accumulated rows at every step and needs functools on Python 3.
N2_df = pd.concat(N2_dfs)

In [21]:
# A single concat stacks all frames at once; folding pairwise with reduce
# re-copies the accumulated rows at every step and needs functools on Python 3.
N3_df = pd.concat(N3_dfs)

In [22]:
# Stack the three per-sample tables in C1, C2, C3 order with one concat call.
N1_N2_N3_df = pd.concat([N1_df, N2_df, N3_df])

In [23]:
# Display the stacked per-sample table as the cell output.
N1_N2_N3_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Minimum,Maximum,Change
Gene,Sample,Number,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PIR3 CDS,C1,20,144808,144808,T -> A
hypothetical protein CDS,C1,7,25590,25590,T -> A
COG7 CDS,C1,120,490511,490512,TA -> AT
COG7 CDS,C1,121,490493,490509,ACCCGTAATTGTCAACT -> TGGAGAGACCTCGTGGA
COG7 CDS,C1,122,490487,490489,ATC -> GTA
COG7 CDS,C1,123,490469,490484,CATACTCAATATATCA -> GCGTACGGTACCTCGT
COG7 CDS,C1,124,490458,490467,TCATCAAAAA -> GTCGACCTGC
COG7 CDS,C1,125,490452,490454,AAC -> GGA
COG7 CDS,C1,126,490444,490450,GGGGAAC -> TAACCCG
COG7 CDS,C1,127,490440,490441,GC -> TA
