In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from glob import glob

In [4]:
import itertools

#### some pandas opts

In [5]:
pd.options.display.max_columns = 100

In [6]:
pd.options.display.max_rows = 500

#### where are the files?

In [7]:
%ls /home/dizak/Pulpit/BIONAS/G148/SNPs_calling/

[0m[01;32m16 documents from consensus.geneious[0m*
[01;32m16 documents from WT-day70 to WT-zero point.geneious[0m*
[34;42mBY-day0[0m/
[34;42mBY-nup133-day70[0m/
[34;42mBY-WT-day70[0m/
[34;42mconsensus[0m/
[34;42mmapped_contigs[0m/
[34;42mmapped_reads[0m/
[01;32mnup mapped to wt cons.geneious[0m*
[34;42mW303-cog7-day42[0m/
[34;42mW303-nup133-day42[0m/
[34;42mWT-day70[0m/
[01;32mWT-day70.xlsx[0m*


In [8]:
W303_cog7_day42_C1_files = glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C1/*csv")

In [9]:
W303_cog7_day42_C2_files = glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C2/*csv")

In [10]:
W303_cog7_day42_C3_files = glob("/home/dizak/Pulpit/BIONAS/G148/SNPs_calling/W303-cog7-day42/C3/*csv")

#### get non-redundant list of genes in the inputfiles

In [11]:
def find_flat_value(inputfiles_list,
                    col_name = "CDS"):
    """
    Get flat list of desired values from list of CSV files.
    
    Parameters
    -------
    inputfiles_list: list of str
        List of input CSV files.
    col_name: str
        Desired column name in the input CSV file.
    
    Returns
    -------
    list of desired values.
    """
    values_list = []
    for i in inputfiles_list:
        df = pd.read_csv(i)
        if len(df) == 0:
            pass
        elif col_name not in df.columns:
            pass
        else:
            values_list.append(df[col_name].dropna().drop_duplicates().tolist())
    return list(itertools.chain.from_iterable(values_list))

#### get values from input files by key

In [12]:
def find_by_key(inputfiles_list,
                key):
    """
    Get pandas.DataFrame selected by a given key from list of CSV files.
    
    Parameters
    -------
    inputfiles_list: list of str
        List of input CSV files.
    key: str, int, float, bool
        Key used as query against rows in the CSV files.
    value_col: str
        Column name which holds values to be returned.
    
    Returns
    -------
    dict of lists of desired values if pandas.Dataframe not empty
    None if pandas.DataFrame empty
    """
    values_list = []
    for i in inputfiles_list:
        filename = "".join(i.split("/")[-1].split(".")[:-1])
        df = pd.read_csv(i)
        if len(df) == 0:
            pass
        else:
            if isinstance(key, str) == True:
                df_dtype_sel = df.select_dtypes(include=["object"])
            elif isinstance(key, int) == True:
                df_dtype_sel = df.select_dtypes(include=["int"])
            elif isinstance(key, float) == True:
                df_dtype_sel = df.select_dtypes(include=["float"])
            elif isinstance(key, bool) == True:
                df_dtype_sel = df.select_dtypes(include=["bool"])
            else:
                raise ValueError("key must str, int, float or bool dtype")
            for col in df_dtype_sel.columns:
                df_sel = df[df_dtype_sel[col] == key]
                if len(df_sel) > 0:
                    return {"dataframe": df_sel,
                            "filename": filename}

#### get whole set of pandas.DataFrames selections in one dict

In [13]:
def get_dfs_set(key_list,
                files_list,
                vals=["Minimum",
                      "Maximum",
                      "Change"],
                df_key="dataframe",
                index_by_key=True):
    """
    Get desired values in pandas.Dataframe gathered in dict by
    the list of keys.
    """
    out_dict = {}
    for i in key_list:
        key_vals = find_by_key(files_list,
                              key=i)[df_key]
        out_dict[i] = key_vals[vals]
    if index_by_key is True:
        for i in out_dict:
            out_dict[i].index = [i] * len(out_dict[i])
    else:
        pass
    return out_dict

#### merge any given number of dfs

In [68]:
def merge_dfs(dfs,
              sort_cols=["Minimum",
                         "Maximum"],
              reconstr_index=True):
    """
    Merge any number of pandas.DataFrame into one.
    Indexes must be identical in all the pandas.DataFrames.
    
    Parameters
    -------
    dfs: list
        list of pandas.DataFrames to merge.
    sort_cols: list, None
        list of col names to sort the final
        pandas.DataFrame by. No sorting if None.
    """
    for x in [set(i.index) for i in dfs]:
        assert len(x) == 1, "Indices are not homogenic."
    new_index = list(x)
    df = reduce(lambda df1, df2: pd.merge(left=df1,
                                          right=df2,
                                          how="outer"),
                dfs)
    if sort_cols is not None:
        df.sort(columns=sort_cols)
    if reconstr_index is True:
        df.index = len(df) * new_index
    return df

#### let's find out which CDS are present in all the files one by one

In [15]:
W303_cog7_day42_C1_CDSs = find_flat_value(W303_cog7_day42_C1_files)
W303_cog7_day42_C2_CDSs = find_flat_value(W303_cog7_day42_C2_files)
W303_cog7_day42_C3_CDSs = find_flat_value(W303_cog7_day42_C3_files)

#### let's gather some info about each of the CDS from each sample

In [16]:
W303_cog7_day42_C1_dfs = get_dfs_set(W303_cog7_day42_C1_CDSs, W303_cog7_day42_C1_files)
W303_cog7_day42_C2_dfs = get_dfs_set(W303_cog7_day42_C2_CDSs, W303_cog7_day42_C2_files)
W303_cog7_day42_C3_dfs = get_dfs_set(W303_cog7_day42_C3_CDSs, W303_cog7_day42_C3_files)

#### which CDSs are common for all 3?

In [39]:
W303_cog7_day42_comm_CDS = reduce(lambda x, y: [i for i in x if i in y],
                                  [W303_cog7_day42_C1_CDSs,
                                   W303_cog7_day42_C2_CDSs,
                                   W303_cog7_day42_C3_CDSs])

In [69]:
out_dict = {}
for i in W303_cog7_day42_comm_CDS:
    out_dict[i] = merge_dfs([W303_cog7_day42_C1_dfs[i],
                             W303_cog7_day42_C2_dfs[i],
                             W303_cog7_day42_C3_dfs[i]])



In [70]:
out_dict["FLO1 CDS"]

Unnamed: 0,Minimum,Maximum,Change
FLO1 CDS,204538,204538,A -> G
FLO1 CDS,204529,204529,G -> A
FLO1 CDS,204459,204460,WA -> TT
FLO1 CDS,204456,204456,A -> T
FLO1 CDS,204453,204454,CG -> TT
FLO1 CDS,204449,204451,CAC -> GTT
FLO1 CDS,204443,204448,TSSCCA -> GAAACC
FLO1 CDS,204435,204435,G -> C
FLO1 CDS,204431,204433,ACM -> CAA
FLO1 CDS,204428,204430,BHB -> CGGT
