In [275]:
import pathlib
import pandas as pd
import numpy as np
from typing import Union
import matplotlib.pyplot as plt
import seaborn as sns

In [276]:
def check_file(data_fname: Union[pathlib.Path, str]):
    """
    Validate that `data_fname` points to an existing path.

    Parameters
    ----------
    data_fname: pathlib.Path or str
        Location of the data file.

    Returns
    -------
    pathlib.Path
        `data_fname` converted to a pathlib.Path.

    Raises
    ------
    TypeError
        If `data_fname` cannot be turned into a path (a hint is printed
        before the error is re-raised).
    ValueError
        If the resulting path does not exist.
    """
    try:
        candidate = pathlib.Path(data_fname)
    except TypeError:
        print("ERROR: Please supply a string or a pathlib.Path instance.")
        raise

    if candidate.exists():
        return candidate
    raise ValueError(f"File {str(candidate)} doesn't exist.")

In [277]:
def dates_to_datetime_objects(micro_df: pd.DataFrame):
    """
    Parse the 'date' column into normalized datetimes and drop repeated dates.

    The 'date' values are parsed day-first, their time of day is reset to
    midnight, and only the first row of every date is kept. The input frame
    is left untouched; a new frame is returned.

    Parameters
    ----------
    micro_df: pd.DataFrame
        Frame with a 'date' column.

    Return
    ----------
    pd.DataFrame
        New frame with 'date' as its first column (datetime values),
        one row per date and a fresh 0..n-1 index.
    """
    parsed_dates = pd.to_datetime(micro_df['date'], dayfirst=True).dt.normalize()
    # assign() builds a new frame, so the caller's 'date' column is not overwritten.
    dated_df = micro_df.assign(date=parsed_dates).set_index('date')
    dated_df = dated_df[~dated_df.index.duplicated(keep='first')]
    # reset_index() moves 'date' back in as the first column, which the
    # positional split in split_microscopic_to_reactor relies on.
    return dated_df.reset_index()

In [278]:
def split_microscopic_to_reactor(micro_df: pd.DataFrame):
    """
    Split the wide microscopic frame into one frame per reactor.

    Column 0 is taken as the shared 'date' column; the remaining columns are
    cut into 4 equally wide, consecutive groups. Each group is returned with
    'date' as its index. For every group after the first, the last two
    characters of each column name are removed so that all frames end up
    with the same column names.

    Parameters
    ----------
    micro_df: pd.DataFrame
        Frame whose first column is 'date', followed by the columns of the
        4 reactors laid out side by side.

    Returns
    -------
    micro_df_list: List of 4 dfs, each representing a bio_reactor

    Raises
    ------
    ValueError
        If the non-date columns cannot be divided evenly between the reactors.
    """
    n_reactors = 4
    n_data_cols = len(micro_df.columns) - 1
    if n_data_cols % n_reactors != 0:
        raise ValueError(
            f"Expected the {n_data_cols} non-date columns to split evenly "
            f"into {n_reactors} reactors."
        )
    # Integer division keeps every column bound an int, as .iloc requires.
    cols_per_reactor = n_data_cols // n_reactors

    micro_df_list = []
    for i in range(n_reactors):
        first_col = 1 + cols_per_reactor * i
        last_col = first_col + cols_per_reactor
        micro_reactor_df = micro_df.iloc[:, [0, *range(first_col, last_col)]]
        micro_reactor_df = micro_reactor_df.set_index('date')
        if i > 0:
            # NOTE(review): presumably strips the ".1"/".2"/".3" suffix pandas
            # appends to repeated CSV headers - confirm against the raw file.
            micro_reactor_df = micro_reactor_df.rename(columns=lambda x: str(x)[:-2])
        micro_df_list.append(micro_reactor_df)

    return micro_df_list

In [279]:
def remove_nan_rows(micro_df: pd.DataFrame):
    """
    Drop, in place, every row in which all columns are NaN.

    Only column values are inspected; the index (e.g. a 'date' index) does
    not keep a row alive.

    Parameters
    ----------
    micro_df: pd.DataFrame
        Frame to prune. It is modified in place and nothing is returned.
    """
    micro_df.dropna(axis=0, how='all', inplace=True)

In [280]:
def fix_col_to_float(micro_df: pd.DataFrame, col_i: int):
    """
    Convert column number 'col_i' to a numeric dtype, removing commas first.

    Every comma inside a string value is removed (so "1,234,567" becomes
    1234567) and the whole column is then parsed with pd.to_numeric.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    col_i: int
        column index (position)

    Raises
    ------
    ValueError
        Propagated from pd.to_numeric when a value cannot be parsed.
    """
    col_name = micro_df.columns[col_i]
    # str.replace drops *all* commas, not only the first separator.
    without_commas = micro_df.iloc[:, col_i].map(
        lambda datum: datum.replace(',', '') if isinstance(datum, str) else datum
    )
    # Assigning by column name replaces the column, so the numeric dtype is
    # kept; `.loc[:, col] = ...` may write the numbers back into the existing
    # object-dtype column on pandas 2.x and leave the dtype as object.
    micro_df[col_name] = pd.to_numeric(without_commas)

In [281]:
def fix_object_cols_to_float(micro_df: pd.DataFrame):
    """
    Convert every 'object' dtype column to a numeric dtype.

    The positions of all object columns are collected first and each one is
    then handed to fix_col_to_float.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    # enumerate() walks the dtypes by position; indexing the label-indexed
    # `dtypes` Series with an integer key is a deprecated positional fallback.
    object_col_positions = [
        position
        for position, col_dtype in enumerate(micro_df.dtypes)
        if col_dtype == object
    ]

    for position in object_col_positions:
        fix_col_to_float(micro_df, position)

In [282]:
def remove_negatives(micro_df: pd.DataFrame):
    """
    Replaces negative values with NaN in every numeric column.
    Change df inplace.

    Non-numeric columns are left as they are. Values that are already NaN
    stay NaN.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    # Public API instead of the private `_get_numeric_data()`, and explicit
    # write-back instead of relying on an in-place edit of a derived frame
    # reaching `micro_df` (integer columns must be upcast to hold NaN, so
    # they cannot be edited through a view).
    numeric_cols = micro_df.select_dtypes(include='number').columns
    for col_name in numeric_cols:
        column = micro_df[col_name]
        micro_df[col_name] = column.where(column >= 0)

In [283]:
def filaments_zero_to_nan(micro_df: pd.DataFrame):
    """
    Blank out rows whose filament block holds only 0 or NaN.

    The block is 9 columns wide and starts one column before the first
    column whose name contains '0041_0675'. For every row in which all 9
    values are 0 or NaN, every column from the block start through the LAST
    column of the frame is set to NaN.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame

    Raises
    ------
    ValueError
        If no column name contains '0041_0675'.
    """
    marker_pos = next(
        (
            position
            for position, col_name in enumerate(micro_df.columns)
            if '0041_0675' in str(col_name)
        ),
        None,
    )
    if marker_pos is None:
        raise ValueError("No column containing '0041_0675' was found.")

    # NOTE(review): the block starts one column BEFORE the marker, presumably
    # to include a "Total count- Filaments" column - confirm against the data.
    # Clamped at 0 so a marker in the first column cannot yield a negative start.
    first_filament = max(marker_pos - 1, 0)

    filament_block = micro_df.iloc[:, first_filament:first_filament + 9]
    no_filaments = filament_block.isin([0, np.nan]).all(axis=1).to_numpy()

    # NOTE(review): the slice is open-ended, so columns located after the
    # 9-column block are cleared as well - confirm this is intended.
    micro_df.iloc[no_filaments, first_filament:] = np.nan

In [284]:
def all_zero_to_nan(micro_df: pd.DataFrame):
    """
    Turn rows that carry no information into all-NaN rows.

    A row in which every value is either 0 or NaN has all of its values
    replaced by NaN. Other rows are not touched.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    empty_markers = [0, np.nan]
    for row_pos in range(len(micro_df)):
        row = micro_df.iloc[row_pos, :]
        if not row.isin(empty_markers).all():
            # at least one real measurement - keep the row as is
            continue
        micro_df.iloc[row_pos, :] = np.nan

In [285]:
def floc_size(micro_df: pd.DataFrame):
    """
    Expand the coded "Floc size" column into three indicator columns.

    Adds "Floc_size_small", "Floc_size_medium" and "Floc_size_large" to
    `micro_df` (in place). Each holds 1 when the "Floc size" code is in that
    column's "present" list, 0 when it is in its "absent" list, and NaN for
    any other value.

    Parameters
    ----------
    micro_df: pd.DataFrame
        Frame with a "Floc size" column.

    Return
    ----------
    pd.DataFrame
        A new frame without the "Floc size" column (the frame passed in
        keeps it, together with the three added columns).
    """
    # new column -> (codes mapped to 1, codes mapped to 0)
    code_table = {
        "Floc_size_small": ([1, 4, 6, 7, 12, 13, 123], [2, 3, 5, 23, 2.5]),
        "Floc_size_medium": ([2, 5, 6, 7, 12, 23, 2.5, 123], [1, 3, 4, 13]),
        "Floc_size_large": ([3, 4, 5, 7, 13, 23, 2.5, 123], [1, 2, 6, 12]),
    }
    for flag_col, (present_codes, absent_codes) in code_table.items():
        is_absent = np.where(micro_df["Floc size"].isin(absent_codes), 0, np.nan)
        micro_df[flag_col] = np.where(
            micro_df["Floc size"].isin(present_codes), 1, is_absent
        )
    return micro_df.drop(["Floc size"], axis=1)

In [296]:
def floc_shape(micro_df: pd.DataFrame):
    """
    Derive the "Shape_close" and "Shape_open" indicator columns.

    Each indicator is decided by an ordered list of rules of the form
    (source column, codes, flag): the first rule whose source column holds
    one of its codes sets the flag (1 or 0); rows matched by no rule get NaN.
    The rules read the "Close", "Open", "Open and close" and "Shape" columns.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    def first_match(rules):
        # Fold from the lowest-priority rule outwards so earlier rules win.
        flags = np.nan
        for source_col, codes, flag in reversed(rules):
            flags = np.where(micro_df[source_col].isin(codes), flag, flags)
        return flags

    micro_df["Shape_close"] = first_match([
        ("Close", [3, 0.6, 0.7, 2, 60, 0.02, 0.5, 25], 1),
        ("Open", [3, 40, 0.5, 75, 0.4, 2], 1),
        ("Close", [0, 1], 0),
        ("Open and close", [2, 3], 1),
        ("Shape", [2, 3], 1),
        ("Shape", [1], 0),
    ])
    micro_df["Shape_open"] = first_match([
        ("Open", [3, 40, 0.04, 0.5, 75, 1, 0.4], 1),
        ("Close", [3, 0.6, 0.7, 0.5, 25, 1], 1),
        ("Open", [0, 2], 0),
        ("Open and close", [3], 1),
        ("Open and close", [2], 0),
        ("Shape", [1, 3], 1),
        ("Shape", [2], 0),
    ])

In [297]:
def clean_micro_df(micro_df: pd.DataFrame):
    """
    Run the full cleaning pipeline on one reactor's microscopic frame.

    The in-place steps run first, in a fixed order, followed by the floc
    size and floc shape encodings.

    Parameters
    ----------
    micro_df: pd.DataFrame
        Frame to clean; the in-place steps modify it directly.

    Return
    ----------
    micro_df: pd.DataFrame
        The frame returned by floc_size, with the floc shape columns added.
    """
    in_place_steps = (
        remove_nan_rows,
        fix_object_cols_to_float,
        remove_negatives,
        filaments_zero_to_nan,
        all_zero_to_nan,
    )
    for step in in_place_steps:
        step(micro_df)

    # floc_size hands back a new frame; floc_shape then edits that one in place.
    with_size_flags = floc_size(micro_df)
    floc_shape(with_size_flags)
    return with_size_flags

In [298]:
def clean_micro_df_list(micro_df_list: list):
    """
    Clean every dataframe in the list with clean_micro_df.

    Each list slot is overwritten with the cleaned frame, so the list is
    changed in place and nothing is returned. The list is normally the 4
    reactor frames, but any length (including an empty list) is accepted.

    Parameters
    ----------
    micro_df_list: list
        List of pd.DataFrame, one per reactor.
    """
    # Iterate over the actual length instead of a hard-coded 4, so shorter
    # lists do not raise IndexError and longer lists are cleaned completely.
    for i in range(len(micro_df_list)):
        micro_df_list[i] = clean_micro_df(micro_df_list[i])

In [299]:
def save_dfs_to_csv(df_list: list, data_name: str):
    """
    Save the split, cleaned bio reactor dataframes to csv files.

    Frame number k (counting from 1) is written, with its index, to
    "clean_tables/<data_name>_<k>.csv". The "clean_tables" directory is
    created when it is missing. If a file already exists, saving it is
    skipped.

    Parameters
    ----------
    df_list: list
        List of pd.DataFrame, one per reactor (normally 4).
    data_name: str
        desirable csv file name, either "svi" or "micro"

    Raises
    ------
    AssertionError
        If `data_name` is not "svi" or "micro".
    """
    assert data_name in {"svi", "micro"}, 'data_name invalid, expected "svi"/"micro"'
    out_dir = pathlib.Path("clean_tables")
    for reactor_no, reactor_df in enumerate(df_list, start=1):
        fname = out_dir / f"{data_name}_{reactor_no}.csv"
        if fname.is_file():  # never overwrite an existing export
            continue
        # to_csv does not create missing directories itself
        fname.parent.mkdir(parents=True, exist_ok=True)
        reactor_df.to_csv(fname, index=True)

In [300]:
# Pipeline driver: load the raw microscopic table, split it per reactor,
# clean every reactor frame and export the results.
# The names assigned here are module-level; later cells read `micro_df_list`.
if __name__ == "__main__":
    # check_file raises ValueError if the csv is not found at this relative path
    micro_path = check_file("micro - total.csv")
    micro_df = pd.read_csv(micro_path)
    # parse 'date' day-first, normalize it and keep the first row per date
    micro_df = dates_to_datetime_objects(micro_df)
    # one frame per reactor, each indexed by 'date'
    micro_df_list = split_microscopic_to_reactor(micro_df)
    # overwrites each list slot with its cleaned frame (returns nothing)
    clean_micro_df_list(micro_df_list)
    # writes clean_tables/micro_<k>.csv; files that already exist are skipped
    save_dfs_to_csv(micro_df_list,"micro")

In [301]:
# Column labels of the first reactor's cleaned frame.
micro_df_list[0].columns

Index(['arcella', 'nude ameba', 'aspidisca', 'trachelopylum', 'lionutus',
       'paramecium', 'carchecium', 'epistylis', 'opercularia', 'podophyra',
       'tokophyra', 'vorticella', 'rotifer', 'nematode', 'worms',
       'micro flagellates', 'peranema trich', 'spirochaetes', '0041_0675',
       '0092', '1851', 'beggiatoa', 'Microthrix', 'N. Limicola', 'Nocardia',
       'Thiothrix', 'zoogloea', 'Floc Strength', 'Indian Ink',
       'Filament index', 'Close', 'Open and close', 'Open', 'Shape',
       'In and out', 'In', 'Free filaments', 'Filaments location',
       'Floc_size_small', 'Floc_size_medium', 'Floc_size_large', 'Shape_close',
       'Shape_open'],
      dtype='object')

In [302]:
# Keep only the two derived shape indicator columns of the first reactor for inspection.
r= micro_df_list[0][["Shape_open", "Shape_close"]]

In [303]:
# Rows at positions 200-249 (fewer are shown when the frame is shorter).
r[200:250]

Unnamed: 0_level_0,Shape_open,Shape_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-08-06,,
2015-09-03,1.0,1.0
2015-09-20,1.0,1.0
2015-10-07,1.0,1.0
2015-10-14,1.0,0.0
2015-10-18,1.0,0.0
2015-10-20,1.0,1.0
2015-12-09,1.0,0.0
2015-12-15,,
2015-12-17,,
