In [886]:
import pathlib
import pandas as pd
import numpy as np
from typing import Union
import matplotlib.pyplot as plt
import seaborn as sns

In [887]:
def check_file(data_fname: Union[pathlib.Path, str]):
    """
    Validate a file location and hand it back as a path object.

    Parameters
    ----------
    data_fname: pathlib.Path or str
        Location to check.

    Returns
    -------
    fname: pathlib.Path
        ``data_fname`` converted to a pathlib.Path.

    Raises
    ------
    TypeError
        If ``data_fname`` cannot be turned into a pathlib.Path
        (a hint is printed before the error propagates).
    ValueError
        If the resulting path does not exist.
    """
    try:
        candidate = pathlib.Path(data_fname)
    except TypeError:
        # Print a readable hint, then let the original exception continue upwards.
        print("ERROR: Please supply a string or a pathlib.Path instance.")
        raise

    if candidate.exists():
        return candidate
    raise ValueError(f"File {str(candidate)} doesn't exist.")

In [888]:
def dates_to_datetime_objects(micro_df: pd.DataFrame):
    """
    Parse the 'date' column into datetimes normalized to midnight and keep
    only the first row for every date.

    The frame passed in is not modified; a new frame is returned.

    Parameters
    ----------
    micro_df: pd.DataFrame
        Frame with a 'date' column holding day-first dates.

    Return
    ----------
    micro_df: pd.DataFrame
        New frame in which 'date' is the first column (datetime, time set to
        00:00), repeated dates are dropped (first occurrence kept) and the
        index is a fresh 0..n-1 range.
    """
    parsed_dates = pd.to_datetime(micro_df['date'], dayfirst=True).dt.normalize()
    # assign() builds a new frame, so the caller's 'date' column is left as it was.
    result = micro_df.assign(date=parsed_dates)
    # Round-trip through the index: this drops repeated dates and also moves
    # 'date' to the first column position.
    result = result.set_index('date')
    result = result[~result.index.duplicated(keep='first')]
    return result.reset_index()

In [889]:
def split_microscopic_to_reactor(micro_df: pd.DataFrame):
    """
    Split the combined microscopic frame into 4 per-reactor frames.

    Column 0 is shared by every reactor; the remaining columns are cut into
    4 consecutive groups of equal width. For reactors 2-4 the last two
    characters of every column name are removed so that all four frames end
    up with the same column names (presumably this strips the '.1'/'.2'/'.3'
    suffix added to repeated CSV headers - verify against the input file).

    Parameters
    ----------
    micro_df: pd.DataFrame
        Combined frame whose first column is named 'date'.

    Returns
    -------
    micro_df_list: List of 4 dfs, each representing a bio_reactor,
        indexed by 'date'.
    """
    n_reactors = 4
    # Integer division keeps every positional indexer an int. If the column
    # count is not a multiple of 4, the trailing remainder columns are left out.
    cols_per_reactor = (len(micro_df.columns) - 1) // n_reactors

    micro_df_list = []
    for i in range(n_reactors):
        first_col = 1 + cols_per_reactor * i
        last_col = first_col + cols_per_reactor
        # Column 0 ('date') plus this reactor's own group of columns.
        micro_reactor_df = micro_df.iloc[:, [0, *range(first_col, last_col)]]
        micro_reactor_df = micro_reactor_df.set_index('date')
        if i > 0:
            micro_reactor_df = micro_reactor_df.rename(columns=lambda x: str(x)[:-2])
        micro_df_list.append(micro_reactor_df)

    return micro_df_list

In [890]:
def remove_nan_rows(micro_df: pd.DataFrame):
    """
    Remove rows whose values are all NaN.
    Only column values are inspected; the index (e.g. a date index) is ignored.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame

    Returns
    -------
    None
    """
    # dropna(inplace=True) returns None, so its result is deliberately not bound.
    micro_df.dropna(how='all', inplace=True)

In [891]:
def fix_col_to_float(micro_df: pd.DataFrame, col_i: int):
    """ 
    Convert the column at position 'col_i' to a numeric dtype.
    String values have every comma removed first (e.g. "1,234,567" -> 1234567).
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    col_i: int
        column index

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a number after commas are removed
        (raised by pd.to_numeric).
    """
    col_name = micro_df.columns[col_i]

    def _strip_commas(datum):
        # Only strings can carry separators; numbers and NaN pass through as-is.
        return datum.replace(',', '') if isinstance(datum, str) else datum

    cleaned = micro_df.iloc[:, col_i].map(_strip_commas)
    # Plain column assignment replaces the column, so the numeric dtype sticks.
    # Setting through .loc[:, name] may write into the existing object-dtype
    # column on recent pandas versions and leave the dtype unchanged.
    micro_df[col_name] = pd.to_numeric(cleaned)

In [892]:
def fix_object_cols_to_float(micro_df: pd.DataFrame):
    """
    Convert every 'object' dtype column to a numeric dtype by passing its
    position to fix_col_to_float.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    # Walk the dtypes by position with enumerate: indexing the label-indexed
    # dtypes Series with an integer key relies on deprecated positional fallback.
    obj_cols_is = [
        col_i for col_i, dtype in enumerate(micro_df.dtypes) if dtype == object
    ]

    for col_i in obj_cols_is:
        fix_col_to_float(micro_df, col_i)

In [893]:
def remove_negatives(micro_df: pd.DataFrame):
    """
    Replaces negative values in the numeric columns with NaN.
    Non-numeric columns are left as they are. Integer columns that receive a
    NaN become float columns.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    numeric_cols = micro_df.select_dtypes(include="number").columns
    if len(numeric_cols) == 0:
        return

    numeric = micro_df[numeric_cols]
    # Write the result back by column name instead of mutating a derived frame,
    # so the change reaches micro_df whether pandas hands out views or copies.
    # NaN fails the ">= 0" comparison and therefore simply stays NaN.
    micro_df[numeric_cols] = numeric.where(numeric >= 0)

In [894]:
def filaments_zero_to_nan(micro_df: pd.DataFrame,
                          first_marker: str = '0041_0675',
                          n_filaments: int = 9):
    """
    If a row has all its "filament" columns 0 or NaN,
    turns all the "filament" values of that row to NaN.
    Change df inplace.

    The filament block is the run of 'n_filaments' consecutive columns that
    starts at the first column whose name contains 'first_marker'.

    Parameters
    ----------
    micro_df: pd.DataFrame
    first_marker: str
        Text identifying the first filament column (default '0041_0675').
    n_filaments: int
        Number of consecutive filament columns (default 9).

    Raises
    ------
    ValueError
        If no column name contains 'first_marker'.
    """
    # Position of the first filament column.
    first_filament = next(
        (pos for pos, name in enumerate(micro_df.columns) if first_marker in str(name)),
        None,
    )
    if first_filament is None:
        raise ValueError(f"No column name contains '{first_marker}'.")

    filament_cols = slice(first_filament, first_filament + n_filaments)
    # True for rows in which every filament value is 0 or NaN.
    empty_rows = micro_df.iloc[:, filament_cols].isin([0, np.nan]).all(axis=1).to_numpy()
    if empty_rows.any():
        micro_df.iloc[empty_rows, filament_cols] = np.nan

In [895]:
def all_zero_to_nan(micro_df: pd.DataFrame):
    """
    Blank out rows that carry no information: when every value in a row is
    0 or NaN, the whole row is set to NaN.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    # First collect the positions of rows made up solely of zeros / NaNs ...
    blank_rows = [
        pos
        for pos in range(len(micro_df))
        if micro_df.iloc[pos].isin([0, np.nan]).all()
    ]
    # ... then overwrite each of those rows with NaN.
    for pos in blank_rows:
        micro_df.iloc[pos, :] = np.nan

In [896]:
def floc_size(micro_df: pd.DataFrame):
    """
    Expand the coded "Floc size" column into three flag columns:
    "Floc_size_small", "Floc_size_medium" and "Floc_size_large".

    Each flag is 1 when the code is in that flag's "present" list, 0 when it
    is in the "absent" list, and NaN for any other value (including NaN).
    The flag columns are added to the frame passed in; the returned frame is
    a new one without the "Floc size" column.

    Parameters
    ----------
    micro_df: pd.DataFrame
        Frame with a "Floc size" column.

    Return
    ----------
    micro_df: pd.DataFrame
    """
    size_code = micro_df["Floc size"]
    # (new column, codes flagged 1, codes flagged 0)
    code_table = (
        ("Floc_size_small", [1, 4, 6, 7, 12, 13, 123], [2, 3, 5, 23, 2.5]),
        ("Floc_size_medium", [2, 5, 6, 7, 12, 23, 2.5, 123], [1, 3, 4, 13]),
        ("Floc_size_large", [3, 4, 5, 7, 13, 23, 2.5, 123], [1, 2, 6, 12]),
    )
    for flag_col, present, absent in code_table:
        micro_df[flag_col] = np.select(
            [size_code.isin(present), size_code.isin(absent)],
            [1, 0],
            default=np.nan,
        )
    return micro_df.drop(columns=["Floc size"])

In [897]:
def floc_shape(micro_df: pd.DataFrame):
    """
    Collapse the "Close", "Open", "Open and close" and "Shape" columns into
    two 0/1 flag columns, "Shape_close" and "Shape_open".

    For each flag the rules below are tried in order and the first rule whose
    column value is in its list decides the flag; rows matching no rule get 0.
    The flag columns are added to the frame passed in; the returned frame is
    a new one without the four source columns.
    NOTE(review): the meaning of the individual numeric codes is not
    documented in this notebook - confirm against the data dictionary.

    Parameters
    ----------
    micro_df: pd.DataFrame

    Return
    ----------
    micro_df: pd.DataFrame
    """
    close, open_, both, shape = (
        micro_df[name] for name in ("Close", "Open", "Open and close", "Shape")
    )
    # Two-decimal value lists used by the rules below.
    upper_fracs = np.round(np.arange (0.15,0.95,0.05),2).tolist()
    lower_fracs = np.round(np.arange (0.05,0.85,0.05),2).tolist()

    close_rules = [
        (close.isin([3, 2, 60, 0.02, 25, 95] + upper_fracs), 1),
        (open_.isin([3, 40, 75, 2] + lower_fracs), 1),
        (both.isin([2, 3]), 1),
        (close.isin([0, 1]), 0),
        (shape.isin([2, 3]), 1),
        (shape.isin([1]), 0),
    ]
    open_rules = [
        (open_.isin([3, 40, 0.04, 75, 1, 95] + upper_fracs), 1),
        (close.isin([3, 25, 1] + lower_fracs), 1),
        (both.isin([3]), 1),
        (open_.isin([0, 2]), 0),
        (both.isin([2]), 0),
        (shape.isin([1, 3]), 1),
        (shape.isin([2]), 0),
    ]

    # np.select picks the value of the first matching rule for every row.
    micro_df["Shape_close"] = np.select(
        [cond for cond, _ in close_rules],
        [flag for _, flag in close_rules],
        default=np.nan,
    )
    micro_df["Shape_open"] = np.select(
        [cond for cond, _ in open_rules],
        [flag for _, flag in open_rules],
        default=np.nan,
    )

    flag_cols = ["Shape_open", "Shape_close"]
    micro_df[flag_cols] = micro_df[flag_cols].fillna(0)
    return micro_df.drop(columns=["Close", "Open", "Open and close", "Shape"])

In [898]:
def floc_filament_location(micro_df: pd.DataFrame):
    """
    Collapse the "In and out", "In", "Free filaments" and
    "Filaments location" columns into two 0/1 flag columns,
    "Filaments_in_floc" and "Free_filaments".

    For each flag the rules below are tried in order and the first rule whose
    column value is in its list decides the flag; rows matching no rule get 0.
    The flag columns are added to the frame passed in; the returned frame is
    a new one without the four source columns.
    NOTE(review): the meaning of the individual numeric codes is not
    documented in this notebook - confirm against the data dictionary.

    Parameters
    ----------
    micro_df: pd.DataFrame

    Return
    ----------
    micro_df: pd.DataFrame
    """
    in_out, inside, free, location = (
        micro_df[name]
        for name in ("In and out", "In", "Free filaments", "Filaments location")
    )
    # Two-decimal value lists used by the rules below.
    fracs_from_005 = np.round(np.arange (0.05,0.99,0.05),2).tolist()
    fracs_from_015 = np.round(np.arange (0.15,0.99,0.05),2).tolist()
    fracs_to_080 = np.round(np.arange (0.05,0.85,0.05),2).tolist()

    in_floc_rules = [
        (in_out.isin([1, 2, 95] + fracs_from_005), 1),
        (inside.isin([1, 2, 0.98] + fracs_from_015), 1),
        (free.isin([3]), 0),
        (free.isin([2] + fracs_to_080), 1),
        (location.isin([1, 2]), 1),
        (location.isin([3]), 0),
        (in_out.isin([3]), 0),
    ]
    free_rules = [
        (in_out.isin([2, 3, 95] + fracs_from_005), 1),
        (free.isin([2, 3] + fracs_from_015), 1),
        (inside.isin([1, 0.98]), 0),
        (inside.isin([2] + fracs_to_080), 1),
        (location.isin([2, 3]), 1),
        (location.isin([1]), 0),
        (in_out.isin([1]), 0),
    ]

    # np.select picks the value of the first matching rule for every row.
    micro_df["Filaments_in_floc"] = np.select(
        [cond for cond, _ in in_floc_rules],
        [flag for _, flag in in_floc_rules],
        default=np.nan,
    )
    micro_df["Free_filaments"] = np.select(
        [cond for cond, _ in free_rules],
        [flag for _, flag in free_rules],
        default=np.nan,
    )

    flag_cols = ["Filaments_in_floc", "Free_filaments"]
    micro_df[flag_cols] = micro_df[flag_cols].fillna(0)
    return micro_df.drop(columns=["In and out", "In", "Free filaments", "Filaments location"])

In [914]:
def remove_organisms_nan_rows(micro_df: pd.DataFrame, max_missing: int = 16):
    """
    Drop rows in which too many of the organism counts are missing.

    The organism columns are the label range "arcella" to "spirochaetes"
    (both included). A row is dropped when more than 'max_missing' of those
    columns are NaN. Rows are dropped by index label.

    Parameters
    ----------
    micro_df: pd.DataFrame
    max_missing: int
        Largest number of NaN organism values a row may have and still be
        kept (default 16).

    Return
    ----------
    micro_df: pd.DataFrame
        New frame without the dropped rows; the input is not modified.
    """
    missing_per_row = micro_df.loc[:, "arcella" : "spirochaetes"].isna().sum(axis=1)
    sparse_labels = missing_per_row.index[missing_per_row > max_missing]
    return micro_df.drop(index=sparse_labels)

In [915]:
# micro_path = check_file("micro - total.csv")
# micro_df = pd.read_csv(micro_path)
# micro_df = dates_to_datetime_objects(micro_df)
# micro_df_list = split_microscopic_to_reactor(micro_df)

In [916]:
# micro_df_list[0]

In [917]:
# remove_nan_rows(micro_df_list[0])
# fix_object_cols_to_float(micro_df_list[0])
# remove_negatives(micro_df_list[0])

In [918]:
# micro_df_list[0]

In [919]:
# for i in range(len(micro_df_list[0].columns)):
#         if '0041_0675' in micro_df.columns[i]:
#             first_filament = i-1
#             break


In [920]:
# micro_df_list[0].iloc[303, :]

In [921]:
# micro_df_list[0].iloc[303, first_filament:first_filament+9] = np.nan

In [922]:
# micro_df_list[0].iloc[303,:]

In [923]:
# for i in range(micro_df.shape[0]):
#         # if all filaments are NaN or Zero, turn them all to NaN
#         if (micro_df.iloc[i, first_filament:first_filament+9].isin([0, np.nan])).all():
#              micro_df.iloc[i, first_filament:first_filament+9] = np.nan

In [939]:
def clean_micro_df(micro_df: pd.DataFrame):
    """
    Run the full cleaning pipeline on one microscopic dataframe.

    The first three steps modify the frame passed in; the later steps work on
    (and return) new frames.

    Parameters
    ----------
    micro_df: pd.DataFrame

    Return
    ----------
    micro_df: pd.DataFrame
        The cleaned frame.
    """
    # Steps that edit the incoming frame in place.
    for inplace_step in (filaments_zero_to_nan, all_zero_to_nan, remove_nan_rows):
        inplace_step(micro_df)

    cleaned = remove_organisms_nan_rows(micro_df)

    # Remaining gaps in the organism columns are filled with 0.
    organisms = cleaned.loc[:, "arcella" : "spirochaetes"]
    cleaned.loc[:, "arcella" : "spirochaetes"] = organisms.fillna(0)

    # Dtype repair and negative removal, again in place on the new frame.
    for inplace_step in (fix_object_cols_to_float, remove_negatives):
        inplace_step(cleaned)

    # Recoding steps, each of which hands back a new frame.
    for recode in (floc_size, floc_shape, floc_filament_location):
        cleaned = recode(cleaned)
    return cleaned

In [940]:
def clean_micro_df_list(micro_df_list: list):
    """
    Loop over the dataframes in the dataframe list (normally the 4 reactors)
    and use the clean_micro_df to clean values.
    Each list entry is replaced by its cleaned dataframe, so the list itself
    is changed inplace.

    Parameters
    ----------
    micro_df_list: list
    """
    # Iterate over whatever the list holds instead of assuming exactly 4 entries.
    for i, reactor_df in enumerate(micro_df_list):
        micro_df_list[i] = clean_micro_df(reactor_df)

In [941]:
def save_dfs_to_csv(df_list: list, data_name: str):
    """
    Save the split, cleaned list of bio reactors dataframes (normally 4) to
    csv files named "clean_tables/<data_name>_<n>.csv", n starting at 1.
    The "clean_tables" directory is created when it is missing.
    If a file already exists, saving that dataframe is skipped.

    Parameters
    ----------
    df_list: list
    data_name: str
        desirable csv file name, either "svi" or "micro"

    Raises
    ------
    AssertionError
        If data_name is not "svi" or "micro".
    """
    assert data_name in {"svi", "micro"}, 'data_name invalid, expected "svi"/"micro"'
    out_dir = pathlib.Path("clean_tables")
    # to_csv does not create missing directories, so make sure the target exists.
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, df in enumerate(df_list, start=1):
        fname = out_dir / f"{data_name}_{i}.csv"
        if not fname.is_file():  # only if it does not exist yet
            df.to_csv(fname, index=True)

In [943]:
if __name__ == "__main__":
    # End-to-end run: load the combined CSV, split it per reactor, clean each
    # reactor frame and write the results to disk.
    # The names bound here are module-level and are read by later cells.
    micro_path = check_file("micro - total.csv")  # raises ValueError if the file is missing
    micro_df = pd.read_csv(micro_path)
    micro_df = dates_to_datetime_objects(micro_df)  # parse dates, keep first row per date
    micro_df_list = split_microscopic_to_reactor(micro_df)  # list of 4 frames indexed by date
    clean_micro_df_list(micro_df_list)  # replaces the list entries with cleaned frames
    save_dfs_to_csv(micro_df_list,"micro")  # files that already exist are skipped

In [932]:
# Display the first frame in micro_df_list.
# NOTE(review): this cell's execution count (932) is lower than that of the
# pipeline cell above (943), so the rendered table may predate the latest run -
# re-run to confirm.
micro_df_list[0]

Unnamed: 0_level_0,arcella,nude ameba,aspidisca,trachelopylum,lionutus,paramecium,carchecium,epistylis,opercularia,podophyra,...,Floc Strength,Indian Ink,Filament index,Floc_size_small,Floc_size_medium,Floc_size_large,Shape_close,Shape_open,Filaments_in_floc,Free_filaments
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-18,40.0,4.0,2.0,0.0,6.0,0.0,1.0,4.0,4.0,0.0,...,,,,,,,0.0,1.0,1.0,1.0
2010-03-02,27.0,5.0,3.0,1.0,16.0,0.0,0.0,13.0,4.0,0.0,...,,,,,,,0.0,1.0,1.0,0.0
2010-03-08,27.0,8.0,14.0,1.0,9.0,1.0,0.0,11.0,9.0,0.0,...,,,,,,,1.0,1.0,1.0,1.0
2010-03-18,11.0,12.0,2.0,0.0,16.0,0.0,0.0,7.0,0.0,0.0,...,,,,,,,0.0,1.0,1.0,1.0
2010-04-08,12.0,6.0,10.0,0.0,13.0,1.0,0.0,1.0,1.0,0.0,...,,,,,,,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-23,5.0,11.0,9.0,2.0,5.0,0.0,0.0,6.0,0.0,3.0,...,3.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-10-21,7.0,57.0,55.0,0.0,12.0,0.0,14.0,26.0,0.0,0.0,...,1.0,3.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
2020-10-28,14.0,20.0,1.0,0.0,24.0,0.0,0.0,35.0,2.0,0.0,...,2.0,3.0,2.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-11-11,23.0,86.0,17.0,1.0,29.0,0.0,0.0,18.0,0.0,0.0,...,3.0,2.0,2.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
