In [2]:
import pathlib
import pandas as pd
import numpy as np
from typing import Union
import matplotlib.pyplot as plt
import seaborn as sns

In [124]:
def check_file(data_fname: Union[pathlib.Path, str]):
    """
    Check for valid file name
    accept strings and pathlib.Path objects

    Parameters
    ----------
    data_fname: pathlib.Path or str

    return
    ----------
    fname: pathlib.Path
    """

    try:
        fname = pathlib.Path(data_fname)
    except TypeError:
        print("ERROR: Please supply a string or a pathlib.Path instance.")
        raise
    if not fname.exists():
        raise ValueError(f"File {str(fname)} doesn't exist.")
    return fname

In [130]:
def dates_to_datetime_objects(micro_df: pd.DataFrame):
    """
    Change 'date' column from string to datetime objects and normalize the time
    Set the time to be the index of the df

    Parameters
    ----------
    df_list: list of df with column 'date'

    Return
    ----------
    df_list: list of df
    """
    
    micro_df['Time'] = pd.to_datetime(micro_df['Time'], dayfirst=True).dt.normalize()
    micro_df = micro_df.set_index('Time')
    micro_df = micro_df[~micro_df.index.duplicated(keep='first')]
    micro_df = micro_df.reset_index()
    return micro_df

In [131]:
def split_microscopic_to_reactor(micro_df: pd.DataFrame):
    """
    Splits the microscopic data to 4 reactors dfs and saves it in df list
    Changes the columns names to be identical in the microscopic data frame of each reactor.
    
    Parameters
    ----------
    data_micro: pd.DataFrame
    
    Returns
    -------
    micro_df_list: List of 4 dfs, each representing a bio_reactor
    """
    micro_df_list = []
    for i in range(0, 4):
        # 27 columns for each reactor, starting with 1:27...
        first_col = 1 + 27 * i
        last_col = 1 + 27 * (i + 1)
        micro_reactor_df = micro_df.iloc[:, np.r_[0, first_col:last_col]]
        micro_reactor_df.columns = [
            "date",
            "ameoba_arcella",
            "ameoba_nude ameba",
            "crawling ciliates_aspidisca",
            "crawling ciliates_trachelopylum",
            "free swimming ciliates_lionutus",
            "free swimming ciliates_paramecium",
            "stalked ciliate_epistylis",
            "stalked ciliate_vorticella",
            "stalked ciliate_carchecium",
            "stalked ciliate_tokophyra",
            "stalked ciliate_podophyra",
            "stalked ciliate_opercularia",
            "rotifer_rotifer",
            "worms_nematode",
            "worms_worms",
            "flagellates_peranema trich",
            "flagellates_micro flagellates",
            "spirochaetes_spirochaetes",
            "Filaments_Nocardia_index",
            "Filaments_Microthrix_index",
            "Filaments_N. Limicola_index",
            "Filaments_Thiothrix_index",
            "Filaments_0041/0675_index",
            "Filaments_0092_index",
            "Filaments_1851_index",
            "Filaments_beggiatoa_index",
            "Filaments_zoogloea_index",
        ]
        micro_df_list.append(micro_reactor_df)

    return micro_df_list

In [162]:
def remove_nan_rows(micro_df: pd.DataFrame):
    """
    Remove rows that contain only nan values (except date column).
    Change df inplace.
    
    Parameters
    ----------
    micro_df: pd.DataFrame
   
    """
    data_cols = micro_df.columns.tolist()[1:]
    micro_df.dropna(how = 'all', subset = data_cols, inplace=True)

In [133]:
def fix_col_to_float(micro_df: pd.DataFrame, col_i: int):
    """ 
    Fix string values with commas to float values, in column number 'col_i'.
    Change df inplace.
    
    Parameters
    ----------
    micro_df: pd.DataFrame
    col_i: int
        column index

    """
    for row_i in range(micro_df.shape[0]):
        datum = micro_df.iloc[row_i, col_i]
        if type(datum) is str and ',' in datum:
            num = datum.split(',')
            micro_df.iloc[row_i, col_i] = num[0]+num[1]

    col_name = micro_df.columns[col_i]
    micro_df.loc[:, col_name] = pd.to_numeric(micro_df[col_name])

In [134]:
def fix_object_cols_to_float(micro_df: pd.DataFrame):
    """
    Convert 'object' columns with string numbers to dtype float
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    obj_cols_is = [] 
    for col_i in range(1, len(micro_df.dtypes)): # exclude 'date' column
        if micro_df.dtypes[col_i]==object:
            obj_cols_is.append(col_i)
    
    for col_i in obj_cols_is:
        fix_col_to_float(micro_df, col_i)

In [135]:
def remove_negatives(micro_df: pd.DataFrame):
    """
    Replaces negative values with NaN.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """

    numeric = micro_df._get_numeric_data()
    numeric.where(numeric>=0, np.nan, inplace=True)

In [136]:
def filaments_zero_to_nan(micro_df: pd.DataFrame):
    """
    If a row has all its "filament" columns 0 or NaN,
    turns all the "filament" values, including the "Total count- Filaments" to NaN.
    Change df inplace.

    Parameters
    ----------
    micro_df: pd.DataFrame
    """
    ## find col index of first filament:
    for i in range(len(micro_df.columns)):
        if 'Filaments' in micro_df.columns[i]:
            first_filament = i
            break

    for i in range(micro_df.shape[0]):
        # if all fillaments are NaN or Zero, turn them all, including "Total" to NaN
        if (pd.isnull(micro_df.iloc[i, first_filament + 1:])).all() or (micro_df.iloc[i, first_filament + 1:]==0).all():
            micro_df.iloc[i, first_filament:] = np.nan

In [102]:
def clean_micro_df(micro_df: pd.DataFrame):
    """
    Cleans values of microscopic dataframes with all the cleansing functions

    Parameters
    ----------
    micro_df: pd.DataFrame

    Return
    ----------
    micro_df: pd.DataFrame
    """
    remove_nan_rows(micro_df)
    fix_object_cols_to_float(micro_df)
    remove_negatives(micro_df)
    filaments_zero_to_nan(micro_df)
    return micro_df

In [103]:
def clean_micro_df_list(micro_df_list: list):
    """
    Loop over the 4 dataframes in the dataframe list
    and use the clean_micro_df to clean values.
    Changes all df in list inplace.

    Parameters
    ----------
    micro_df_list: list
    """
    for i in range(4):
        micro_df_list[i] = clean_micro_df(micro_df_list[i])

In [114]:
def set_index_df_list(micro_df_list: list):
    """
    Loop over the 4 dataframes in the dataframe list and set index for date
    """
    
    for i in range(4):
        micro_df_list[i] = micro_df_list[i].set_index('date')
    

In [104]:
def save_dfs_to_csv(df_list: list, data_name: str):
    """
    Save the split, cleaned list of 4 bio reactors dataframes to csv file.
    If files already exists, skips saving.

    Parameters
    ----------
    df_list: list
    data_name: str
        desirable csv file name
    """
    assert data_name in {"svi", "micro"}, 'data_name invalid, expected "svi"/"micro"'
    for i in range(4):
        fname = pathlib.Path("micro_tables/" + f"{data_name}_{i}.csv")
        if not pathlib.Path(fname).is_file():  # only if it does not exist yet
            df_list[i].to_csv(fname, index=False)

In [175]:
if __name__ == "__main__":
    micro_path = check_file("micro - total.csv")
    micro_df = pd.read_csv(micro_path)
    micro_df = dates_to_datetime_objects(micro_df)
    micro_df_list = split_microscopic_to_reactor(micro_df)
    clean_micro_df_list(micro_df_list)
    save_dfs_to_csv(micro_df_list,"micro")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [172]:
micro_df_list[0]

Unnamed: 0_level_0,ameoba_arcella,ameoba_nude ameba,crawling ciliates_aspidisca,crawling ciliates_trachelopylum,free swimming ciliates_lionutus,free swimming ciliates_paramecium,stalked ciliate_epistylis,stalked ciliate_vorticella,stalked ciliate_carchecium,stalked ciliate_tokophyra,...,spirochaetes_spirochaetes,Filaments_Nocardia_index,Filaments_Microthrix_index,Filaments_N. Limicola_index,Filaments_Thiothrix_index,Filaments_0041/0675_index,Filaments_0092_index,Filaments_1851_index,Filaments_beggiatoa_index,Filaments_zoogloea_index
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-02,67.0,8.0,,1.0,4.0,,9.0,53.0,6.0,,...,,,,,,,,,,
2010-03-08,27.0,8.0,,1.0,9.0,,11.0,38.0,0.0,0.0,...,0.0,,,,,,,,,
2010-04-22,5.0,9.0,,0.0,7.0,,2.0,32.0,0.0,0.0,...,0.0,,,,,,,,,
2010-04-29,9.0,16.0,,0.0,25.0,,2.0,23.0,0.0,1.0,...,0.0,,,,,,,,,
2010-06-01,1.0,8.0,,0.0,8.0,,10.0,10.0,0.0,0.0,...,11.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-20,9.0,3.0,8.0,0.0,8.0,0.0,9.0,53.0,0.0,0.0,...,1210.0,0.0,,,,4.0,,,3.0,1.0
2020-07-28,7.0,29.0,19.0,1.0,10.0,2.0,27.0,70.0,0.0,3.0,...,1183.0,1.0,1.0,4.0,0.0,5.0,0.0,3.0,1.0,1.0
2020-08-03,5.0,5.0,0.0,0.0,13.0,1.0,15.0,32.0,0.0,0.0,...,2200.0,0.0,,,,4.0,,,3.0,1.0
2020-08-17,5.0,5.0,12.0,2.0,10.0,1.0,70.0,23.0,0.0,0.0,...,1911.0,0.0,1.0,5.0,3.0,4.0,0.0,2.0,0.0,2.0
