# Tutorial #2 - Preprocessing options

## Load the PyEEM library and display version

In [None]:
# Import PyEEM and print the version string of the installed package.
import pyeem
print(pyeem.__version__)

## Check out the supported instruments

In [None]:
# Bare expression: as the last statement of the cell, its value is rendered
# as the cell output.
pyeem.instruments.supported

## Check out the demo datasets

In [None]:
# Table of the demo datasets that ship with PyEEM.
demos_df = pyeem.datasets.demos
display(demos_df)

print("Dataset description for the drEEM demo:")
# Select the "description" cell of the row whose demo_name is "drEEM".
# .item() only succeeds when exactly one row matches.
is_dreem = demos_df["demo_name"] == "drEEM"
print(demos_df.loc[is_dreem, "description"].item())

## Download the drEEM demo dataset from S3
- Please note that this step requires an internet connection because the data is downloaded from an AWS S3 bucket.

In [None]:
# Download the drEEM demo (needs an internet connection). The first argument
# is presumably the local directory to download into -- confirm in the docs.
demo_data_dir = pyeem.datasets.download_demo("demo_data", demo_name="drEEM")

## Load the dataset

In [None]:
# NOTE(review): this hard-coded path replaces whatever download_demo()
# returned in the previous cell -- confirm both refer to the same directory.
demo_data_dir = "demo_data/drEEM"

# Data location, the instrument identifier for each scan type, and the mode.
dataset_options = {
    "data_dir": demo_data_dir,
    "raman_instrument": "fluorolog",
    "absorbance_instrument": "cary_4e",
    "eem_instrument": "fluorolog",
    "mode": "w",
}
dataset = pyeem.datasets.Dataset(**dataset_options)

## Let's check out the metadata
- The metadata contains information about collected sample sets which are composed of a few different scan types.

In [None]:
# Render the dataset's metadata table (dataset.meta_df) in the cell output.
display(dataset.meta_df)

## Check out the metadata summary information

In [None]:
# Last expression of the cell: whatever this call returns is shown as the
# cell output.
dataset.metadata_summary_info()

In [None]:
from IPython.display import HTML

# Build the water Raman peak animation at excitation wavelength 275.
fig_kws = dict(dpi=200)
anim = pyeem.plots.water_raman_peak_animation(
    dataset,
    excitation_wavelength=275,
    fig_kws=fig_kws,
)
# Last expression of the cell: embed the animation as an HTML5 video.
HTML(anim.to_html5_video())

In [None]:
import matplotlib.pyplot as plt

# Figure options, plot options, and extra keyword arguments passed straight
# through to the plotting call.
fig_kws = dict(dpi=95)
plot_kws = dict(fmt="o-")
kwargs = dict(byweekday=0)

# Water Raman time series at excitation wavelength 275.
ax = pyeem.plots.water_raman_timeseries(
    dataset,
    excitation_wavelength=275,
    fig_kws=fig_kws,
    plot_kws=plot_kws,
    **kwargs,
)
plt.show()

## Create a preprocessing routine
- The demo dataset contains raw scans. In order to analyze and interpret this data, we must first apply several preprocessing steps.

In [None]:
# One flag per preprocessing step: True enables the step, False skips it.
routine_steps = {
    "crop": False,
    "discrete_wavelengths": False,
    "gaussian_smoothing": False,
    "blank_subtraction": True,
    "inner_filter_effect": True,
    "raman_normalization": True,
    "scatter_removal": True,
    "dilution": False,
}
routine_df = pyeem.preprocessing.create_routine(**routine_steps)

display(routine_df)

## Execute the preprocessing routine
- Each preprocessing step has certain knobs and dials you can tune to have them run to your liking. It is worth checking the documentation to learn more about these customizations.
- Please note that depending on the steps and settings you've chosen as well as your dataset's size, the time it takes for this step to complete will vary.

In [None]:
# Step-specific settings forwarded to perform_routine as keyword arguments.
kwargs = dict(
    raman_source_type="water_raman",
    water_raman_wavelength=275,
    excision_width=30,
    fill="interp",
)

# Run every step listed in routine_df against the dataset.
routine_results_df = pyeem.preprocessing.perform_routine(
    dataset, routine_df, progress_bar=True, **kwargs
)

display(routine_results_df)

## Check to see if any of the steps failed to complete
- If you are using a demo dataset, you should see an empty dataframe.

In [None]:
# Keep only the rows whose step_exception column is non-null.
has_exception = routine_results_df["step_exception"].notna()
display(routine_results_df[has_exception])

## Visualize the preprocessing steps for a single sample

In [None]:
import matplotlib.pyplot as plt

# The sample to inspect; later cells reuse these two names.
sample_set, sample_name = 16, "sample_eem1"

# Contour plot of the preprocessing routine results for that sample.
axes = pyeem.plots.preprocessing_routine_plot(
    dataset,
    routine_results_df,
    sample_set=sample_set,
    sample_name=sample_name,
    plot_type="contour",
    fig_kws=dict(dpi=200),
)
plt.show()

In [None]:
# Reduced routine: only blank subtraction and scatter removal are enabled.
routine_df = pyeem.preprocessing.create_routine(
    crop=False,
    discrete_wavelengths=False,
    gaussian_smoothing=False,
    blank_subtraction=True,
    inner_filter_effect=False,
    raman_normalization=False,
    scatter_removal=True,
    dilution=False,
)

display(routine_df)

In [None]:
# Re-run the routine with fill=None and an excision width of 25.
step_options = {"fill": None, "excision_width": 25}
routine_results_df = pyeem.preprocessing.perform_routine(
    dataset, routine_df, **step_options, progress_bar=True
)

# Contour plot for the sample chosen earlier (sample_set / sample_name).
plot_options = {
    "sample_set": sample_set,
    "sample_name": sample_name,
    "plot_type": "contour",
    "fig_kws": {"dpi": 200},
}
axes = pyeem.plots.preprocessing_routine_plot(
    dataset, routine_results_df, **plot_options
)
plt.show()

In [None]:
# Re-run the routine with fill=None and truncate="both".
# NOTE(review): the routine created above disables raman_normalization, so
# raman_source_type is presumably unused here -- confirm.
step_options = {
    "raman_source_type": "water_raman",
    "fill": None,
    "truncate": "both",
}
routine_results_df = pyeem.preprocessing.perform_routine(
    dataset, routine_df, **step_options, progress_bar=True
)

# Contour plot for the sample chosen earlier (sample_set / sample_name).
axes = pyeem.plots.preprocessing_routine_plot(
    dataset,
    routine_results_df,
    sample_set=sample_set,
    sample_name=sample_name,
    plot_type="contour",
    fig_kws=dict(dpi=200),
)
plt.show()

In [None]:
# Re-run the routine with band="rayleigh", order="first", fill=None and an
# excision width of 20.
# NOTE(review): the routine created above disables raman_normalization, so
# raman_source_type is presumably unused here -- confirm.
step_options = {
    "raman_source_type": "water_raman",
    "fill": None,
    "band": "rayleigh",
    "order": "first",
    "excision_width": 20,
}
routine_results_df = pyeem.preprocessing.perform_routine(
    dataset, routine_df, **step_options, progress_bar=True
)

# Contour plot for the sample chosen earlier (sample_set / sample_name).
axes = pyeem.plots.preprocessing_routine_plot(
    dataset,
    routine_results_df,
    sample_set=sample_set,
    sample_name=sample_name,
    plot_type="contour",
    fig_kws=dict(dpi=200),
)
plt.show()

In [None]:
# Re-run the routine with band="both", fill="interp" and an excision width
# of 25.
# NOTE(review): the routine created above disables raman_normalization, so
# raman_source_type is presumably unused here -- confirm.
step_options = {
    "raman_source_type": "water_raman",
    "fill": "interp",
    "band": "both",
    "excision_width": 25,
}
routine_results_df = pyeem.preprocessing.perform_routine(
    dataset, routine_df, **step_options, progress_bar=True
)

# Same sample as before, drawn with plot_type="imshow" this time.
axes = pyeem.plots.preprocessing_routine_plot(
    dataset,
    routine_results_df,
    sample_set=sample_set,
    sample_name=sample_name,
    plot_type="imshow",
)
plt.show()

In [None]:
# Re-run the routine with band="rayleigh", order="first", truncate="below",
# fill=None and an excision width of 25.
# NOTE(review): the routine created above disables raman_normalization, so
# raman_source_type is presumably unused here -- confirm.
step_options = {
    "raman_source_type": "water_raman",
    "fill": None,
    "band": "rayleigh",
    "order": "first",
    "truncate": "below",
    "excision_width": 25,
}
routine_results_df = pyeem.preprocessing.perform_routine(
    dataset, routine_df, **step_options, progress_bar=True
)

# Same sample as before, drawn with plot_type="imshow".
axes = pyeem.plots.preprocessing_routine_plot(
    dataset,
    routine_results_df,
    sample_set=sample_set,
    sample_name=sample_name,
    plot_type="imshow",
)
plt.show()