# Data Preprocessing for Plant Data
This notebook demonstrates the preprocessing steps for the plant data including:
- Loading data
- Outlier detection and handling
- Missing value interpolation
- Smoothing and transformations
- Growth calculation


In [1]:
# Import standard libraries
import os
import sys
import logging

# Add the project root (the parent of the current working directory) to
# sys.path so the local `src` package can be imported. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate entries.
project_root = os.path.abspath(os.path.join('..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import third-party libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
from datetime import datetime

# Import local modules
from src.data.data_preprocessing import (
    initial_data_preparation,
    clean_duplicated_timestamps,
    process_weight_data,
    outliers_by_thresholds,
    outliers_with_moving_avg,
    interpolate_missing_values,
    moving_average_with_kernel_pandas,
    calculate_growth,
)

from src.analysis.exploratory_analysis import (
    plot_basic_plant_measurements,
    plot_col,
)

# Configure logging
# Notebook-level logger; no handler, level or format is set in this cell.
# NOTE(review): the formatted INFO records in the cell outputs suggest logging
# is configured elsewhere (presumably by the imported src modules) - confirm.
logger = logging.getLogger(__name__)
logger.info("Environment setup completed.")

2025-02-05 21:37:49,933 - INFO - Environment setup completed.


## Load the Dataset
We begin by loading the dataset from a Parquet file.

In [2]:
# Specify the file path
# The raw dataset is read from <project_root>/data/raw/full_data.parquet.
raw_dataset_path = os.path.join(project_root, 'data', 'raw', 'full_data.parquet')

# Preprocess the data
# Per the log output of this cell, this step renames 'index' to 'timestamp'
# and builds 'unique_id' by combining 'plant_ID', 'exp_ID' and 'gh_ID'.
data = initial_data_preparation(raw_dataset_path)
data.head()

2025-02-05 21:37:49,955 - INFO - Starting preprocessing for file: g:\My Drive\Shani Friedman\HUJI\ML\lab_git\sci-phy\data\raw\full_data.parquet
2025-02-05 21:37:51,420 - INFO - Renaming 'index' to 'timestamp'
2025-02-05 21:37:53,825 - INFO - Creating 'unique_id' by combining 'plant_ID', 'exp_ID', and 'gh_ID'
2025-02-05 21:37:59,583 - INFO - Preprocessing completed. Number of unique IDs: 496


Unnamed: 0_level_0,s4,wsrh,wstemp,wspar,vpd,gh_ID,exp_ID,plant_ID,plant_type,soil_sand,Weight_change,condition,dt,pnw,unique_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-07-06 00:00:00,6603.29,85.5,27.0,0.0,0.51674,3,13,1002,Tomato,sand,,W,102.8,42.84257,1002_13_3
2018-07-06 00:03:00,6653.07,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,49.78,W,,,1002_13_3
2018-07-06 00:06:00,6632.14,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,-20.93,W,,,1002_13_3
2018-07-06 00:09:00,6625.71,85.6,26.0,0.0,0.48379,3,13,1002,Tomato,sand,-6.43,W,,,1002_13_3
2018-07-06 00:12:00,6621.66,85.7,26.0,0.0,0.48044,3,13,1002,Tomato,sand,-4.05,W,,,1002_13_3


In [3]:
# Remove duplicated timestamps; the helper logs each unique_id it had to clean.
data = clean_duplicated_timestamps(data)

2025-02-05 21:38:01,153 - INFO - UID 21101_346_3 has duplicated timestamps. Removing duplicates...
2025-02-05 21:38:02,755 - INFO - UID 21135_346_3 has duplicated timestamps. Removing duplicates...
2025-02-05 21:38:05,477 - INFO - Duplicate cleaning process completed successfully.


In [4]:
# Snapshot of the frame before outlier handling, kept for plotting.
original_data = data.copy() # for plotting

## Outlier Handling
Next, we identify and replace outliers based on predefined thresholds for specific columns.

In [5]:
# Outliers in weather data

# Define thresholds for outlier detection
# Each column maps to a list of bounds (presumably [lower, upper] - confirm
# against outliers_by_thresholds). An empty list is still passed to the helper;
# per the log output only 'wstemp' and 'vpd' have values set to NaN here.
threshold_dict = {
    'wsrh': [],
    'wstemp': [1, 47],
    'wspar': [],
    'vpd': [0, 8],
    'dt' : [],
    'pnw' : [] # will be handled separately
}

# Apply outlier handling
# The frame is re-bound on every iteration, one column at a time.
for col, thresholds in threshold_dict.items():
    data = outliers_by_thresholds(data, col, thresholds)

data.head()

2025-02-05 21:38:06,786 - INFO - Processing outliers for column 'wsrh' with thresholds: []
2025-02-05 21:38:06,788 - INFO - Processing outliers for column 'wstemp' with thresholds: [1, 47]
2025-02-05 21:38:06,818 - INFO - Outliers set to NaN in column: wstemp
2025-02-05 21:38:06,819 - INFO - Processing outliers for column 'wspar' with thresholds: []
2025-02-05 21:38:06,821 - INFO - Processing outliers for column 'vpd' with thresholds: [0, 8]
2025-02-05 21:38:06,855 - INFO - Outliers set to NaN in column: vpd
2025-02-05 21:38:06,858 - INFO - Processing outliers for column 'dt' with thresholds: []
2025-02-05 21:38:06,861 - INFO - Processing outliers for column 'pnw' with thresholds: []


Unnamed: 0_level_0,s4,wsrh,wstemp,wspar,vpd,gh_ID,exp_ID,plant_ID,plant_type,soil_sand,Weight_change,condition,dt,pnw,unique_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-07-06 00:00:00,6603.29,85.5,27.0,0.0,0.51674,3,13,1002,Tomato,sand,,W,102.8,42.84257,1002_13_3
2018-07-06 00:03:00,6653.07,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,49.78,W,,,1002_13_3
2018-07-06 00:06:00,6632.14,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,-20.93,W,,,1002_13_3
2018-07-06 00:09:00,6625.71,85.6,26.0,0.0,0.48379,3,13,1002,Tomato,sand,-6.43,W,,,1002_13_3
2018-07-06 00:12:00,6621.66,85.7,26.0,0.0,0.48044,3,13,1002,Tomato,sand,-4.05,W,,,1002_13_3


In [6]:
# Outliers in weight (s4) column
# Step 1: fixed bounds - per the log, values outside [300, 10000] are set to NaN.
data = outliers_by_thresholds(data, 's4', thresholds = [300, 10000])
# Step 2: moving-average based detection (window of 45 rows, threshold 2.5).
# The frame shown in the next cell gains 's4_outlier' and 's4_clean' columns.
data = outliers_with_moving_avg(data, value_column='s4', window_size=45,  threshold=2.5)

2025-02-05 21:38:06,944 - INFO - Processing outliers for column 's4' with thresholds: [300, 10000]
2025-02-05 21:38:06,979 - INFO - Outliers set to NaN in column: s4
2025-02-05 21:38:06,981 - INFO - Finding outliers for column: s4 using moving average with window size 45 and threshold 2.5
2025-02-05 21:38:07,657 - INFO - Outliers found in the following plant IDs: ['103_4_33' '1030_13_3' '108_4_33' '1111_33_33' '1119_33_33' '1121_33_33'
 '1145_33_33' '119_4_33' '11918_189_3' '11926_189_3' '11927_189_3'
 '11933_189_3' '11949_189_3' '11963_189_3' '11974_189_3' '11994_189_3'
 '12005_189_3' '12024_189_3' '12040_189_3' '12055_189_3' '12059_189_3'
 '12083_189_3' '12114_189_3' '12125_189_3' '12135_189_3' '12141_189_3'
 '12157_189_3' '12158_189_3' '12362_199_3' '12394_199_3' '12422_199_3'
 '12450_199_3' '12451_199_3' '12467_199_3' '12498_199_3' '12502_199_3'
 '12526_199_3' '12537_199_3' '12545_199_3' '12562_199_3' '12594_202_3'
 '12603_202_3' '12614_202_3' '12616_202_3' '12619_202_3' '12624_202

In [7]:
# Display the frame; the output now includes the 's4_outlier' and 's4_clean' columns.
data

Unnamed: 0_level_0,s4,wsrh,wstemp,wspar,vpd,gh_ID,exp_ID,plant_ID,plant_type,soil_sand,Weight_change,condition,dt,pnw,unique_id,s4_outlier,s4_clean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-07-06 00:00:00,6603.29,85.5,27.0,0.0,0.51674,3,13,1002,Tomato,sand,,W,102.8,42.84257,1002_13_3,False,6603.29
2018-07-06 00:03:00,6653.07,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,49.78,W,,,1002_13_3,False,6653.07
2018-07-06 00:06:00,6632.14,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,-20.93,W,,,1002_13_3,False,6632.14
2018-07-06 00:09:00,6625.71,85.6,26.0,0.0,0.48379,3,13,1002,Tomato,sand,-6.43,W,,,1002_13_3,False,6625.71
2018-07-06 00:12:00,6621.66,85.7,26.0,0.0,0.48044,3,13,1002,Tomato,sand,-4.05,W,,,1002_13_3,False,6621.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-12 23:45:00,6787.65,87.3,13.0,0.0,0.19007,3,346,21135,Tomato,sand,-0.05,W,,,21135_346_3,False,6787.65
2022-12-12 23:48:00,6788.18,87.3,13.0,0.0,0.19007,3,346,21135,Tomato,sand,0.53,W,,,21135_346_3,False,6788.18
2022-12-12 23:51:00,6788.54,87.3,13.0,0.0,0.19007,3,346,21135,Tomato,sand,0.36,W,,,21135_346_3,False,6788.54
2022-12-12 23:54:00,6787.77,87.3,13.0,0.0,0.19007,3,346,21135,Tomato,sand,-0.77,W,,,21135_346_3,False,6787.77


In [8]:
# Plot the 's4' column for the single unique_id '8922_154_3'.
plot_col('8922_154_3', data , 's4')

Plot generated for plant ID: 8922_154_3


In [17]:
# Plot the basic measurements for the single unique_id '23838_398_3'.
plot_basic_plant_measurements('23838_398_3', data)

info: Plot generated for plant ID: 23838_398_3


## Missing Value Interpolation
Interpolate missing values for specific columns while respecting thresholds for consecutive gaps.

In [10]:
# Define columns to interpolate
# NOTE(review): 's4' is requested here, but the log reports remaining NaNs for
# 's4_clean' - presumably the helper works on the cleaned column; confirm.
columns_to_interpolate = ['wsrh', 'wstemp', 'wspar', 'vpd','s4']

# Perform interpolation
# threshold=43 is presumably the longest run of consecutive NaNs that is still
# filled (see the section text above) - confirm against the helper.
data = interpolate_missing_values(data, columns_to_interpolate, method='linear', threshold=43)
data.head()

2025-02-05 21:38:12,364 - INFO - Interpolating missing values for columns: ['wsrh', 'wstemp', 'wspar', 'vpd', 's4'] using method: linear
2025-02-05 21:38:21,110 - INFO - Unique ID '12362_199_3' has 212 NaN values remaining in 's4_clean' after interpolation.
2025-02-05 21:38:21,357 - INFO - Unique ID '12394_199_3' has 198 NaN values remaining in 's4_clean' after interpolation.
2025-02-05 21:38:21,624 - INFO - Unique ID '12422_199_3' has 198 NaN values remaining in 's4_clean' after interpolation.
2025-02-05 21:38:21,859 - INFO - Unique ID '12450_199_3' has 198 NaN values remaining in 's4_clean' after interpolation.
2025-02-05 21:38:22,111 - INFO - Unique ID '12451_199_3' has 212 NaN values remaining in 's4_clean' after interpolation.
2025-02-05 21:38:22,365 - INFO - Unique ID '12467_199_3' has 199 NaN values remaining in 's4_clean' after interpolation.
2025-02-05 21:38:22,607 - INFO - Unique ID '12498_199_3' has 214 NaN values remaining in 's4_clean' after interpolation.
2025-02-05 21:38

Unnamed: 0_level_0,s4,wsrh,wstemp,wspar,vpd,gh_ID,exp_ID,plant_ID,plant_type,soil_sand,Weight_change,condition,dt,pnw,unique_id,s4_outlier,s4_clean
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-07-06 00:00:00,6603.29,85.5,27.0,0.0,0.51674,3,13,1002,Tomato,sand,,W,102.8,42.84257,1002_13_3,False,6603.29
2018-07-06 00:03:00,6653.07,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,49.78,W,,,1002_13_3,False,6653.07
2018-07-06 00:06:00,6632.14,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,-20.93,W,,,1002_13_3,False,6632.14
2018-07-06 00:09:00,6625.71,85.6,26.0,0.0,0.48379,3,13,1002,Tomato,sand,-6.43,W,,,1002_13_3,False,6625.71
2018-07-06 00:12:00,6621.66,85.7,26.0,0.0,0.48044,3,13,1002,Tomato,sand,-4.05,W,,,1002_13_3,False,6621.66


## Smoothing and Transformation
Apply kernel smoothing and Savitzky-Golay filters to clean the data further.

In [11]:
# Apply Gaussian smoothing to selected columns
# Each call adds a '<column>_gaussian_smoothed' column (see the next cell's output).
columns_to_smooth = ['wstemp', 's4_clean']
for col in columns_to_smooth:
    data = moving_average_with_kernel_pandas(data, col, window_size=12, kernel_type='gaussian', std_dev=1.5)


2025-02-05 21:40:18,889 - INFO - Applying gaussian smoothing on column: wstemp with window size 12
2025-02-05 21:40:19,337 - INFO - Applying gaussian smoothing on column: s4_clean with window size 12


In [12]:
# Preview the new 'wstemp_gaussian_smoothed' and 's4_clean_gaussian_smoothed' columns.
data.head()

Unnamed: 0_level_0,s4,wsrh,wstemp,wspar,vpd,gh_ID,exp_ID,plant_ID,plant_type,soil_sand,Weight_change,condition,dt,pnw,unique_id,s4_outlier,s4_clean,wstemp_gaussian_smoothed,s4_clean_gaussian_smoothed
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-07-06 00:00:00,6603.29,85.5,27.0,0.0,0.51674,3,13,1002,Tomato,sand,,W,102.8,42.84257,1002_13_3,False,6603.29,26.503201,6624.080927
2018-07-06 00:03:00,6653.07,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,49.78,W,,,1002_13_3,False,6653.07,26.334753,6628.623151
2018-07-06 00:06:00,6632.14,85.5,26.0,0.0,0.48715,3,13,1002,Tomato,sand,-20.93,W,,,1002_13_3,False,6632.14,26.176709,6630.625508
2018-07-06 00:09:00,6625.71,85.6,26.0,0.0,0.48379,3,13,1002,Tomato,sand,-6.43,W,,,1002_13_3,False,6625.71,26.067727,6629.116285
2018-07-06 00:12:00,6621.66,85.7,26.0,0.0,0.48044,3,13,1002,Tomato,sand,-4.05,W,,,1002_13_3,False,6621.66,26.01754,6625.496831


## Plant Weight and Growth
Calculate growth metrics based on slopes for each unique plant ID.

In [13]:
# Calculate growth
# Per the log, one slope is computed per unique_id; the result is exposed in a
# 'growth' column on the frame.
data = calculate_growth(data)
# One row per (unique_id, growth) pair
data[['unique_id', 'growth']].drop_duplicates().head()

2025-02-05 21:40:19,729 - INFO - Calculating growth for each unique_id
2025-02-05 21:40:21,562 - INFO - Calculated slope for unique_id 1002_13_3: 1.646977629281197e-05
2025-02-05 21:40:21,567 - INFO - Calculated slope for unique_id 1011_13_3: 2.2980277638431194e-05
2025-02-05 21:40:21,572 - INFO - Calculated slope for unique_id 1028_13_3: 8.703867977829091e-05
2025-02-05 21:40:21,578 - INFO - Calculated slope for unique_id 1030_13_3: 5.418868388652313e-05
2025-02-05 21:40:21,582 - INFO - Calculated slope for unique_id 103_4_33: 3.9795647455275944e-05
2025-02-05 21:40:21,585 - INFO - Calculated slope for unique_id 108_4_33: 9.393205031236519e-05
2025-02-05 21:40:21,592 - INFO - Calculated slope for unique_id 1111_33_33: 6.086386482992811e-06
2025-02-05 21:40:21,598 - INFO - Calculated slope for unique_id 1119_33_33: 2.6075389774585726e-05
2025-02-05 21:40:21,604 - INFO - Calculated slope for unique_id 1121_33_33: -0.0001671349025412207
2025-02-05 21:40:21,611 - INFO - Calculated slope f

Unnamed: 0_level_0,unique_id,growth
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-07-06,1002_13_3,1.6e-05
2018-07-06,1011_13_3,2.3e-05
2018-07-06,1028_13_3,8.7e-05
2021-04-08,103_4_33,4e-05
2018-07-06,1030_13_3,5.4e-05


## Smooth Cumulative Sum and Fix the Weight (pnw)

In [14]:
# Process the plant weight; the frame gains a 'plant_weight_process' column
# (visible in the output of the transpiration cell below).
data = process_weight_data(data)
# Fill gaps in 'pnw'. With rows every 3 minutes a day is 480 rows, so 481
# presumably allows gaps of up to one day - confirm against the helper.
data = interpolate_missing_values(data,['pnw'], method='linear', threshold=481)  # threshold=1 day

2025-02-05 21:40:33,224 - INFO - No plant weight data for unique_id: 7804_131_3
2025-02-05 21:40:37,331 - INFO - Weight data processing completed successfully.
2025-02-05 21:40:37,913 - INFO - Interpolating missing values for columns: ['pnw'] using method: linear
2025-02-05 21:40:47,679 - INFO - Unique ID '12362_199_3' has 959 NaN values remaining in 'pnw' after interpolation.
2025-02-05 21:40:47,961 - INFO - Unique ID '12394_199_3' has 959 NaN values remaining in 'pnw' after interpolation.
2025-02-05 21:40:48,199 - INFO - Unique ID '12422_199_3' has 959 NaN values remaining in 'pnw' after interpolation.
2025-02-05 21:40:48,427 - INFO - Unique ID '12450_199_3' has 959 NaN values remaining in 'pnw' after interpolation.
2025-02-05 21:40:48,680 - INFO - Unique ID '12451_199_3' has 959 NaN values remaining in 'pnw' after interpolation.
2025-02-05 21:40:48,930 - INFO - Unique ID '12467_199_3' has 959 NaN values remaining in 'pnw' after interpolation.
2025-02-05 21:40:49,163 - INFO - Unique 

## Derivative of the Weight (Transpiration Rate, g/min)

Calculate the transpiration rate (g/min) as the negative time derivative of the smoothed weight (s4) column.

In [15]:
# Apply Savitzky-Golay filter (first derivative of the smoothed weight).
# The filter is run separately for every unique_id: the frame holds all plants
# stacked one after another, so filtering the whole column at once would let
# the 5-point window mix the end of one plant with the start of the next.
weight_values = data['s4_clean_gaussian_smoothed'].to_numpy(dtype=float)
tr_values = np.full(len(data), np.nan)
for row_positions in data.groupby('unique_id').indices.values():
    # savgol_filter (default mode='interp') needs at least window_length points;
    # plants with fewer rows keep NaN.
    if len(row_positions) >= 5:
        tr_values[row_positions] = savgol_filter(
            weight_values[row_positions], window_length=5, polyorder=2, deriv=1
        )

# The derivative is per sample (one row every 3 minutes in the data shown), so
# divide by 3 to get g/min; the minus sign turns weight loss into a positive rate.
data['tr'] = tr_values / (-3)

# Filter the DataFrame to include only entries between 4 AM and 8 PM.
# inclusive="left" keeps 04:00 and drops 20:00, giving a round number of
# observations per day.
data = data.between_time('04:00', '20:00', inclusive="left")

data.head()

Unnamed: 0_level_0,s4,wsrh,wstemp,wspar,vpd,gh_ID,exp_ID,plant_ID,plant_type,soil_sand,...,dt,pnw,unique_id,s4_outlier,s4_clean,wstemp_gaussian_smoothed,s4_clean_gaussian_smoothed,growth,plant_weight_process,tr
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-06 04:00:00,6605.04,87.2,25.0,0.0,0.40524,3,13,1002,Tomato,sand,...,,42.994487,1002_13_3,False,6605.04,25.0,6604.793893,1.6e-05,,0.003617
2018-07-06 04:03:00,6604.66,87.2,25.0,0.0,0.40524,3,13,1002,Tomato,sand,...,,42.996386,1002_13_3,False,6604.66,25.0,6604.784108,1.6e-05,,-0.000586
2018-07-06 04:06:00,6604.52,87.2,25.0,0.0,0.40524,3,13,1002,Tomato,sand,...,,42.998285,1002_13_3,False,6604.52,25.0,6604.779307,1.6e-05,,-0.006561
2018-07-06 04:09:00,6605.09,87.3,25.0,0.0,0.40208,3,13,1002,Tomato,sand,...,,43.000184,1002_13_3,False,6605.09,25.0,6604.814701,1.6e-05,,-0.011974
2018-07-06 04:12:00,6604.75,87.2,25.0,0.0,0.40524,3,13,1002,Tomato,sand,...,,43.002083,1002_13_3,False,6604.75,25.0,6604.877013,1.6e-05,,-0.010666


## Summary
The preprocessing steps have been successfully applied to the dataset, making it ready for further analysis.

In [12]:
# Number of distinct unique_id values in the processed frame.
data['unique_id'].nunique()

496

In [13]:
# Build the path to data/processed
processed_dir = os.path.join(project_root, 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)  # Ensure the directory exists

# The file name must be an f-string: without the prefix the literal text
# '{timestamp}' ended up in the name and every run overwrote the same file.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
file_path = os.path.join(processed_dir, f'preprocessed_data_{timestamp}.parquet')

data.to_parquet(file_path)

In [None]:
# Apply the smoothing function and process the group
def smooth_data(data, window_length=9, polyorder=2):
    """Smooth a 1-D sequence with a Savitzky-Golay filter (``mode='nearest'``).

    Parameters
    ----------
    data : array-like
        One-dimensional sequence of values to smooth.
    window_length : int, default 9
        Requested filter window. When ``data`` has fewer points than this,
        the window is reduced to an odd number smaller than ``len(data)``.
    polyorder : int, default 2
        Order of the polynomial fitted inside each window.

    Returns
    -------
    numpy.ndarray or the input object
        The filtered values, or ``data`` itself (unchanged) when the input is
        too short for the window to be usable.
    """
    n_points = len(data)
    if n_points < window_length:
        # Largest odd window strictly smaller than the number of points
        window_length = n_points - (n_points % 2) - 1
        # A shrunken window that is not larger than polyorder would make
        # savgol_filter raise ValueError; treat the series as too short instead.
        if window_length <= polyorder:
            return data
    if window_length > 2:  # this function only filters with a window of 3 or more
        return savgol_filter(data, window_length, polyorder, mode='nearest')
    return data  # too short to smooth: hand the input back untouched

def process_group(group):
    """Derive a smoothed, non-decreasing ``plant_weight_process`` series for one plant.

    Works on a copy of ``group`` (the rows of a single ``unique_id``) and
    returns that copy with a ``plant_weight_process`` column built from
    ``pnw`` in these steps:

    1. values below 3 (and missing values) become NaN;
    2. if the first four values are all NaN, or the maximum exceeds 1500,
       the series is rebuilt from ``pnw`` shifted so that its first valid
       value equals 10 (all NaN when ``pnw`` has no valid value);
    3. a cumulative maximum enforces a non-decreasing trend;
    4. gaps are linearly interpolated in both directions;
    5. the result is smoothed with ``smooth_data``.
    """
    group = group.copy()

    # Step 1: discard readings under 3
    group['plant_weight_process'] = group['pnw'].apply(
        lambda value: value if value >= 3 else np.nan
    )

    # Step 2: decide whether the series has to be re-based
    candidate = group['plant_weight_process']
    needs_rebase = candidate.iloc[:4].isnull().all() or candidate.max() > 1500
    if needs_rebase:
        # pnw may start with NaN, so anchor on its first valid entry
        anchor = group['pnw'].first_valid_index()
        if anchor is None:
            # No valid pnw value at all for this plant
            group['plant_weight_process'] = np.nan
        else:
            # Shift the raw series so that it starts from 10 g
            group['plant_weight_process'] = group['pnw'] - group.loc[anchor, 'pnw'] + 10

    # Steps 3-5: monotone envelope, gap filling, smoothing
    non_decreasing = group['plant_weight_process'].cummax()
    gap_filled = non_decreasing.interpolate(method='linear', limit_direction='both')
    group['plant_weight_process'] = smooth_data(gap_filled)

    return group

# Main function to process data
def process_data(data):
    """Compute the processed plant weight and attach it to every row of ``data``.

    Rows stamped exactly at 00:00:00 are selected, the rows of each
    ``unique_id`` are passed through ``process_group``, and the resulting
    ``plant_weight_process`` values are left-merged back onto the full frame
    on (index name, ``unique_id``). Rows without a match get NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a named datetime index and the columns ``dt``, ``pnw``
        and ``unique_id``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``data`` plus a single
        ``plant_weight_process`` column. A column of that name already
        present in ``data`` is replaced rather than duplicated.
    """
    output_col = 'plant_weight_process'
    ts_col = data.index.name  # use the real index name for both sort and merge

    # Drop any earlier version of the output column; otherwise the merge would
    # produce '<name>_x' / '<name>_y' instead of one column. drop() returns a
    # new frame, so the caller's data stays untouched.
    base = data.drop(columns=[output_col], errors='ignore')

    # Extract relevant columns and the midnight rows only
    midnight = pd.to_datetime('00:00:00').time()
    pnw_df = base.loc[base.index.time == midnight, ['dt', 'pnw', 'unique_id']]

    # Sort by plant and then by time (not in place: pnw_df is a selection of base)
    pnw_df = pnw_df.sort_values(by=['unique_id', ts_col])

    if pnw_df.empty:
        # Nothing to process (e.g. the frame has no 00:00 rows): same shape of
        # result as a left merge with no matches.
        result = base.copy()
        result[output_col] = np.nan
        return result

    # Process each plant separately. Iterating the groups keeps 'unique_id'
    # inside every group regardless of how groupby.apply treats grouping
    # columns in the installed pandas version.
    processed = pd.concat(
        [process_group(group) for _, group in pnw_df.groupby('unique_id')]
    )
    processed = processed.reset_index()[[ts_col, 'unique_id', output_col]]

    # Merge the processed values back onto all rows, then restore the index
    merged = base.reset_index().merge(processed, on=[ts_col, 'unique_id'], how='left')
    return merged.set_index(ts_col)

# Example usage
# NOTE(review): earlier in this notebook `data` is restricted to 04:00-20:00
# with between_time, so at this point it has no 00:00 rows for process_data to
# select - presumably this should run on the frame before that filter; confirm.
processed_data = process_data(data)