In [38]:
import os
import pandas as pd
import numpy as np


In [39]:
class _DirectoryNotFoundError(FileNotFoundError, FileExistsError):
    """Raised by ``load_data`` when the requested directory does not exist.

    Inherits from ``FileNotFoundError`` (the accurate type) and from
    ``FileExistsError`` (the type this function raised historically) so that
    ``except`` clauses written against either one keep working.
    """


def load_data(dir):
    """Load every ``.csv`` file in a directory into a dict of float DataFrames.

    Parameters
    ----------
    dir : str or path-like
        Directory to scan (non-recursive). The parameter name shadows the
        ``dir`` builtin but is kept so keyword callers are not broken.

    Returns
    -------
    dict
        Maps each CSV file name (not the full path) to its DataFrame. Column
        names have leading/trailing ``%`` characters and then surrounding
        whitespace removed, and every column is cast to float.

    Raises
    ------
    FileNotFoundError
        If ``dir`` does not exist. The raised object is also an instance of
        ``FileExistsError`` for backward compatibility.
    ValueError
        Propagated from ``astype("float")`` when a column holds values that
        cannot be converted to float.
    """
    if not os.path.exists(dir):
        raise _DirectoryNotFoundError(f"Directory not found: {dir}")

    data_dict = {}
    # Sorted so the dict's insertion order does not depend on the filesystem.
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith('.csv'):
            continue
        filepath = os.path.join(dir, filename)
        df = pd.read_csv(filepath)
        cleaned_column = {col: col.strip('%').strip() for col in df.columns}
        # rename/astype both return new frames, so no in-place mutation is needed.
        data_dict[filename] = df.rename(columns=cleaned_column).astype("float")
    return data_dict

In [None]:
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def create_block_missingness(df, n_blocks=2, block_size_fraction=0.05, seed=None):
    """Blank out consecutive blocks of rows to simulate outages.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied, never modified.
    n_blocks : int
        Number of blocks to draw. Blocks are drawn independently and may
        overlap, so fewer than ``n_blocks * block_size`` rows can end up NaN.
    block_size_fraction : float
        Block length as a fraction of the row count. The length is clamped to
        the range ``[1, len(df)]``.
    seed : int or None
        When given, seeds NumPy's global random state before drawing.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every column is NaN on the selected rows.
    """
    df_with_missing = df.copy()
    if seed is not None:
        np.random.seed(seed)

    n_rows = len(df)
    if n_rows == 0:
        # Nothing to blank out; avoids an invalid randint range below.
        return df_with_missing

    # At least one row, and never more rows than the frame holds.
    block_size = min(max(int(n_rows * block_size_fraction), 1), n_rows)
    last_start = n_rows - block_size

    for _ in range(n_blocks):
        # randint's upper bound is exclusive, so +1 lets a block reach the last row.
        start_index = np.random.randint(0, last_start + 1)
        # Positional slicing removes exactly block_size rows and stays correct
        # for duplicate or unsorted index labels.
        df_with_missing.iloc[start_index:start_index + block_size] = np.nan

    return df_with_missing

def create_missing_value(df, missing_fraction=0.1, seed=None):
    """Return a copy of ``df`` with randomly scattered cells set to NaN.

    Each cell is dropped independently when a uniform draw in [0, 1) falls
    below ``missing_fraction``. Passing ``seed`` seeds NumPy's global random
    state first. The input frame is left untouched.
    """
    if seed is not None:
        np.random.seed(seed)

    # One uniform draw per cell; True marks a cell to blank out.
    drop_cells = np.random.rand(*df.shape) < missing_fraction
    return df.copy().mask(drop_cells)

def longest_notna(df):
    """Return the longest run of consecutive rows that contain no NaN.

    A row counts as complete when every column is non-null. If several runs
    share the maximum length, the first one is returned. If no row is
    complete, an empty frame with the same columns is returned.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows, with the original index labels and
        columns (including a column named ``group``, if present).
    """
    complete = df.notna().all(axis=1).to_numpy()
    if not complete.any():
        return df.iloc[0:0].copy()

    # Positions where the complete/incomplete flag flips mark run boundaries.
    boundaries = np.flatnonzero(complete[1:] != complete[:-1]) + 1
    starts = np.concatenate(([0], boundaries))
    ends = np.concatenate((boundaries, [len(complete)]))

    # Keep only the runs made of complete rows.
    is_complete_run = complete[starts]
    starts, ends = starts[is_complete_run], ends[is_complete_run]

    # argmax returns the first maximum, which makes tie-breaking deterministic.
    best = int(np.argmax(ends - starts))
    return df.iloc[starts[best]:ends[best]].copy()

def knn_mean(ts, n=4):
    """Fill each NaN with the mean of its non-NaN positional neighbours.

    For every missing position, up to ``n`` entries on each side are read
    from the original data (never from already-filled values). A position
    whose whole window is missing stays NaN. The result is a new NumPy array
    with the same shape as ``np.array(ts)``.
    """
    values = np.array(ts)
    filled = np.copy(values)
    size = len(values)

    for pos in range(size):
        if not np.isnan(values[pos]):
            continue
        # Window on both sides of pos, clipped to the array bounds.
        left = values[max(0, pos - n):pos].tolist()
        right = values[pos + 1:min(size, pos + n + 1)].tolist()
        observed = [v for v in left + right if not np.isnan(v)]
        filled[pos] = np.mean(observed) if observed else np.nan

    return filled

def seasonal_mean(series, season_lag, lr=0.7):
    """Fill NaNs with a scaled mean of values at the same seasonal phase.

    For a missing position ``i`` the values at ``i - season_lag``,
    ``i - 2 * season_lag``, ... are averaged (NaNs skipped). If that mean is
    undefined (no earlier season exists or all are NaN), the values at
    ``i + season_lag``, ``i + 2 * season_lag``, ... are averaged instead.
    Values are always read from the original ``series``, never from values
    filled earlier in the same call.

    Parameters
    ----------
    series : pandas.Series
    season_lag : int
        Distance in positions between two observations of the same phase.
    lr : float
        Multiplier applied to the seasonal mean before it is written.

    Returns
    -------
    pandas.Series
        Copy of ``series`` with NaNs replaced where a seasonal mean exists;
        positions without any usable seasonal value stay NaN.
    """
    out = series.copy()
    for i in range(len(series)):
        if not pd.isna(series.iloc[i]):
            continue

        first_prev = i - season_lag
        if first_prev >= 0:
            prev_seasons = series.iloc[first_prev::-season_lag]
        else:
            # A negative start would make iloc wrap to the END of the series
            # and pick values from a different phase; there is no earlier
            # season for this position.
            prev_seasons = series.iloc[0:0]

        seasonal_value = prev_seasons.mean()
        if pd.isna(seasonal_value):
            # Earlier seasons contribute nothing (empty or all NaN), so the
            # combined mean equals the mean of the later seasons alone.
            seasonal_value = series.iloc[i + season_lag::season_lag].mean()

        out.iloc[i] = seasonal_value * lr
    return out

def handle_missing(df, percent, seed=None):
    """Benchmark imputation methods per feature and report the best by R².

    Columns whose missing fraction is not below ``percent`` are dropped. For
    each remaining column, the longest fully observed run is taken as ground
    truth, artificial gaps are punched into it (scattered cells plus
    consecutive blocks), every imputation method is applied, and the result
    is scored with ``r2_score`` against the ground truth.

    Notes
    -----
    * R² is computed over the whole ground-truth run, not only over the
      positions that were artificially removed.
    * A method that leaves any NaN behind gets a score of NaN and is ignored
      when picking the winner; a feature for which every method scores NaN
      does not appear in the result.
    * The key ``"quadaric_interpolation"`` runs ``method='cubicspline'``; the
      label is kept as-is so printed results stay comparable.

    Parameters
    ----------
    df : pandas.DataFrame
    percent : float
        Missing-fraction threshold in [0, 1]; columns below it are kept.
    seed : int or None
        Forwarded to ``create_missing_value`` and ``create_block_missingness``
        so a run can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``Feature``, ``Method``, ``Result``
        holding the highest-scoring method. The same table is printed.
    """
    # Remove columns whose missing fraction reaches the threshold.
    missing_percent = df.isnull().mean()
    df_filtered = df.loc[:, missing_percent < percent]
    print(f"Filtered all features having missing value > {percent * 100:g}%")
    print(f"Number of features removed: {len(df.columns)-len(df_filtered.columns)}")

    # Collected as plain records and turned into a frame once, instead of
    # growing a DataFrame row by row.
    records = []

    for col in df_filtered.columns:
        feature_data = df_filtered[[col]]
        ground_truth = longest_notna(feature_data)
        missing_df = create_missing_value(ground_truth, seed=seed)
        missing_df = create_block_missingness(missing_df, seed=seed)

        # Each of these feeds two entries below, so compute them once.
        # Forward-only linear interpolation leaves leading NaNs in place.
        linear = missing_df.interpolate(method='linear', axis=0, limit_direction='forward')
        spline = missing_df.interpolate(method='spline', order=3)
        seasonal = seasonal_mean(missing_df[col], season_lag=12, lr=1.25)

        # Insertion order matters: on equal scores idxmax keeps the first method.
        imputation_dict = {
            "backward/forward_fill": missing_df.bfill().ffill(),
            "linear_interpolation": linear,
            "spline_interpolation": spline,
            "quadaric_interpolation": missing_df.interpolate(method='cubicspline'),
            "knn_mean": knn_mean(missing_df, 8),
            "seasonal_mean": seasonal,
            "linear_ffill": linear.ffill(),
            "linear_bfill": linear.bfill(),
            "spline_bfill_ffill": spline.bfill().ffill(),
            "seasonal_linear_ffill": seasonal.interpolate(method='linear').ffill(),
        }

        for key, pred_value in imputation_dict.items():
            if pd.isna(np.array(pred_value)).any():
                # The method could not fill every gap, so it cannot be scored.
                r2 = np.nan
            else:
                r2 = r2_score(ground_truth, pred_value)
            records.append((col, key, r2))

    result_df = pd.DataFrame(records, columns=["Feature", "Method", "Result"])

    # idxmax is undefined for an all-NaN group, so rank only the scored rows.
    scored = result_df.dropna(subset=["Result"])
    if scored.empty:
        best_methods_df = result_df.iloc[0:0]
    else:
        best_methods_idx = scored.groupby('Feature')['Result'].idxmax()
        best_methods_df = result_df.loc[best_methods_idx]

    print("Highest R² method per feature:")
    print(best_methods_df[['Feature', 'Method', 'Result']])
    return best_methods_df


In [None]:
class TSImputer:
    """Holds a selection strategy and the best imputation method per column.

    Parameters
    ----------
    strategy : {'r2', 'mae', 'mse'}
        Name of the strategy; anything else raises ``ValueError``.
    verbose : bool, default False
        Stored on the instance as ``self.verbose``.
    """

    def __init__(self, strategy = 'r2', verbose = False):
        if strategy not in ('r2', 'mae', 'mse'):
            raise ValueError(
                f"Strategy must be one of 'r2', 'mae', 'mse'; got {strategy!r}"
            )
        self.strategy = strategy
        self.verbose = verbose
        self.best_methods_ = {} # Stores the best method for each column
        # NOTE(review): _define_pipelines is not defined in the part of the
        # class visible in this cell; it must be provided elsewhere (rest of
        # the class body or a subclass) or this call raises AttributeError.
        self._define_pipelines()

In [61]:
# Configuration: data root (relative to the notebook) and the team sub-folder to load.
DATA_PATH = '../'
GREENHOUSE_TEAM = "Digilog"

# load_data returns a dict mapping each CSV file name to its DataFrame,
# so team_df is a dict of frames rather than a single DataFrame.
team_folder_path = os.path.join(DATA_PATH,GREENHOUSE_TEAM)
team_df = load_data(team_folder_path)

print(f"Successfully loaded data from {GREENHOUSE_TEAM}")

# Benchmark the imputation methods on the climate table; 0.2 is the
# missing-fraction threshold passed as `percent`.
handle_missing(team_df["GreenhouseClimate.csv"], 0.2)

  df = pd.read_csv(filepath)


Successfully loaded data from Digilog
Filtered all features having missing value > 20%
Number of features removed: 4
🔝 Highest R² method per feature:
                         Feature                  Method    Result
17                    AssimLight            linear_bfill  0.807793
21                      BlackScr    linear_interpolation  0.867427
30                        CO2air   backward/forward_fill  0.915223
42                       Cum_irr    spline_interpolation  0.863713
51                   EC_drain_PC    linear_interpolation  0.999405
63                         EnScr  quadaric_interpolation  0.999769
71                        HumDef    linear_interpolation  0.934009
81                      PipeGrow    linear_interpolation  0.990942
91                       PipeLow    linear_interpolation  0.816963
100                        Rhair   backward/forward_fill  0.979228
111                         Tair    linear_interpolation  0.945754
121                      Tot_PAR    linear_int

In [None]:
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
    """Run ``adfuller`` with AIC lag selection and print a labelled summary.

    The first four entries of the result are printed under fixed labels,
    followed by one 'Critical Value (...)' row per entry of the fifth element.
    Nothing is returned.
    """
    print('Result of Augmented Dickey Fuller: ')
    adf_result = adfuller(timeseries, autolag="AIC")

    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations']
    summary = pd.Series(adf_result[0:4], index=labels)

    # The fifth element maps a significance level to its critical value.
    for level, threshold in adf_result[4].items():
        summary['Critical Value (%s)' % level] = threshold

    print(summary)

In [40]:
# Stationarity check on the "int_red_sp" column, cast to float with NaNs dropped.
# NOTE(review): df_climate is not assigned in any cell visible here, so this
# cell relies on kernel state from elsewhere — presumably the
# "GreenhouseClimate.csv" frame; confirm and define it explicitly.
adf_test(df_climate["int_red_sp"].astype("float").dropna())

Result of Augmented Dickey Fuller: 
Test Statistic           -1.313535e+01
p-value                   1.466031e-24
#Lags Used                5.300000e+01
Number of Observations    4.680100e+04
Critical Value (1%)      -3.430490e+00
Critical Value (5%)      -2.861602e+00
Critical Value (10%)     -2.566803e+00
dtype: float64
