In [3]:
import os
import pandas as pd
import numpy as np


In [4]:
def load_data(dir):
    """Load every ``.csv`` file found directly inside a directory.

    Column names are cleaned by stripping leading/trailing ``%`` characters
    and then surrounding whitespace, and every column is cast to ``float``.

    Args:
        dir: Path of the directory to scan (not recursive).

    Returns:
        dict mapping each CSV file name (not the full path) to its DataFrame.

    Raises:
        FileNotFoundError: If ``dir`` does not exist.
        Any error raised by ``pd.read_csv`` or ``DataFrame.astype("float")``
        is propagated unchanged.
    """
    if not os.path.exists(dir):
        # FileExistsError means "already exists"; a missing path is FileNotFoundError.
        raise FileNotFoundError(f"Directory not found: {dir}")

    data_dict = {}
    # Sorted so the resulting dict order does not depend on the filesystem.
    for filename in sorted(os.listdir(dir)):
        if not filename.endswith('.csv'):
            continue
        filepath = os.path.join(dir, filename)
        df = pd.read_csv(filepath)
        cleaned_column = {col: col.strip('%').strip() for col in df.columns}
        data_dict[filename] = df.rename(columns=cleaned_column).astype("float")
    return data_dict

In [10]:
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def create_block_missingness(series, n_blocks=2, block_size_fraction=0.05, seed=None):
    """Return a copy of ``series`` with contiguous blocks of values set to NaN.

    Args:
        series: Input data; it is copied, never modified.
        n_blocks: Number of blocks to blank out (blocks may overlap).
        block_size_fraction: Block length as a fraction of ``len(series)``;
            a computed length of 0 is bumped to 1.
        seed: If given, reseeds NumPy's global random generator first.

    Returns:
        The copy with the selected positional ranges replaced by NaN. When the
        input is not longer than one block, the copy is returned untouched.
    """
    if seed is not None:
        np.random.seed(seed)

    total = len(series)
    # A zero-length block would be a no-op, so fall back to a single row.
    span = int(total * block_size_fraction) or 1

    punched = series.copy()
    if not total > span:
        return punched

    for _ in range(n_blocks):
        begin = np.random.randint(0, total - span)
        punched.iloc[begin:begin + span] = np.nan
    return punched

def create_missing_value(series, missing_fraction=0.1, seed=None):
    """Return a copy of ``series`` with randomly chosen entries set to NaN.

    Args:
        series: Input data; it is copied, never modified.
        missing_fraction: Each entry is blanked when its uniform [0, 1) draw
            falls below this value.
        seed: If given, reseeds NumPy's global random generator first.

    Returns:
        The masked copy.
    """
    if seed is not None:
        np.random.seed(seed)
    # One uniform draw per element, shaped like the input.
    draws = np.random.rand(*series.shape)
    return series.copy().mask(draws < missing_fraction)

def longest_notna(series):
    """Return the longest consecutive run of non-missing values in ``series``.

    Consecutive entries sharing the same missing/non-missing status get the
    same run id; the non-missing run with the most entries is returned with
    its original index labels.

    Returns:
        The slice of ``series`` covering the winning run. If there are no
        non-missing values, a data-less Series built on the original index
        (same dtype) is returned instead.
    """
    present = series.notna()
    # The run id increases by one every time the missing/non-missing status flips.
    run_ids = present.ne(present.shift()).cumsum()
    valid_run_ids = run_ids[present]

    if valid_run_ids.empty:
        # No usable data: hand back a value-less series that keeps the index.
        return pd.Series(dtype=series.dtype, index=series.index)

    winner = valid_run_ids.value_counts().idxmax()
    return series[run_ids == winner]

def knn_mean(series, n=4):
    """Fill each NaN with the mean of its non-NaN neighbours within ``n`` positions.

    Neighbours are read from the original values, so a filled value never
    feeds into another fill. A NaN with no valid neighbour in its window
    stays NaN.

    Args:
        series: Input Series; values are converted to float.
        n: Number of positions to look at on each side.

    Returns:
        A new float Series carrying the original index.
    """
    values = np.array(series, dtype=float)  # float so NaN can be represented
    filled = values.copy()
    size = len(values)

    for pos in np.flatnonzero(np.isnan(values)):
        left = values[max(0, pos - n):pos]
        right = values[pos + 1:min(size, pos + n + 1)]
        window = np.concatenate((left, right))
        usable = window[~np.isnan(window)]
        if usable.size:
            filled[pos] = usable.mean()

    # Normalise: wrap the NumPy result back into a Series on the original index.
    return pd.Series(filled, index=series.index)

def seasonal_mean(series, season_lag, lr=1.0):
    """Fill each NaN with the scaled mean of the values sharing its seasonal phase.

    For a missing value at position ``i``, the candidates are the non-missing
    values at positions ``i % season_lag, i % season_lag + season_lag, ...``.
    Their mean, multiplied by ``lr``, becomes the fill value. If that phase
    has no non-missing value, the entry stays NaN.

    Args:
        series: Input Series; it is copied, never modified.
        season_lag: Length of one season in positions.
        lr: Multiplier applied to the seasonal mean.

    Returns:
        The filled copy.
    """
    filled = series.copy()
    gap_positions = np.flatnonzero(series.isna().to_numpy()).tolist()

    for pos in gap_positions:
        same_phase = series.iloc[pos % season_lag::season_lag].dropna()
        if same_phase.empty:
            continue
        filled.iloc[pos] = same_phase.mean() * lr
    return filled

def handle_missing(df, percent):
    """Benchmark imputation methods per column and print the best one by R².

    Columns whose fraction of missing values is ``>= percent`` are dropped.
    For every remaining column the longest gap-free run is used as ground
    truth, artificial gaps (random points, then contiguous blocks) are punched
    into it, each imputation method fills them, and R² against the ground
    truth is recorded. A method that leaves any NaN behind scores NaN.

    NOTE: R² is computed over the whole ground-truth segment, i.e. it also
    includes the points that were never masked.

    Args:
        df: DataFrame whose columns are evaluated one by one.
        percent: Missing-fraction cutoff; columns at or above it are dropped.

    Returns:
        None. The summary is printed.
    """
    # Remove columns whose missing fraction reaches the threshold.
    missing_percent = df.isnull().mean()
    df_filtered = df.loc[:, missing_percent < percent]
    # Report the threshold actually used instead of a hard-coded "20%".
    print(f"Filtered all features having missing value >= {percent * 100:g}%")
    print(f"Number of features removed: {len(df.columns)-len(df_filtered.columns)}")

    # Collect rows in a list and build the frame once (no row-by-row growth).
    records = []

    for col in df_filtered.columns:
        # Pass a Series, not a one-column DataFrame: the helpers index
        # positionally and knn_mean builds a 1-D Series from the values.
        ground_truth = longest_notna(df_filtered[col])
        if ground_truth.isna().all():
            # Nothing to benchmark against (also covers an empty segment).
            continue

        missing = create_missing_value(ground_truth)
        missing = create_block_missingness(missing)

        seasonal = seasonal_mean(missing, season_lag=12, lr=1.25)
        imputation_dict = {
            "backward/forward_fill": missing.bfill().ffill(),
            "linear_interpolation": missing.interpolate(method='linear', axis=0, limit_direction='forward'),
            "spline_interpolation": missing.interpolate(method='spline', order=3),
            # Key kept as-is for compatibility; the method applied is 'cubicspline'.
            "quadaric_interpolation": missing.interpolate(method='cubicspline'),
            "knn_mean": knn_mean(missing, 8),
            "seasonal_mean": seasonal,
            "linear_ffill": missing.interpolate(method='linear').ffill(),
            "linear_bfill": missing.interpolate(method='linear').bfill(),
            "spline_bfill_ffill": missing.interpolate(method='spline', order=3).bfill().ffill(),
            "seasonal_linear_ffill": seasonal.interpolate(method='linear').ffill(),
        }

        for key, pred_value in imputation_dict.items():
            if pred_value.isna().any():
                r2 = np.nan
            else:
                r2 = r2_score(ground_truth, pred_value)
            records.append({"Feature": col, "Method": key, "Result": r2})

    result_df = pd.DataFrame(records, columns=["Feature", "Method", "Result"])

    # idxmax cannot pick a winner among all-NaN scores, so rank scored rows only.
    scored = result_df.dropna(subset=["Result"])
    if scored.empty:
        best_methods_df = scored
    else:
        best_methods_idx = scored.groupby('Feature')['Result'].idxmax()
        best_methods_df = scored.loc[best_methods_idx]
    print("Highest R² method per feature:")
    print(best_methods_df[['Feature', 'Method', 'Result']])


In [11]:
class TSImputer:
    """Choose and apply the best imputation method for each column.

    ``fit`` benchmarks every entry of ``self.pipelines`` on artificially
    masked data, column by column, and stores the winner's name in
    ``self.best_methods``. ``transform`` applies the stored method to each
    column it knows about.

    Args:
        strategy: Ranking metric: 'r2' (higher is better), 'mae' or 'mse'
            (lower is better).
        verbose: Print progress messages when True.
        random_state: Optional seed forwarded as ``seed`` to the gap
            generators so that ``fit`` is reproducible. ``None`` (default)
            leaves NumPy's global random state untouched.
    """

    _STRATEGIES = ('r2', 'mae', 'mse')
    # Minimum number of clean observations required to benchmark a column.
    _MIN_CLEAN_LENGTH = 20
    # Fallback when nothing could be benchmarked; must be a key of ``self.pipelines``.
    _DEFAULT_METHOD = 'backward_forward_fill'

    def __init__(self, strategy = 'r2', verbose=True, random_state=None):
        if strategy not in self._STRATEGIES:
            raise ValueError("Strategy must be one of 'r2', 'mae', 'mse'")
        self.strategy = strategy
        self.verbose = verbose
        self.random_state = random_state
        self.best_methods = {}  # column name -> name of the best pipeline
        self._define_pipelines()

    def _define_pipelines(self):
        """Register the candidate methods as ``name -> callable(Series) -> Series``."""
        self.pipelines = {
            "backward_forward_fill": lambda s: s.bfill().ffill(),
            "linear_interpolation": lambda s: s.interpolate(method='linear', limit_direction='both'),
            "spline_interpolation": lambda s: s.interpolate(method='spline', order=3, limit_direction='both'),
            "quadratic_interpolation": lambda s: s.interpolate(method='quadratic', limit_direction='both'),
            "knn_mean_k8": lambda s: knn_mean(s, n=8),
            "seasonal_mean_lag12": lambda s: seasonal_mean(s, season_lag=12, lr=1.0),
            "linear_then_fill_edges": lambda s: s.interpolate(method='linear').bfill().ffill(),
            "spline_then_fill_edges": lambda s: s.interpolate(method='spline', order=3).bfill().ffill(),
            "seasonal_then_linear": lambda s: seasonal_mean(s, season_lag=12, lr=1.0).interpolate(method='linear').bfill().ffill()
        }

        if self.verbose:
            print(f"AutoImputer initialized with {len(self.pipelines)} methods to evaluate.")

    def _score(self, y_true, y_pred):
        """Evaluate ``y_pred`` against ``y_true`` with the configured strategy."""
        if self.strategy == "r2":
            return r2_score(y_true, y_pred)
        if self.strategy == "mae":
            return mean_absolute_error(y_true, y_pred)
        return mean_squared_error(y_true, y_pred)

    def fit(self, df: pd.DataFrame, threshold = 0.5):
        """Find the best imputation method for every sufficiently complete column.

        Columns whose missing fraction is ``>= threshold`` are ignored. For
        each remaining column, gaps are punched into its longest gap-free run
        and every pipeline is scored on the punched positions only.

        Args:
            df: Data to learn from.
            threshold: Missing-fraction cutoff for a column to be considered.

        Returns:
            self, so calls can be chained.
        """
        if self.verbose:
            print("\nStart find best imputation method")

        higher_is_better = self.strategy == 'r2'
        df_to_impute = df.loc[:, df.isna().mean() < threshold]

        for col in df_to_impute:
            if self.verbose:
                print(f"Evaluating columns: {col}")

            ground_truth = longest_notna(df_to_impute[col])

            # Enough clean data is needed for a meaningful evaluation.
            # count() ignores NaN, so an all-NaN segment also lands here.
            if ground_truth.count() < self._MIN_CLEAN_LENGTH:
                self.best_methods[col] = "linear_then_fill_edges"
                if self.verbose:
                    print("   -> Not enough clean data. Defaulting to 'linear_then_fill_edges'.")
                continue

            missing_series = create_missing_value(
                ground_truth, missing_fraction=0.1, seed=self.random_state
            )
            missing_series = create_block_missingness(
                missing_series, n_blocks=2, seed=self.random_state
            )

            # The true values at the punched positions are the evaluation target.
            original_values = ground_truth[missing_series.isnull()]
            if original_values.empty:
                self.best_methods[col] = self._DEFAULT_METHOD
                if self.verbose:
                    print("   -> Failed to create test gaps. Defaulting.")
                continue

            best_method = None
            best_score = -np.inf if higher_is_better else np.inf

            for name, method_func in self.pipelines.items():
                imputed_values = method_func(missing_series).loc[original_values.index]

                if imputed_values.isnull().any():
                    continue  # The method failed to fill every gap; skip it.

                current_score = self._score(original_values, imputed_values)
                if higher_is_better:
                    improved = current_score > best_score
                else:
                    improved = current_score < best_score
                if improved:
                    best_score, best_method = current_score, name

            if best_method is not None:
                self.best_methods[col] = best_method
                if self.verbose:
                    print(f"   -> Best method found: '{best_method}' ({self.strategy.upper()}: {best_score:.4f})")
            else:
                self.best_methods[col] = self._DEFAULT_METHOD
                if self.verbose:
                    print("   -> No suitable method found. Defaulting to 'backward_forward_fill'.")
        return self

    def transform(self, df):
        """Apply the fitted method to each known column of a copy of ``df``.

        Columns without a fitted method are left untouched.

        Raises:
            RuntimeError: If no method has been fitted yet.
        """
        if not self.best_methods:
            raise RuntimeError("You must use 'fit()' before calling 'transform()' ")
        if self.verbose:
            print("\nStarting apply best methods...")

        new_df = df.copy()

        for col, method in self.best_methods.items():
            if col in new_df.columns:
                if self.verbose:
                    print(f"Apply '{method}' to column '{col}'")

                pipeline_func = self.pipelines[method]
                new_df[col] = pipeline_func(new_df[col])

        return new_df

    def fit_transform(self, df, threshold=0.5):
        """Run ``fit`` on ``df`` and return ``transform(df)``."""
        self.fit(df, threshold)
        return self.transform(df)

In [None]:
DATA_PATH = '../'
GREENHOUSE_TEAM = "Digilog"

team_folder_path = os.path.join(DATA_PATH,GREENHOUSE_TEAM)
# load_data returns a dict keyed by CSV file name, one DataFrame per file.
team_df = load_data(team_folder_path)

print(f"Successfully loaded data from {GREENHOUSE_TEAM}")

autoImp = TSImputer()
# fit() returns the imputer itself; fit_transform() is what yields the imputed DataFrame.
imputed_df = autoImp.fit_transform(team_df["GreenhouseClimate.csv"])


  df = pd.read_csv(filepath)


Successfully loaded data from Digilog
AutoImputer initialized with 9 methods to evaluate.

Start find best imputation method
Evaluating columns: time
   -> Best method found: 'spline_interpolation' (R2: 1.0000)
Evaluating columns: AssimLight
   -> Best method found: 'backward_forward_fill' (R2: 0.3827)
Evaluating columns: BlackScr
   -> Best method found: 'linear_interpolation' (R2: 0.3133)
Evaluating columns: CO2air
   -> Best method found: 'linear_interpolation' (R2: 0.4307)
Evaluating columns: Cum_irr
   -> Best method found: 'linear_interpolation' (R2: 0.4639)
Evaluating columns: EC_drain_PC
   -> Best method found: 'backward_forward_fill' (R2: 0.9932)
Evaluating columns: EnScr
   -> Best method found: 'backward_forward_fill' (R2: 0.8567)
Evaluating columns: HumDef
   -> Best method found: 'backward_forward_fill' (R2: 0.7542)
Evaluating columns: PipeGrow
   -> Best method found: 'linear_interpolation' (R2: 0.9428)
Evaluating columns: PipeLow
   -> Best method found: 'linear_interpo

In [61]:
# Location of the data: <DATA_PATH>/<GREENHOUSE_TEAM>/*.csv
DATA_PATH = '../'
GREENHOUSE_TEAM = "Digilog"

team_folder_path = os.path.join(DATA_PATH,GREENHOUSE_TEAM)
# load_data returns a dict keyed by CSV file name, one DataFrame per file.
team_df = load_data(team_folder_path)

print(f"Successfully loaded data from {GREENHOUSE_TEAM}")

# Benchmark the imputation methods on the climate table; columns whose
# missing fraction is not below 0.2 are dropped first. Results are printed.
handle_missing(team_df["GreenhouseClimate.csv"], 0.2)

  df = pd.read_csv(filepath)


Successfully loaded data from Digilog
Filtered all features having missing value > 20%
Number of features removed: 4
🔝 Highest R² method per feature:
                         Feature                  Method    Result
17                    AssimLight            linear_bfill  0.807793
21                      BlackScr    linear_interpolation  0.867427
30                        CO2air   backward/forward_fill  0.915223
42                       Cum_irr    spline_interpolation  0.863713
51                   EC_drain_PC    linear_interpolation  0.999405
63                         EnScr  quadaric_interpolation  0.999769
71                        HumDef    linear_interpolation  0.934009
81                      PipeGrow    linear_interpolation  0.990942
91                       PipeLow    linear_interpolation  0.816963
100                        Rhair   backward/forward_fill  0.979228
111                         Tair    linear_interpolation  0.945754
121                      Tot_PAR    linear_int

In [None]:
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
    """Run an Augmented Dickey-Fuller test on ``timeseries`` and print a summary.

    The lag order is selected with ``autolag="AIC"``. The printed Series holds
    the first four entries of the ``adfuller`` result followed by one
    'Critical Value (...)' entry per item of its critical-value mapping.

    Returns:
        None. The summary is printed.
    """
    print('Result of Augmented Dickey Fuller: ')
    adf_result = adfuller(timeseries, autolag = "AIC")

    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations']
    summary = pd.Series(adf_result[0:4], index=labels)

    # The fifth entry of the result maps significance level -> critical value.
    critical_values = adf_result[4]
    for level, critical in critical_values.items():
        summary['Critical Value (%s)'%level] = critical

    print(summary)

In [40]:
# Stationarity check on the "int_red_sp" column, cast to float with NaNs dropped.
# NOTE(review): `df_climate` is not defined in any cell shown here — confirm it is
# created earlier in the notebook, otherwise this fails on Restart & Run All.
adf_test(df_climate["int_red_sp"].astype("float").dropna())

Result of Augmented Dickey Fuller: 
Test Statistic           -1.313535e+01
p-value                   1.466031e-24
#Lags Used                5.300000e+01
Number of Observations    4.680100e+04
Critical Value (1%)      -3.430490e+00
Critical Value (5%)      -2.861602e+00
Critical Value (10%)     -2.566803e+00
dtype: float64
