# Python implementation of the gap-filling method of the R package hyfo (fillGap) for filling gaps in time series data.

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import plotly.graph_objs as go

# --- Helper functions to mimic hyfo gap filling ---

def fill_gap_cor(data, corPeriod, dates):
    """
    Build the correlation matrix that is later used to rank predictors.

    Parameters
    ----------
    data : DataFrame holding one column per series.
    corPeriod : 'monthly' correlates per-(year, month) means, 'yearly'
        correlates per-year means; any other value correlates the rows
        of ``data`` unchanged (the 'daily' case).
    dates : datetime Series carrying the date of each row of ``data``;
        only read through its ``.dt`` accessor for the aggregated modes.

    Returns
    -------
    DataFrame with the pairwise correlations, computed after discarding
    every row that has a missing value in any column (like na.omit in R).
    """
    # Select the grouping keys for the requested period; None means "no aggregation".
    if corPeriod == 'monthly':
        key_columns = {'year': dates.dt.year, 'month': dates.dt.month}
        group_by = ['year', 'month']
    elif corPeriod == 'yearly':
        key_columns = {'year': dates.dt.year}
        group_by = 'year'
    else:  # daily
        key_columns = None

    if key_columns is None:
        corData = data.copy()
    else:
        corData = None
        for name in data.columns:
            # Average this series within each period ...
            frame = pd.DataFrame({**key_columns, 'val': data[name]})
            period_mean = frame.groupby(group_by)['val'].mean().to_frame(name)
            # ... and line it up with the series aggregated so far.
            if corData is None:
                corData = period_mean
            else:
                corData = corData.join(period_mean, how='inner')

    complete_rows = corData.dropna()
    return complete_rows.corr()


def compute_cor_order(corN):
    """
    Rank the candidate predictors of every series.

    Takes the correlation matrix ``corN`` and returns a dict that maps each
    column name to the list of all *other* column names, ordered from the
    highest to the lowest correlation with that column.
    """
    return {
        target: [
            name
            for name in corN[target].sort_values(ascending=False).index.tolist()
            if name != target  # a series never predicts itself
        ]
        for target in corN.columns
    }


def compute_lm_coef(data):
    """
    Fit a through-the-origin regression for every ordered pair of columns.

    The entry at (row=target, column=predictor) is the slope of
    ``target = slope * predictor``, i.e. sum(x*y) / sum(x*x) with x the
    predictor and y the target, using only the rows where both are present.
    The entry is NaN when the two columns share no rows or when sum(x*x) is 0.
    The diagonal is 1.

    Returns a square DataFrame indexed and labelled by ``data.columns``.
    """
    names = data.columns
    slopes = np.eye(len(names))  # diagonal stays at 1
    for row, target in enumerate(names):
        for col, predictor in enumerate(names):
            if row == col:
                continue
            # Keep only the rows where both series have a value.
            pair = data[[target, predictor]].dropna()
            slope = np.nan
            if len(pair) > 0:
                x = pair[predictor].values
                y = pair[target].values
                sum_xx = np.dot(x, x)
                if sum_xx != 0:
                    slope = np.dot(x, y) / sum_xx
            slopes[row, col] = slope
    return pd.DataFrame(slopes, index=names, columns=names)


def reorder_lm_coef(lm_coef, corOrder):
    """
    Line the regression slopes up with the ranked predictor lists.

    For every target in ``corOrder`` the slopes are read from row ``target``
    of ``lm_coef`` in the same order as that target's predictor list, so
    position k of the returned list belongs to predictor k.

    Returns a dict mapping target -> list of coefficients.
    """
    return {
        target: [lm_coef.loc[target, predictor] for predictor in ranked]
        for target, ranked in corOrder.items()
    }


def fill_column(target_series, data, target_name, corOrder, lm_coef_ordered):
    """
    Fill the gaps of one series from its predictors, best-correlated first.

    Predictors are tried in the order given by ``corOrder[target_name]``.
    Wherever the target is still missing and the current predictor has a
    value, the gap is set to ``round(coef * predictor_value, 3)`` with
    ``coef`` the matching entry of ``lm_coef_ordered[target_name]``.

    Parameters
    ----------
    target_series : Series to fill; it is copied, never modified.
    data : DataFrame containing the predictor columns. Predictor values are
        looked up by the index labels of ``target_series``.
    target_name : key of the target in ``corOrder`` / ``lm_coef_ordered``.
    corOrder : dict target -> predictor names ordered by preference.
    lm_coef_ordered : dict target -> coefficients in that same order.

    Returns
    -------
    A new Series without missing values.

    Raises
    ------
    ValueError
        If gaps remain after every predictor was tried. This includes the
        case of a target with gaps and no predictors at all, which used to
        be returned unfilled without any warning.
    """
    ts = target_series.copy()
    for pred, coef in zip(corOrder[target_name], lm_coef_ordered[target_name]):
        missing = ts.isna().to_numpy()
        if not missing.any():
            break
        # Predictor values for the rows of the target, as a plain array so the
        # masks below are purely positional.
        predictor = data.loc[ts.index, pred]
        fillable = missing & predictor.notna().to_numpy()
        if fillable.any():
            # A NaN coefficient produces NaN here; the gap then simply stays
            # open for the next predictor.
            ts.iloc[fillable] = (coef * predictor.to_numpy()[fillable]).round(3)
    # Checked after the loop so that an empty predictor list is covered too.
    if ts.isna().any():
        raise ValueError(f"Error: At some time instance all predictors are missing for {target_name}.")
    return ts


def fill_gap(df, corPeriod='daily'):
    """
    Main gap-filling function mimicking hyfo's fillGap().

    Steps:
      - Treat the first column of ``df`` as the date and the rest as series.
      - Compute the correlation matrix (optionally on period means).
      - Rank the predictors of each series by descending correlation.
      - Compute no-intercept regression coefficients between each pair.
      - Fill the missing values of every series from its ranked predictors.

    Parameters
    ----------
    df : DataFrame whose first column holds the dates. It is not modified.
    corPeriod : period passed on to fill_gap_cor ('daily', 'monthly' or
        'yearly').

    Returns
    -------
    A new DataFrame: the date column (parsed to datetime, unparseable
    entries become NaT) followed by the filled series.

    Raises
    ------
    ValueError
        If the first cell of the date column contains neither '-' nor '/'.
        Whatever fill_column raises is propagated unchanged.
    """
    # Cheap sanity check on the first cell only: it must look like a date.
    date_col = df.columns[0]
    if not any(sep in str(df.iloc[0, 0]) for sep in ['-', '/']):
        raise ValueError("First column is not date or wrong date format. Check the format and convert using as.Date.")

    # Parse into a separate Series instead of assigning back into ``df``, so
    # the caller's DataFrame is left untouched.
    dates = pd.to_datetime(df[date_col], errors='coerce')
    data = df.drop(columns=[date_col])

    # Correlation matrix (rows with any missing value are dropped there)
    corN = fill_gap_cor(data, corPeriod, dates)

    # For each target, predictors sorted by descending correlation
    corOrder = compute_cor_order(corN)

    # Zero-intercept regression coefficients, arranged in that same order
    lm_coef = compute_lm_coef(data)
    lm_coef_ordered = reorder_lm_coef(lm_coef, corOrder)

    # Fill each column from the original (unfilled) data of the others
    filled_data = {
        col: fill_column(data[col], data, col, corOrder, lm_coef_ordered)
        for col in data.columns
    }

    filled_df = pd.DataFrame(filled_data)
    filled_df.insert(0, date_col, dates)
    return filled_df

# --- Main processing: reading, merging, filling, plotting and saving ---

def _load_incomplete_csv(path):
    """
    Read one input CSV, name its first column "Date" and parse it as datetime.

    Returns ``(frame, has_data)``: the full frame, and a boolean Series that
    is True for the rows that had at least one non-missing cell in the raw
    file (evaluated before the dates are parsed).
    """
    frame = pd.read_csv(path)
    has_data = frame.notna().any(axis=1)
    columns = frame.columns.tolist()
    columns[0] = "Date"
    frame.columns = columns
    # Unparseable dates become NaT instead of raising.
    frame["Date"] = pd.to_datetime(frame["Date"], errors="coerce")
    return frame, has_data


def main():
    """
    Process all files in "Incomplete_data":
      - Read each "*-incomplete.csv" file (a date column followed by other
        columns, one of which contains both "Total" and "SF" in its name).
      - Rename that Total SF column to a unique name based on the filename.
      - Outer-merge all sapflow series on Date.
      - Fill gaps using the hyfo-like method (fill_gap, 'daily').
      - For each file, write the filled sapflow back into a copy of the
        original table and save it to "Complete_data" and to
        "ANALYSIS/Input_files" four directory levels above the current
        working directory.
      - Plot (using Plotly) the original versus filled time series.

    Files without a Total SF column are reported and skipped.
    """
    from pathlib import Path  # local import: only needed by this entry point

    incomplete_folder = "Incomplete_data"
    complete_folder = "Complete_data"
    os.makedirs(complete_folder, exist_ok=True)

    # Get all CSV files matching "*-incomplete.csv"
    files = glob.glob(os.path.join(incomplete_folder, "*-incomplete.csv"))
    if not files:
        print("No files found in Incomplete_data folder.")
        return

    merged_list = []
    file_info = {}  # maps each unique sapflow column name to its source file info
    for file in files:
        # Each file is read once; the full table is kept for the write-back step.
        original_df, has_data = _load_incomplete_csv(file)

        # Find the Total SF column (it might be labeled in different ways)
        sapflow_col = next(
            (col for col in original_df.columns if "Total" in col and "SF" in col),
            None,
        )
        if sapflow_col is None:
            print(f"File {file} does not have a Total SF column.")
            continue

        # Unique name from the filename, e.g.
        # "complicatedname-incomplete.csv" -> "sapflow_complicatedname"
        unique_name = os.path.basename(file).replace("-incomplete.csv", "")
        new_name = "sapflow_" + unique_name

        # For merging only Date and the sapflow column are needed; rows that
        # were completely empty in the file are left out.
        df_merge = original_df.loc[has_data, ["Date", sapflow_col]].rename(
            columns={sapflow_col: new_name}
        )
        merged_list.append(df_merge)

        file_info[new_name] = {
            "path": file,
            "original_df": original_df,
            "original_sapflow_col": sapflow_col
        }

    if not merged_list:
        print("No valid files to merge.")
        return

    # Merge all sapflow series on Date (outer join so that no time point is lost)
    merged_df = merged_list[0]
    for df in merged_list[1:]:
        merged_df = pd.merge(merged_df, df, on="Date", how="outer")
    merged_df = merged_df.sort_values("Date").reset_index(drop=True)

    # Apply the gap-filling algorithm (using 'daily' as the period)
    filled_merged = fill_gap(merged_df, corPeriod='daily')
    filled_by_date = filled_merged.set_index("Date")

    # Second output location: parents[3] is the directory four levels above
    # the current working directory. It is the same for every file, so it is
    # resolved once. The directory is expected to exist already.
    analysis_dir = Path.cwd().parents[3] / "ANALYSIS" / "Input_files"

    # For each file, extract its filled sapflow series and merge back into its original data
    for col_name, info in file_info.items():
        filled_series = filled_by_date[col_name]
        original_df = info["original_df"]
        sapflow_col = info["original_sapflow_col"]

        # Left join on Date so every original row is preserved, then replace
        # the Total SF column with the filled values.
        updated_df = pd.merge(original_df, filled_series.rename("Filled"), left_on="Date", right_index=True, how="left")
        updated_df[sapflow_col] = updated_df["Filled"]
        updated_df = updated_df.drop(columns=["Filled"])

        # Save to the Complete_data folder (changing '-incomplete' to '-complete' in filename)
        base = os.path.basename(info["path"])
        new_filename = base.replace("-incomplete.csv", "-complete.csv")
        out_path = os.path.join(complete_folder, new_filename)
        updated_df.to_csv(out_path, index=False)
        print(f"Saved: {out_path}")

        # Copy for the ANALYSIS input folder
        updated_df.to_csv(analysis_dir / new_filename, index=False)

        # Create a Plotly plot comparing original vs filled for this sapflow series
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=original_df["Date"], y=original_df[sapflow_col],
                                 mode="lines", name="Original"))
        fig.add_trace(go.Scatter(x=updated_df["Date"], y=updated_df[sapflow_col],
                                 mode="lines", name="Filled"))
        fig.update_layout(title=f"{col_name} - Total SF (cm3/h) Time Series: Original vs Filled",
                          xaxis_title="Date", yaxis_title=sapflow_col)
        fig.show()
        

# Run the whole pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Saved: Complete_data/SF_W_SFM1I60T-DF03US-2022-complete.csv


Saved: Complete_data/SF_W_SFM1I60Q-DF21LS-2022-complete.csv


Saved: Complete_data/SF_W_SFM1J20P-DF49GT-2022-complete.csv


Saved: Complete_data/SF_W_SFM1K308-ES50LS-2022-complete.csv


Saved: Complete_data/SF_W_SFM1J20O-DF27US-2022-complete.csv


Saved: Complete_data/SF_W_SFM1J406-ES01US-2022-complete.csv


Saved: Complete_data/SF_W_SFM1I60R-ES48GT-2022-complete.csv


Saved: Complete_data/SF_W_SFM1J20N-ES42US-2022-complete.csv
