In [2]:
import os
import pandas as pd
import numpy as np


In [3]:
def load_data(dir):
    data_dict = {}
    if not os.path.exists(dir):
        raise FileExistsError(f"Directory not found.")
    for filename in os.listdir(dir):
        if filename.endswith('.csv'):
            filepath = os.path.join(dir, filename)
            df = pd.read_csv(filepath)
            cleaned_column = {col: col.strip('%').strip() for col in df.columns}
            df.rename(columns=cleaned_column, inplace =True)
            df = df.astype("float")
            data_dict[filename] = df
    return data_dict

In [4]:
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def create_missing_value(df, missing_fraction=0.1, seed=None):    
    df_with_missing = df.copy()
    if seed is not None:
        np.random.seed(seed)
    mask = np.random.rand(*df.shape) < missing_fraction
    df_with_missing = df_with_missing.mask(mask)
    
    return df_with_missing

def longest_notna(df):
    notna_mask = df.notna().all(axis=1)  # Ensure all columns are not NaN
    group_id = (notna_mask != notna_mask.shift()).cumsum()

    # Filter only the True blocks (where all values are not NaN)
    valid_blocks = df[notna_mask].copy()
    valid_blocks['group'] = group_id[notna_mask]

    # Find the longest group
    longest_group = valid_blocks['group'].value_counts().idxmax()

    # Return the longest block
    return valid_blocks[valid_blocks['group'] == longest_group].drop(columns='group')

def knn_mean(ts, n=4):
    ts = np.array(ts)
    out = np.copy(ts)
    for i, val in enumerate(ts):
        if np.isnan(val):
            lower = max(0, i - n)
            upper = min(len(ts), i + n + 1)
            neighbors = ts[lower:i].tolist() + ts[i+1:upper].tolist()
            neighbors = [x for x in neighbors if not np.isnan(x)]
            out[i] = np.mean(neighbors) if neighbors else np.nan
    return out

def seasonal_mean(series, season_lag, lr=0.7):

    out = series.copy()
    for i in range(len(series)):
        if pd.isna(series.iloc[i]):
            # Previous seasonal values
            prev_seasons = series.iloc[i - season_lag::-season_lag]
            if pd.isna(prev_seasons.mean()):
                # Combine previous and forward seasonal values
                forward_seasons = series.iloc[i + season_lag::season_lag]
                seasonal_values = pd.concat([prev_seasons, forward_seasons])
            else:
                seasonal_values = prev_seasons
            out.iloc[i] = seasonal_values.mean() * lr
    return out

def handle_missing(df, percent):
    # Remove columns have missing data above threshhold.
    missing_percent = df.isnull().mean()
    df_filtered = df.loc[:, missing_percent<percent]
    print(f"Filtered all features having missing value > 20%")
    print(f"Number of features removed: {len(df.columns)-len(df_filtered.columns)}")

    notna_mask = df_filtered.notna()

    missing_df = create_missing_value(df_filtered.loc(notna_mask))

    for col in missing_df.columns:

        imputation_dict = {}
        imputation_dict["backward/forward_fill"] = missing_df[col].bfill().ffill()
        imputation_dict["linear_interpolation"] = missing_df[col].interpolate(method = 'linear', axis = 0, limit_direction = 'forward')
        imputation_dict["quadaric_interpolation"] = missing_df[col].interpolate(method = 'cubicspline')
        imputation_dict["knn_mean"] = knn_mean(missing_df[col], 8)
        imputation_dict["seasonal_mean"] = seasonal_mean(missing_df[col], season_lag = 12, lr = 1.25)

        for key, value in imputation_dict.items():
            print(f"Evaluation from {key}")
            print(f"MAE: {mean_absolute_error(missing_df[col],value)}")
            print(f"MSE: {mean_squared_error(missing_df[col],value)}")
            print(f"R^2: {r2_score(missing_df[col],value)}")
        


In [5]:
DATA_PATH = '../'
GREENHOUSE_TEAM = "Digilog"
team_folder_path = os.path.join(DATA_PATH,GREENHOUSE_TEAM)
team_df = load_data(team_folder_path)
df = team_df["GreenhouseClimate.csv"][["CO2air"]]
# Get the longest block with all notna values
clean_block = longest_notna(df)
print(clean_block)

  df = pd.read_csv(filepath)


       CO2air
0       474.0
1       470.0
2       482.0
3       472.0
4       469.0
...       ...
20546   663.0
20547   683.0
20548   726.0
20549   719.0
20550   694.0

[20551 rows x 1 columns]


In [None]:
DATA_PATH = '../'
GREENHOUSE_TEAM = "Digilog"

team_folder_path = os.path.join(DATA_PATH,GREENHOUSE_TEAM)
team_df = load_data(team_folder_path)

print(f"Successfully loaded data from {GREENHOUSE_TEAM}")
team_df["GreenhouseClimate.csv"]["CO2air"].head(5)
handle_missing(team_df["GreenhouseClimate.csv"], 0.2)

  df = pd.read_csv(filepath)


Successfully loaded data from Digilog
Filtered all features having missing value > 20%
Number of features removed: 4


TypeError: unhashable type: 'DataFrame'

: 

In [None]:
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
    print('Result of Augmented Dickey Fuller: ')
    df_test= adfuller(timeseries, autolag = "AIC")
    df_output = pd.Series(df_test[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations'])
    for key, value in df_test[4].items():
        df_output['Critical Value (%s)'%key] = value
    print(df_output)

In [40]:
adf_test(df_climate["int_red_sp"].astype("float").dropna())

Result of Augmented Dickey Fuller: 
Test Statistic           -1.313535e+01
p-value                   1.466031e-24
#Lags Used                5.300000e+01
Number of Observations    4.680100e+04
Critical Value (1%)      -3.430490e+00
Critical Value (5%)      -2.861602e+00
Critical Value (10%)     -2.566803e+00
dtype: float64
