In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer
import os 

In [2]:
def outlier_removal(df, column):
    """Replace sentinel values and percentile-based outliers in ``column`` with NaN.

    The procedure is:

    1. Values equal to ``-1`` are treated as missing and replaced by NaN.
    2. The remaining values are normalised by their maximum.
    3. A lower threshold is taken from the percentiles 0.1..2 (20 steps): the
       largest jump between two consecutive percentiles is located and the
       threshold is the midpoint of that jump. The upper threshold is obtained
       the same way from the percentiles 98..100.
    4. Values strictly below the lower threshold or strictly above the upper
       threshold are replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    column : str
        Name of the numeric column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``column`` is float-typed, with sentinels and
        outliers set to NaN. If the column has no valid value, or if its
        maximum is 0 (normalisation impossible), only the sentinel replacement
        is applied.
    """
    # Work on a copy so the caller's frame is never altered, and force a float
    # column so that NaN can be written without an implicit dtype change.
    df_filtered = df.copy()
    df_filtered[column] = df_filtered[column].replace(-1, np.nan).astype(float)

    valid = df_filtered[column].notna()
    r = df_filtered.loc[valid, column].to_numpy(dtype=float)

    if r.size == 0:
        print("Coluna não contém valores suficientes para análise.")
        return df_filtered

    r_max = np.max(r)
    if r_max == 0:
        # Dividing by zero would turn every value into NaN/inf.
        return df_filtered
    r = r / r_max

    # Lower tail: midpoint of the largest gap between consecutive percentiles.
    perc_min = np.percentile(r, np.linspace(0.1, 2, 20))
    index_min = np.argmax(np.diff(perc_min))
    thres_min = np.mean(perc_min[index_min:index_min + 2])

    # Upper tail: same rule on the top two percent.
    perc_max = np.percentile(r, np.linspace(98, 100, 20))
    index_max = np.argmax(np.diff(perc_max))
    thres_max = np.mean(perc_max[index_max:index_max + 2])

    r_filtered = np.where((r < thres_min) | (r > thres_max), np.nan, r)

    # Undo the normalisation before writing the values back.
    df_filtered.loc[valid, column] = r_filtered * r_max

    return df_filtered


In [3]:
def impute_knn(df, k=5):
    """Fill missing 'Throughput' values with scikit-learn's KNNImputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'Throughput' column; it is left untouched.
    k : int, default 5
        Value passed to ``KNNImputer(n_neighbors=...)``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the imputed 'Throughput' column.
    """
    # NOTE(review): only the 'Throughput' column is handed to the imputer, so
    # no other feature is available to measure neighbour distance — confirm
    # this is the intended setup.
    result = df.copy()
    knn = KNNImputer(n_neighbors=k)
    result['Throughput'] = knn.fit_transform(result[['Throughput']])
    return result

# def impute_rolling_median(df, window_size=3):
#     df_copy = df.copy()
#     previous_na_count = df_copy['Throughput'].isna().sum()
#     while df_copy['Throughput'].isna().any():
#         df_copy['Throughput'] = df_copy['Throughput'].fillna(df_copy['Throughput'].rolling(window=window_size, center=True, min_periods=1).median())
#         current_na_count = df_copy['Throughput'].isna().sum()
#         if current_na_count >= previous_na_count:
#             break  # No progress made, so exit the loop
#         previous_na_count = current_na_count
#     # global_median = df_copy['Throughput'].median()
#     # df_copy['Throughput'] = df_copy['Throughput'].fillna(global_median)
#     return df_copy

# def impute_rolling_average(df, window_size=3):
#     df_copy = df.copy()
#     previous_na_count = df_copy['Throughput'].isna().sum()
#     while df_copy['Throughput'].isna().any():
#         df_copy['Throughput'] = df_copy['Throughput'].fillna(df_copy['Throughput'].rolling(window=window_size, center=True, min_periods=1).mean())
#         current_na_count = df_copy['Throughput'].isna().sum()
#         if current_na_count >= previous_na_count:
#             break  # No progress made, so exit the loop
#         previous_na_count = current_na_count
#     # global_mean = df_copy['Throughput'].mean()
#     # df_copy['Throughput'] = df_copy['Throughput'].fillna(global_mean)
#     return df_copy

# def impute_rolling_median(df, window_size=3):
#     df_copy = df.copy()
#     while df_copy['Throughput'].isna().any():
#         df_copy['Throughput'] = df_copy['Throughput'].fillna(df_copy['Throughput'].rolling(window=window_size, center=True, min_periods=1).median())
#     return df_copy

# def impute_rolling_average(df, window_size=3):
#     df_copy = df.copy()
#     while df_copy['Throughput'].isna().any():
#         df_copy['Throughput'] = df_copy['Throughput'].fillna(df_copy['Throughput'].rolling(window=window_size, center=True, min_periods=1).mean())
#     return df_copy

def impute_rolling_median(df, window_size=3):
    """Fill missing 'Throughput' values with a trailing rolling median.

    Each NaN is replaced by the median of the valid values in the window of
    ``window_size`` rows ending at that row. Gaps that are still empty after
    that (windows with no valid value) receive the median of the whole column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'Throughput' column. It is not modified.
    window_size : int, default 3
        Size of the trailing window.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the imputed 'Throughput' column.
    """
    df_imputed = df.copy()
    throughput = df_imputed['Throughput']
    rolling_median = throughput.rolling(window=window_size, min_periods=1).median()
    throughput = throughput.fillna(rolling_median)
    # The global median is taken after the rolling fill, as a last resort.
    df_imputed['Throughput'] = throughput.fillna(throughput.median())
    return df_imputed

def impute_rolling_average(df, window_size=3):
    """Fill missing 'Throughput' values with a trailing rolling mean.

    Each NaN is replaced by the mean of the valid values in the window of
    ``window_size`` rows ending at that row. Gaps that are still empty after
    that (windows with no valid value) receive the mean of the whole column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'Throughput' column. It is not modified.
    window_size : int, default 3
        Size of the trailing window.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the imputed 'Throughput' column.
    """
    df_imputed = df.copy()
    throughput = df_imputed['Throughput']
    rolling_mean = throughput.rolling(window=window_size, min_periods=1).mean()
    throughput = throughput.fillna(rolling_mean)
    # The global mean is taken after the rolling fill, as a last resort.
    df_imputed['Throughput'] = throughput.fillna(throughput.mean())
    return df_imputed

def linear_interpolation(df, limit_direction='both', method='linear'):
    """Fill missing 'Throughput' values by interpolation.

    Only the 'Throughput' column is interpolated, so non-numeric columns
    (for example timestamp strings) are never passed to the interpolator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a 'Throughput' column. It is not modified.
    limit_direction : str, default 'both'
        Forwarded to ``Series.interpolate``.
    method : str, default 'linear'
        Forwarded to ``Series.interpolate``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the interpolated 'Throughput' column.
    """
    df_imputed = df.copy()
    df_imputed['Throughput'] = df_imputed['Throughput'].interpolate(
        method=method, limit_direction=limit_direction
    )
    return df_imputed


In [4]:
def decomposicao_svd(df):
    """Return the full singular value decomposition ``(U, S, Vt)`` of ``df``.

    ``df`` may be anything ``numpy.linalg.svd`` accepts as a 2-D matrix;
    ``full_matrices=True`` is used, so ``U`` and ``Vt`` are square.
    """
    decomposition = np.linalg.svd(df, full_matrices=True)
    left_vectors, singular_values, right_vectors_t = decomposition
    return left_vectors, singular_values, right_vectors_t

def grafico_variabilidade(variabilidade, S):
    """Plot the cumulative variability against the number of singular values.

    Parameters
    ----------
    variabilidade : sequence of float
        Cumulative variability, one entry per singular value.
    S : array-like
        Accepted for interface compatibility; not used by the plot.
    """
    posicoes = range(1, len(variabilidade) + 1)
    estilo = dict(
        marker='o',
        markersize=1,
        markerfacecolor='teal',
        markeredgecolor='teal',
        color='darkturquoise',
    )
    plt.plot(posicoes, variabilidade, **estilo)
    plt.xlabel('Número de Valores Singulares')
    plt.ylabel('Variabilidade Acumulada')
    plt.title('Valores Singulares por Variabilidade Acumulada')
    plt.grid(color='lightgray', alpha=0.7)
    plt.show()

def componentes_principais(r, U, S, Vt):
    """Keep only the first ``r`` components of an SVD.

    Returns the first ``r`` columns of ``U``, the first ``r`` singular values
    of ``S`` and the first ``r`` rows of ``Vt``, in that order.
    """
    leading = slice(None, r)
    return U[:, leading], S[leading], Vt[leading, :]
# Turns the matrix back into a single-column DataFrame.
def matriztodf(dataframes):
    """Flatten a matrix column by column into a one-column DataFrame.

    Parameters
    ----------
    dataframes : array-like or pandas.DataFrame
        Two-dimensional data.

    Returns
    -------
    pandas.DataFrame
        Frame with a single 'Throughput' column that contains the first
        column of the input, followed by the second column, and so on.
        An empty input yields an empty 'Throughput' column.
    """
    values = pd.DataFrame(dataframes).to_numpy()
    # Column-major ("Fortran") order walks down each column in turn.
    flat = values.ravel(order='F')
    return pd.DataFrame(flat.reshape(-1, 1), columns=['Throughput'])

# Computes the RMSE between the values produced by the SVD and the 'original' values.
def calcular_rmse(df1, df2, coluna):
    """Root-mean-square error of ``coluna`` over the index labels shared by both frames."""
    shared = df1.index.intersection(df2.index)
    erro = df1.loc[shared, coluna] - df2.loc[shared, coluna]
    return np.sqrt(np.mean(erro ** 2))

# Writes the final CSV file with the imputed values.
def gerar_arq_csv(df1, df2, caminho_base, nome_arquivo_csv):
    """Attach the imputed 'Throughput' column to ``df1`` and save it as CSV.

    The file is written to ``<caminho_base>/svd/<nome_arquivo_csv>``; the
    ``svd`` folder is created when missing.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with the non-throughput columns. It is not modified.
    df2 : pandas.DataFrame
        Frame holding the imputed 'Throughput' column, aligned with ``df1``
        by index.
    caminho_base : str
        Base output directory.
    nome_arquivo_csv : str
        Name of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to disk.
    """
    resultado = df1.assign(Throughput=df2['Throughput'])
    # Drop the trailing rows for which no imputation was produced.
    resultado = resultado.dropna(subset=['Throughput'])

    caminho_svd = os.path.join(caminho_base, 'svd')
    os.makedirs(caminho_svd, exist_ok=True)
    caminho_arquivo_csv = os.path.join(caminho_svd, nome_arquivo_csv)

    resultado.to_csv(caminho_arquivo_csv, index=False)
    print(f"CSV file '{caminho_arquivo_csv}' generated!")
    return resultado

def matriz(path):
    """Load a CSV and arrange its 'Throughput' series as 28-row matrices.

    The file is read, cleaned with ``outlier_removal`` and the series is cut
    into consecutive chunks of 28 samples; each chunk becomes one matrix
    column. Samples beyond the last complete chunk are ignored.

    Returns
    -------
    tuple
        ``(matriz_original, matriz_mascara, matriz_interpolado, df_datetime)``:
        the matrix with NaN at the missing entries, a boolean matrix marking
        those entries, the matrix built from the linearly interpolated series,
        and the cleaned frame without its 'Throughput' column.
    """
    dados = outlier_removal(pd.read_csv(path), 'Throughput')
    df_datetime = dados.drop(columns=['Throughput'])
    dados['Throughput'] = dados['Throughput'].replace(-1, np.nan)

    num_colunas = len(dados['Throughput'].values) // 28
    usados = num_colunas * 28

    def _empilhar(valores):
        # One chunk of 28 samples per matrix column.
        return pd.DataFrame(valores[:usados].reshape(num_colunas, 28).T)

    matriz_original = _empilhar(dados['Throughput'].values)

    serie_interpolada = dados["Throughput"].interpolate(method='linear', limit_direction='both')
    matriz_interpolado = _empilhar(serie_interpolada.values)

    matriz_mascara = pd.DataFrame(np.isnan(matriz_original.values))
    return matriz_original, matriz_mascara, matriz_interpolado, df_datetime

In [5]:
def get_dataset_path_list(diretory):
    """Return the path of every entry of ``diretory``, in ``os.listdir`` order."""
    return [os.path.join(diretory, entry) for entry in os.listdir(diretory)]

In [None]:


def apply_basic_imputations(source_path, destination_path, csv_path_list):
    """Run the four basic imputations on every CSV and save one file per technique.

    For each path in ``csv_path_list`` the file is read, skipped when it has
    fewer than 28 rows, cleaned with ``outlier_removal`` and then imputed with
    KNN, rolling median, rolling mean and linear interpolation. Each result is
    written below ``destination_path/<technique>/`` using the file's path
    relative to ``source_path``.
    """
    for caminho in csv_path_list:
        df = pd.read_csv(caminho)

        if df.shape[0] < 28:
            print(f'The {caminho} file does not have sufficient quantity of lines for imputation (28)')
            continue

        df = outlier_removal(df, 'Throughput')

        # Technique folder name -> imputed frame (insertion order is kept).
        imputed = {
            'knn': impute_knn(df.copy()),
            'mediana-movel': impute_rolling_median(df.copy()),
            'media-movel': impute_rolling_average(df.copy()),
            'interpolacao-linear': linear_interpolation(df.copy()),
        }

        # Create the per-technique output directories if they don't exist.
        for technique in imputed:
            technique_dir = os.path.join(destination_path, technique)
            if not os.path.exists(technique_dir):
                os.makedirs(technique_dir)
                print(f"Saving path for {technique} created.")

        # Build the output file paths, mirroring the layout of the source folder.
        relative_path = os.path.relpath(caminho, source_path)
        outputs = {
            technique: os.path.join(destination_path, technique, relative_path)
            for technique in imputed
        }

        # Make sure nested output directories exist.
        for output_path in outputs.values():
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Save every imputed frame.
        for technique, frame in imputed.items():
            frame.to_csv(outputs[technique], index=False)

        print(f"Processed file: {caminho}")


# Original note (translated): add a check for files with fewer than 28 rows ->
# remove them, because otherwise only linear interpolation would be left?
def apply_svd_imputation(destination_path, csv_path_list):
    """Impute every CSV with an iterative low-rank SVD and save the result.

    For each file with at least 28 rows:

    1. ``matriz`` builds the matrix with gaps and a linearly interpolated
       version that serves as the starting point.
    2. The current matrix is decomposed and truncated to the smallest rank
       that reaches 95% of the cumulative squared singular values.
    3. The gaps of the original matrix are filled with the low-rank
       approximation and the filled matrix becomes the next starting point.
    4. Steps 2-3 repeat until the RMSE between two consecutive approximations
       drops below 1e-3 or ``max_iter`` (300) iterations have run.

    The final matrix is flattened with ``matriztodf`` and written through
    ``gerar_arq_csv``.
    """
    max_iter = 300
    porcentagem_variabilidade = 0.95

    for caminho_csv in csv_path_list:

        df = pd.read_csv(caminho_csv)

        if df.shape[0] < 28:
            print(f'The {caminho_csv} file does not have sufficient quantity of lines for imputation (28)')
            continue

        nome_arquivo = os.path.basename(caminho_csv)

        df_matriz, _df_mask, df_interpolado, df_datetime = matriz(caminho_csv)

        matriz_atual = df_interpolado
        A_anterior = df_interpolado.values.copy()
        df_matriz_preenchida = None
        rmse = float('inf')
        n_iter = 0

        # Strict '<' so that at most max_iter iterations are executed.
        while rmse >= 1e-3 and n_iter < max_iter:
            U, S, Vt = decomposicao_svd(matriz_atual)
            variabilidade = np.cumsum(S**2) / np.sum(S**2)

            # Smallest rank whose cumulative variability reaches the target.
            r = np.where(variabilidade >= porcentagem_variabilidade)[0][0] + 1

            U_reduzido, S_reduzido, Vt_reduzido = componentes_principais(r, U, S, Vt)
            A_aproximada = np.dot(np.dot(U_reduzido, np.diag(S_reduzido)), Vt_reduzido)

            # Only the missing entries take values from the approximation.
            df_matriz_preenchida = df_matriz.fillna(pd.DataFrame(A_aproximada))

            # The filled matrix feeds the next iteration.
            matriz_atual = df_matriz_preenchida.values

            # RMSE between the current and the previous approximation.
            rmse = np.sqrt(np.mean((A_aproximada - A_anterior) ** 2))
            A_anterior = A_aproximada

            n_iter += 1

        svd = matriztodf(df_matriz_preenchida)
        gerar_arq_csv(df_datetime, svd, destination_path, nome_arquivo)
        

In [9]:
def apply_all_imputations(source_path, destination_path):
    """Run the enabled imputation pipelines on every file found in ``source_path``.

    Only the SVD imputation is currently active; the basic imputations are
    kept below as a disabled call.
    """
    dataset_paths = get_dataset_path_list(source_path)
    # apply_basic_imputations(source_path, destination_path, dataset_paths)
    apply_svd_imputation(destination_path, dataset_paths)

In [7]:
# Folder whose files are imputed, and folder that receives the SVD results.
source_path = '../datasets/choosen-best-svd/'
destination_path = '../datasets/svd-imputed-choosen-best-svd/'


In [None]:
# csv_path_list = get_dataset_path_list(source_path)

In [10]:
# Impute every file listed in source_path and write the CSVs under destination_path.
apply_all_imputations(source_path, destination_path)

CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data ap-ba 07-03-2023_longest_interval.csv' generated!
CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data ba-go 07-03-2023_longest_interval.csv' generated!
CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data ba-pa 07-03-2023_longest_interval.csv' generated!
CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data ba-pe 07-03-2023_longest_interval.csv' generated!
CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data ce-am 07-03-2023_longest_interval.csv' generated!
CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data ce-ro 07-03-2023_longest_interval.csv' generated!
CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data es-pr 07-03-2023_longest_interval.csv' generated!
CSV file '../datasets/svd-imputed-choosen-best-svd/svd\treated bbr esmond data go-se 07-03-2023_l

In [None]:
def gabriel_eigen_impute(X, tol=1e-4, max_iter=100, m=None):
    """
    Perform imputation on the matrix X using the regularized GabrielEigen method.

    Missing entries start at their column mean. Each iteration standardizes
    the columns, computes a regularized low-rank approximation with
    ``reg_svd``, de-standardizes it and overwrites only the originally missing
    entries. Iteration stops when the relative change of the matrix falls
    below ``tol`` or after ``max_iter`` iterations.

    Parameters:
    - X: 2D numpy array with missing values as np.nan (the input is copied,
      never modified)
    - tol: Convergence tolerance
    - max_iter: Maximum number of iterations
    - m: Rank of the approximation (if None, it will be determined automatically)

    Returns:
    - X_imputed: Imputed matrix
    """
    X = np.array(X, dtype=float)  # Ensure X is a NumPy array (this also copies)

    inds = np.where(np.isnan(X))
    if inds[0].size == 0:
        # Nothing to impute: the iterations would leave X unchanged anyway.
        return X

    # Step 1: Replace missing values with column means
    col_means = np.nanmean(X, axis=0)
    X[inds] = np.take(col_means, inds[1])

    # Initialize variables for convergence check
    X_prev = np.copy(X)
    converged = False
    iter_count = 0

    while not converged and iter_count < max_iter:
        # Step 2: Standardize the columns; constant columns keep a unit scale
        # so that the division below never divides by zero.
        col_means = np.mean(X, axis=0)
        col_stds = np.std(X, axis=0, ddof=1)
        col_stds[col_stds == 0] = 1
        X_std = (X - col_means) / col_stds

        # Step 3: Perform regularized SVD
        U, D, Vt = reg_svd(X_std, m)

        # Reconstruct the matrix using rank-m approximation
        X_hat_std = np.dot(U, np.dot(np.diag(D), Vt))

        # Step 5: De-standardize the imputed values
        X_hat = X_hat_std * col_stds + col_means

        # Only update missing values
        X[inds] = X_hat[inds]

        # Check for convergence; fall back to the absolute change when the
        # previous matrix has zero norm.
        prev_norm = np.linalg.norm(X_prev)
        diff = np.linalg.norm(X - X_prev) / (prev_norm if prev_norm > 0 else 1.0)
        if diff < tol:
            converged = True
        X_prev = np.copy(X)
        iter_count += 1

    return X

def reg_svd(X_std, m=None):
    """
    Compute the regularized SVD of X_std.

    A regularized factorization is fitted for every lambda in 0.0, 0.1, ..., 1.0
    and the one with the smallest objective value is kept. The product of its
    factors is then decomposed with a plain SVD and truncated to rank m.

    Parameters:
    - X_std: Standardized matrix
    - m: Desired rank (if None, the smallest rank holding 80% of the squared
      singular values is used)

    Returns:
    - U: Left singular vectors
    - D: Singular values
    - Vt: Right singular vectors (transposed)
    """
    n_rows, n_cols = X_std.shape
    full_rank = min(n_rows, n_cols)

    # Direct search over lambda: keep the first candidate with the lowest objective.
    lowest_objective = np.inf
    best_U = None
    best_V = None
    for candidate in np.arange(0, 1.1, 0.1):
        U_cand, V_cand, objective = compute_reg_svd(X_std, full_rank, candidate)
        if objective < lowest_objective:
            lowest_objective = objective
            best_U, best_V = U_cand, V_cand

    # Plain SVD of the fitted product U * V.T
    U_final, D, Vt_final = np.linalg.svd(np.dot(best_U, best_V.T), full_matrices=False)

    # Pick the rank automatically when the caller did not fix it.
    if m is None:
        energy = np.cumsum(D**2)
        m = np.searchsorted(energy, 0.8 * energy[-1]) + 1

    # Keep only the first m components
    return U_final[:, :m], D[:m], Vt_final[:m, :]

def compute_reg_svd(X, k, lmbda, tol=1e-4, max_iter=100):
    """
    Fit a regularized rank-k factorization X ~ U V^T for a given lambda.

    V starts from uniform random values (global NumPy random state) and the
    two factors are updated alternately until the objective changes by less
    than ``tol`` between iterations or ``max_iter`` iterations have run.

    Parameters:
    - X: Standardized matrix
    - k: Desired rank
    - lmbda: Regularization parameter
    - tol: Convergence tolerance
    - max_iter: Maximum number of iterations

    Returns:
    - U: Left factor, shape (rows of X, k)
    - V: Right factor, shape (columns of X, k)
    - f: Objective function value
    """
    n_rows, n_cols = X.shape
    V = np.random.rand(n_cols, k)
    U = np.zeros((n_rows, k))
    ridge = lmbda * np.eye(k)
    previous_objective = np.inf

    for _ in range(max_iter):
        # Update U with V fixed, then V with the new U fixed.
        U = (X @ V) @ np.linalg.pinv(V.T @ V + ridge)
        V = (X.T @ U) @ np.linalg.pinv(U.T @ U + ridge)

        # Objective: squared residual plus the penalty on both factors.
        residual = X - U @ V.T
        penalty = np.linalg.norm(U, 'fro')**2 + np.linalg.norm(V, 'fro')**2
        f = np.linalg.norm(residual, 'fro')**2 + lmbda * penalty

        if abs(previous_objective - f) < tol:
            break
        previous_objective = f

    return U, V, f

import pandas as pd
import numpy as np

def df_transformation(path):
    """Read a CSV with 'Data', 'Intervalo' and 'Throughput' columns and return
    a frame with a Unix 'Timestamp' column and the 'Throughput' column.

    'Data' is parsed as day-month-year and combined with the start of the
    interval, i.e. the text before ' a ' in 'Intervalo'.
    """
    raw = pd.read_csv(path)

    dia = pd.to_datetime(raw['Data'].str.strip(), format='%d-%m-%Y')
    inicio_intervalo = raw['Intervalo'].str.split(' a ').str[0]
    instante = pd.to_datetime(dia.dt.strftime('%Y-%m-%d') + ' ' + inicio_intervalo)
    unix = instante.apply(lambda momento: int(momento.timestamp()))

    return pd.DataFrame({'Timestamp': unix, 'Throughput': raw['Throughput']})

def df_to_matrix_transformation(df):
    """Return ``df`` as a NumPy matrix with -1 in 'Throughput' turned into NaN.

    The input frame is left untouched; the replacement is made on a copy.
    """
    cleaned = df.copy()
    cleaned['Throughput'] = cleaned['Throughput'].replace(-1, np.nan)
    return cleaned.to_numpy()



In [None]:
import numpy as np
import pandas as pd

def gabriel_eigen_impute_df(df, tol=1e-4, max_iter=100, m=None):
    """
    Perform imputation on the DataFrame using the regularized GabrielEigen method.

    The numeric work is delegated to ``gabriel_eigen_impute`` so that the
    algorithm lives in a single place.

    Parameters:
    - df: DataFrame with missing values as NaNs. Must contain a 'Timestamp'
      column, which is kept out of the imputation; every other column must be
      convertible to float.
    - tol: Convergence tolerance
    - max_iter: Maximum number of iterations
    - m: Rank of the approximation (if None, it will be determined automatically)

    Returns:
    - df_imputed: New DataFrame with the imputed value columns followed by the
      original 'Timestamp' column, on a fresh 0..n-1 index
    """
    # Separate the 'Timestamp' column from the columns to impute
    timestamp_col = df['Timestamp']
    value_cols = df.drop(columns=['Timestamp'])

    X_imputed = gabriel_eigen_impute(
        value_cols.to_numpy(dtype=float), tol=tol, max_iter=max_iter, m=m
    )

    # Rebuild a DataFrame around the imputed values
    df_imputed = pd.DataFrame(X_imputed, columns=value_cols.columns)
    df_imputed['Timestamp'] = timestamp_col.reset_index(drop=True)

    return df_imputed

# Example usage
# df = pd.read_csv('path_to_your_csv.csv')
# df_imputed = gabriel_eigen_impute_df(df)
# print(df_imputed.head())


In [None]:
def apply_reg_svd_imputation(source_path, destination_path, csv_path_list):
    """Impute every CSV with the regularized SVD method and save the results.

    Each file is read, cleaned with ``outlier_removal``, imputed with
    ``gabriel_eigen_impute_df`` and written below
    ``destination_path/reg_svd/`` using its path relative to ``source_path``.
    """
    technique = 'reg_svd'

    for caminho in csv_path_list:
        cleaned = outlier_removal(pd.read_csv(caminho), 'Throughput')
        df_reg = gabriel_eigen_impute_df(cleaned.copy())

        # Create the technique folder on first use.
        technique_dir = os.path.join(destination_path, technique)
        if not os.path.exists(technique_dir):
            os.makedirs(technique_dir)
            print(f"Saving path for {technique} created.")

        # Mirror the source layout inside the technique folder.
        output_reg = os.path.join(
            destination_path, 'reg_svd', os.path.relpath(caminho, source_path)
        )
        os.makedirs(os.path.dirname(output_reg), exist_ok=True)

        df_reg.to_csv(output_reg, index=False)

        print(f"Processed file: {caminho}")


In [None]:
# source_path = '../datasets/treated longest interval with failures/'
# destination_path = '../datasets/imputed-treated-longest-interval-with-failures/'
# csv_path_list = get_dataset_path_list(source_path)

In [None]:
# apply_reg_svd_imputation(source_path, destination_path, csv_path_list)

In [7]:
def weighted_average(window):
    """Weighted mean of the non-NaN values of ``window``.

    The valid values receive the weights 1, 2, 3, ... in their order of
    appearance, so later values count more. Returns NaN when the window has
    no valid value.
    """
    values = np.array(window)
    valid = values[~np.isnan(values)]
    if valid.size == 0:
        return np.nan
    return np.average(valid, weights=np.arange(1, valid.size + 1))

def weightned_rolling_mean(df_copy, window_size=30):
    """Fill missing 'Throughput' values with a weighted trailing rolling mean.

    Each NaN is replaced by ``weighted_average`` of the window of
    ``window_size`` rows ending at that row. Whatever is still missing
    afterwards (for example leading gaps) is back-filled, in every column,
    with the next valid value.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Frame holding a 'Throughput' column. It is not modified.
    window_size : int, default 30
        Size of the trailing window.

    Returns
    -------
    pandas.DataFrame
        New frame with the filled values.
    """
    df_filled = df_copy.copy()
    throughput = df_filled['Throughput']

    # raw=True hands plain arrays to weighted_average, which converts its
    # input to an array anyway.
    weighted_rolling_mean = throughput.rolling(
        window=window_size, min_periods=1
    ).apply(weighted_average, raw=True)

    df_filled['Throughput'] = throughput.fillna(weighted_rolling_mean)

    return df_filled.bfill()

In [8]:
def apply_weightned_rolling_mean(source_path, destination_path, csv_path_list):
    """Impute every CSV with the weighted rolling mean and save the results.

    Each file is read, cleaned with ``outlier_removal``, imputed with
    ``weightned_rolling_mean`` and written below
    ``destination_path/weightned_rolling_mean/`` using its path relative to
    ``source_path``. The first 15 rows are printed before and after imputation.
    """
    technique = 'weightned_rolling_mean'

    for caminho in csv_path_list:
        raw = pd.read_csv(caminho)
        print(raw.head(15))

        cleaned = outlier_removal(raw, 'Throughput')

        df_weight = weightned_rolling_mean(cleaned.copy())
        print(df_weight.head(15))

        # Create the technique folder on first use.
        technique_dir = os.path.join(destination_path, technique)
        if not os.path.exists(technique_dir):
            os.makedirs(technique_dir)
            print(f"Saving path for {technique} created.")

        # Mirror the source layout inside the technique folder.
        output_weightned_rolling = os.path.join(
            destination_path, 'weightned_rolling_mean', os.path.relpath(caminho, source_path)
        )
        os.makedirs(os.path.dirname(output_weightned_rolling), exist_ok=True)

        df_weight.to_csv(output_weightned_rolling, index=False)

        print(f"Processed file: {caminho}")


In [11]:
# Input folder, output folder for this run, and the paths of every entry in the input folder.
source_path = '../datasets/choosen-best-svd/'
destination_path = '../datasets/imputed-choosen-best-svd/'
csv_path_list = get_dataset_path_list(source_path)

In [12]:
# Run the weighted rolling-mean imputation on every listed file (prints a preview of each file).
apply_weightned_rolling_mean(source_path, destination_path, csv_path_list)

    Unnamed: 0          Timestamp   Throughput
0            0  04-01-23 00:00:00          NaN
1            1  04-01-23 06:00:00          NaN
2            2  04-01-23 12:00:00          NaN
3            3  04-01-23 18:00:00  849059002.5
4            4  05-01-23 00:00:00  912053206.0
5            5  05-01-23 06:00:00  912420459.0
6            6  05-01-23 12:00:00  922591115.0
7            7  05-01-23 18:00:00  921072349.0
8            8  06-01-23 00:00:00  920049754.5
9            9  06-01-23 06:00:00  922798939.0
10          10  06-01-23 12:00:00  923794550.0
11          11  06-01-23 18:00:00  901094740.0
12          12  07-01-23 00:00:00  920939008.0
13          13  07-01-23 06:00:00  820589277.0
14          14  07-01-23 12:00:00  918830535.0
    Unnamed: 0          Timestamp   Throughput
0            0  04-01-23 00:00:00  849059002.5
1            1  04-01-23 06:00:00  849059002.5
2            2  04-01-23 12:00:00  849059002.5
3            3  04-01-23 18:00:00  849059002.5
4            