# Soft Impute 

In [1]:
import numpy as np
from fancyimpute import SoftImpute
import pandas as pd

### Create throughput df

In [2]:
# Carregar o dataset (ajustar o caminho para o seu arquivo)
df = pd.read_csv("institution_subnets/institution_subnets/agg_1_hour/0.csv")

# Exemplo de como o seu dataframe pode ser estruturado:
# df.columns = ['id_time', 'n_flows', 'n_packets', 'n_bytes', 'n_dest_ip', 'n_dest_asn', 'n_dest_port',
#               'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 
#               'avg_duration', 'avg_ttl']

# Calcular o throughput para uma janela de tempo de 1 hora
# Vamos supor que 'id_time' seja o identificador de cada intervalo de tempo e que os dados são agregados por hora

# Agrupar os dados por intervalo de uma hora
# Caso 'id_time' não seja um timestamp, você precisará convertê-lo em um formato temporal adequado

# Exemplo de uma simples conversão (se necessário):
# df['id_time'] = pd.to_datetime(df['id_time'], format='%Y-%m-%d %H:%M:%S')

# Calcular o throughput por hora (em bytes por segundo ou bps)
# Vamos assumir que 'n_bytes' seja o total de bytes transmitidos e que cada intervalo é de 1 hora (3600 segundos)
df['throughput'] = df['n_bytes'] / 3600 * 8  # Para throughput em bps

# Se precisar do throughput em Bps (bytes por segundo), basta remover o "* 8":
# df['throughput_bps'] = df['n_bytes'] / 3600  # Para throughput em bytes por segundo

# Criar um novo DataFrame apenas com a coluna 'throughput_bps'
df_throughput = df[['throughput']]

In [3]:
df_throughput.head()

Unnamed: 0,throughput
0,32938560.0
1,45549000.0
2,60658140.0
3,68221890.0
4,67826790.0


### Create "Temporal Matrix"

In [4]:
throughput_vector = df['throughput'].values

data_len = len(throughput_vector)
print(f"Throughput data length = {data_len}")

columns_quantity = data_len // 60 # one week 

Throughput data length = 6717


In [5]:
temporal_matrix = throughput_vector[:columns_quantity*60].reshape(columns_quantity, 60).T
original_df = pd.DataFrame(temporal_matrix)
print(original_df.shape)

(60, 111)


In [6]:
original_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,32938560.0,175803600.0,43948030.0,141328900.0,19428400.0,84270170.0,32180030.0,146111900.0,61069810.0,162076700.0,...,138553900.0,31106530.0,110181000.0,34964070.0,108539900.0,28277940.0,90885000.0,17237780.0,56613950.0,26400760.0
1,45549000.0,184474700.0,36132000.0,163800200.0,17157740.0,81248550.0,41305300.0,165289700.0,47201670.0,160040200.0,...,159858900.0,41291830.0,124561400.0,37009720.0,110409700.0,21328670.0,96832200.0,18242390.0,61940950.0,25605270.0
2,60658140.0,229111000.0,23146200.0,231823400.0,30463540.0,91623490.0,46517500.0,210591400.0,49898070.0,208837900.0,...,166396900.0,48779470.0,132210100.0,50183970.0,115745500.0,50198450.0,96736230.0,22015070.0,69261600.0,29877250.0
3,68221890.0,275880800.0,37493240.0,265462500.0,37482670.0,111142100.0,55180270.0,269933900.0,36621390.0,263736500.0,...,170964600.0,59841080.0,126421500.0,80565510.0,135479500.0,48561470.0,94922830.0,30918870.0,74576570.0,38577620.0
4,67826790.0,265472200.0,58261580.0,299079600.0,61139110.0,154996000.0,76234570.0,287345700.0,57299470.0,252523200.0,...,204209000.0,72732020.0,141916400.0,86088430.0,127159600.0,56496720.0,96685260.0,46634500.0,78719420.0,45491460.0


In [7]:
imputer = SoftImpute(max_iters=1000, shrinkage_value=0.1)

In [8]:
random_state = 42
df_missing = original_df.copy()
missing_quantity = int(0.2*data_len) # quantity of points missing 
nan_indexes = np.random.default_rng(random_state).choice(
    df_missing.size, missing_quantity, replace=False
)

nan_rows = nan_indexes // df_missing.shape[1] 
nan_cols = nan_indexes % df_missing.shape[1]

df_missing.values[nan_rows, nan_cols] = np.nan

In [9]:
nan_mask = df_missing.isna()
nan_mask.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,False,False,True,...,True,True,False,False,False,True,False,False,False,False
3,False,False,False,False,False,True,False,False,True,False,...,False,False,True,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [10]:
df_missing.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,32938560.0,175803600.0,43948030.0,141328900.0,19428400.0,84270170.0,32180030.0,146111900.0,61069810.0,162076700.0,...,138553900.0,,,34964070.0,108539900.0,28277940.0,90885000.0,,56613950.0,26400760.0
1,45549000.0,184474700.0,36132000.0,163800200.0,17157740.0,81248550.0,41305300.0,165289700.0,47201670.0,160040200.0,...,159858900.0,41291830.0,124561400.0,37009720.0,110409700.0,21328670.0,96832200.0,18242390.0,61940950.0,25605270.0
2,,229111000.0,23146200.0,231823400.0,,91623490.0,46517500.0,210591400.0,49898070.0,,...,,,132210100.0,50183970.0,115745500.0,,96736230.0,22015070.0,69261600.0,29877250.0
3,68221890.0,275880800.0,37493240.0,265462500.0,37482670.0,,55180270.0,269933900.0,,263736500.0,...,170964600.0,59841080.0,,80565510.0,,48561470.0,94922830.0,30918870.0,74576570.0,38577620.0
4,67826790.0,265472200.0,58261580.0,299079600.0,61139110.0,154996000.0,76234570.0,287345700.0,57299470.0,252523200.0,...,204209000.0,72732020.0,141916400.0,86088430.0,127159600.0,,96685260.0,46634500.0,78719420.0,45491460.0


In [11]:
X_filled_softimpute = imputer.fit_transform(df_missing)



[SoftImpute] Max Singular Value of X_init = 7524992485.601967
[SoftImpute] Iter 1: observed MAE=0.005934 rank=60
[SoftImpute] Iter 2: observed MAE=0.005934 rank=60
[SoftImpute] Iter 3: observed MAE=0.005934 rank=60
[SoftImpute] Iter 4: observed MAE=0.005934 rank=60
[SoftImpute] Iter 5: observed MAE=0.005934 rank=60
[SoftImpute] Iter 6: observed MAE=0.005934 rank=60
[SoftImpute] Iter 7: observed MAE=0.005934 rank=60
[SoftImpute] Iter 8: observed MAE=0.005934 rank=60
[SoftImpute] Iter 9: observed MAE=0.005934 rank=60
[SoftImpute] Iter 10: observed MAE=0.005934 rank=60
[SoftImpute] Iter 11: observed MAE=0.005934 rank=60
[SoftImpute] Iter 12: observed MAE=0.005934 rank=60
[SoftImpute] Iter 13: observed MAE=0.005934 rank=60
[SoftImpute] Iter 14: observed MAE=0.005934 rank=60
[SoftImpute] Iter 15: observed MAE=0.005934 rank=60
[SoftImpute] Iter 16: observed MAE=0.005934 rank=60
[SoftImpute] Iter 17: observed MAE=0.005934 rank=60
[SoftImpute] Iter 18: observed MAE=0.005934 rank=60
[SoftImpute

In [12]:
imputed_df = pd.DataFrame(X_filled_softimpute)
imputed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,32938560.0,175803600.0,43948030.0,141328900.0,19428400.0,84270170.0,32180030.0,146111900.0,61069810.0,162076700.0,...,138553900.0,7.197137,15.51993,34964070.0,108539900.0,28277940.0,90885000.0,6.478788,56613950.0,26400760.0
1,45549000.0,184474700.0,36132000.0,163800200.0,17157740.0,81248550.0,41305300.0,165289700.0,47201670.0,160040200.0,...,159858900.0,41291830.0,124561400.0,37009720.0,110409700.0,21328670.0,96832200.0,18242390.0,61940950.0,25605270.0
2,1.464097,229111000.0,23146200.0,231823400.0,6.105126,91623490.0,46517500.0,210591400.0,49898070.0,30.07496,...,23.05604,0.3454422,132210100.0,50183970.0,115745500.0,2.277677,96736230.0,22015070.0,69261600.0,29877250.0
3,68221890.0,275880800.0,37493240.0,265462500.0,37482670.0,23.95277,55180270.0,269933900.0,5.535587,263736500.0,...,170964600.0,59841080.0,16.48001,80565510.0,24.49808,48561470.0,94922830.0,30918870.0,74576570.0,38577620.0
4,67826790.0,265472200.0,58261580.0,299079600.0,61139110.0,154996000.0,76234570.0,287345700.0,57299470.0,252523200.0,...,204209000.0,72732020.0,141916400.0,86088430.0,127159600.0,6.372965,96685260.0,46634500.0,78719420.0,45491460.0


### Verify if imputation ocurred currectly

In [14]:
orig_imputed = imputed_df[~nan_mask]
orig_original = original_df[~nan_mask]
preservou_originais = orig_imputed.equals(orig_original)

if preservou_originais:
    print("✅ Todos os valores originais foram preservados nos dados imputados.")
else:
    print("❌ Os dados originais foram modificados em pelo menos um ponto!")
    alterados = (imputed_df != original_df) & (~nan_mask)
    print(alterados.sum().sum(), "pontos originais foram alterados.")

# 2. Verificar se restou algum NaN no resultado da imputação
tem_nan_apos_imputacao = imputed_df.isna().any().any()

if tem_nan_apos_imputacao:
    print("⚠️ Ainda existem valores NaN no DataFrame imputado.")
else:
    print("✅ Não existem valores NaN no DataFrame imputado.")


✅ Todos os valores originais foram preservados nos dados imputados.
✅ Não existem valores NaN no DataFrame imputado.


### Evaluate

In [13]:

rmse_imputed = np.sqrt(
    np.mean((original_df.values/1000000 - imputed_df.values/1000000) ** 2)
)

amplitude = (
    df_throughput['throughput'].values.max() - df_throughput['throughput'].values.min()
)/1000000
nrmse_imputed = rmse_imputed / amplitude if amplitude != 0 else np.nan

# Step 5: Print results
print(f"RMSE (Imputed Data): {rmse_imputed:.4f}")
print(f"NRMSE (Imputed Data): {(nrmse_imputed * 100):.4f}%")


RMSE (Imputed Data): 56.2762
NRMSE (Imputed Data): 14.8354%


### Soft Impute with grid search 

In [None]:
df = pd.read_csv("institution_subnets/institution_subnets/agg_1_hour/0.csv")

df["throughput"] = df["n_bytes"] / 3600 * 8  # Para throughput em bps

df_throughput = df[["throughput"]]
