# 4. FOR EACH INTERVAL IN TEST SET: apply correction

In [93]:
import pickle
import pandas as pd
import numpy as np
import src.sf_funcs as sf
import glob
import seaborn as sns
sns.set_theme(style="whitegrid", font_scale=1.5)

In [94]:
n_bins = 10
times_to_gap = 3
pwrl_range = [10,100]

In [95]:
# Importing lookup table
lookup_table_2d = pd.read_csv(f"data/processed/lookup_table_2d_{n_bins}bins.csv", index_col=0)
lookup_table_3d = pd.read_csv(f"data/processed/lookup_table_3d_{n_bins}bins.csv", index_col=0)


Just importing one file at a time!


In [112]:
# Importing processed time series and structure functions
input_file_list = [sorted(glob.glob("data/processed/wind/wi_*05.pkl"))][0]
#input_file_list = [sorted(glob.glob("data/processed/psp/test/psp_*v02.pkl"))][0]

file_index_test = 1 # this simply refers to one of the files in the test files, not the "file_index" variable referring to the original raw file 

# Just importing one file at a time!
files_metadata, ints_metadata, ints, ints_gapped_metadata, ints_gapped, sfs, sfs_gapped = sf.load_and_concatenate_dataframes(
    [input_file_list[file_index_test]])

In [113]:
sfs_gapped_corrected.gap_handling.unique()

array(['naive', 'lint', 'corrected_2d', 'corrected_3d'], dtype=object)

In [114]:
# Apply 2D and 3D scaling to test set, report avg errors
print(f"Correcting interpolated test set intervals using 2D error heatmap with {n_bins} bins")
sfs_lint_corrected_2d = sf.compute_scaling(
    sfs_gapped[sfs_gapped["gap_handling"]=="lint"], "missing_percent", lookup_table_2d
)

Correcting interpolated test set intervals using 2D error heatmap with 10 bins


In [115]:
print(f"Correcting interpolated test set intervals using 3D error heatmap with {n_bins} bins")
sfs_lint_corrected_2d_3d = sf.compute_scaling_3d(
    sfs_lint_corrected_2d[sfs_lint_corrected_2d["gap_handling"]=="lint"], "missing_percent", lookup_table_3d
)

Correcting interpolated test set intervals using 3D error heatmap with 10 bins


In [116]:
correction_wide =  sfs_lint_corrected_2d_3d[["file_index", "int_index", "version", "lag", "missing_percent", "sf_2_corrected_2d", "sf_2_corrected_3d"]]
correction_long = pd.wide_to_long(correction_wide, ["sf_2"], i=["file_index", "int_index", "version", "lag", "missing_percent"], j="gap_handling", sep="_", suffix=r"\w+")
correction_bounds_wide =  sfs_lint_corrected_2d_3d[["file_index", "int_index", "version", "lag", "missing_percent", "sf_2_lower_corrected_2d", "sf_2_lower_corrected_3d", "sf_2_upper_corrected_2d", "sf_2_upper_corrected_3d"]]
correction_bounds_long = pd.wide_to_long(correction_bounds_wide, ["sf_2_lower", "sf_2_upper"], i=["file_index", "int_index", "version", "lag", "missing_percent"], j="gap_handling", sep="_", suffix=r"\w+")

corrections_long = pd.merge(correction_long, correction_bounds_long, how="inner", on=["file_index", "int_index", "version", "lag", "missing_percent", "gap_handling"]).reset_index()

In [117]:
# Adding the corrections, now as a form of "gap_handling", back to the gapped SF dataframe
sfs_gapped_corrected = pd.concat([sfs_gapped, corrections_long])

# Merging the original SFs with the corrected ones to then calculate errors
sfs_gapped_corrected = pd.merge(sfs, sfs_gapped_corrected, how="inner", on=["file_index", "int_index", "lag"], suffixes=("_orig", ""))

In [118]:
sfs_gapped_corrected

Unnamed: 0,int_index,file_index,lag,n_orig,missing_percent_orig,sf_2_orig,sf_2_se_orig,n,missing_percent,sf_2,sf_2_se,version,gap_handling,sf_2_lower,sf_2_upper
0,0,4,1.0,9999,0.0,0.019208,0.001052,4672.0,53.275328,0.019315,0.001630,0,naive,,
1,0,4,1.0,9999,0.0,0.019208,0.001052,4672.0,53.275328,0.010518,0.001630,0,lint,,
2,0,4,1.0,9999,0.0,0.019208,0.001052,4276.0,57.235724,0.020219,0.001660,1,naive,,
3,0,4,1.0,9999,0.0,0.019208,0.001052,4276.0,57.235724,0.008825,0.001660,1,lint,,
4,0,4,1.0,9999,0.0,0.019208,0.001052,8238.0,17.611761,0.019699,0.001224,2,naive,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11983,0,4,999.0,9001,0.0,1.955424,0.029148,,60.326630,2.575671,,0,corrected_3d,1.653954,5.817839
11984,0,4,999.0,9001,0.0,1.955424,0.029148,,80.446617,1.050516,,1,corrected_2d,0.758517,11.022705
11985,0,4,999.0,9001,0.0,1.955424,0.029148,,80.446617,1.964575,,1,corrected_3d,1.849105,2.095428
11986,0,4,999.0,9001,0.0,1.955424,0.029148,,31.729808,2.154849,,2,corrected_2d,1.952143,2.323564


#### Calculate lag-scale errors (sf_2_pe)
This is the first time we calculate these errors, for this specific dataset (they were calculated before for the training set)

Previously this didn't work as we had two sf_2_orig columns as the result of merging a dataframe that had already previously been merged. However, this initial merge is no longer taking place, as it is only now that we are calculating any errors *of any sort, including lag-specific ones*, for this particular dataset.

In [119]:
sfs_gapped_corrected["sf_2_pe"] = (sfs_gapped_corrected["sf_2"] - sfs_gapped_corrected["sf_2_orig"]) / sfs_gapped_corrected["sf_2_orig"] * 100

### Calculate interval-scale errors
This is the first time we do this. We do not need these values for the training set, because we only use that for calculating the correction factor, which uses lag-scale errors..

In [120]:
# Adding rows as placeholders for when we correct with 2D and 3D heatmaps and want to calculate errors

dup_df = ints_gapped_metadata.replace(["naive","lint"], ["corrected_2d", "corrected_3d"])
ints_gapped_metadata = pd.concat([ints_gapped_metadata, dup_df])

In [121]:
for i in files_metadata.file_index.unique():
    for j in range(len(ints_metadata["file_index"]==i)):
        for k in range(times_to_gap):
            for gap_handling in sfs_gapped_corrected.gap_handling.unique():
            # Calculate MAPE for 2D and 3D corrected SFs
                
                ints_gapped_metadata.loc[
                    (ints_gapped_metadata["file_index"]==i) &
                    (ints_gapped_metadata["int_index"]==j) & 
                    (ints_gapped_metadata["version"]==k) & 
                    (ints_gapped_metadata["gap_handling"]==gap_handling), 
                    "mape"] = np.mean(
                    np.abs(
                        sfs_gapped_corrected.loc[
                            (sfs_gapped_corrected["file_index"]==i) &
                            (sfs_gapped_corrected["int_index"]==j) & 
                            (sfs_gapped_corrected["version"]==k) & 
                            (sfs_gapped_corrected["gap_handling"]==gap_handling), 
                            "sf_2_pe"]))

                ints_gapped_metadata.loc[
                    (ints_gapped_metadata["file_index"]==i) &
                    (ints_gapped_metadata["int_index"]==j) & 
                    (ints_gapped_metadata["version"]==k) & 
                    (ints_gapped_metadata["gap_handling"]==gap_handling), 
                    "mpe"] = np.mean(
                        sfs_gapped_corrected.loc[
                            (sfs_gapped_corrected["file_index"]==i) &
                            (sfs_gapped_corrected["int_index"]==j) & 
                            (sfs_gapped_corrected["version"]==k) & 
                            (sfs_gapped_corrected["gap_handling"]==gap_handling), 
                            "sf_2_pe"])
                
                # Calculate power-law slope for 2D and 3D corrected SFs
                current_int = sfs_gapped_corrected.loc[
                    (sfs_gapped_corrected["file_index"]==i) &
                    (sfs_gapped_corrected["int_index"]==j) & 
                    (sfs_gapped_corrected["version"]==k) & 
                    (sfs_gapped_corrected["gap_handling"]==gap_handling)]

                # Fit a line to the log-log plot of the structure function over the given range

                slope = np.polyfit(
                    np.log(current_int.loc[(current_int["lag"] >= pwrl_range[0]) & (current_int["lag"] <= pwrl_range[1]), "lag"]),
                    np.log(current_int.loc[(current_int["lag"] >= pwrl_range[0]) & (current_int["lag"] <= pwrl_range[1]), "sf_2"]),
                    1,
                )[0]

                ints_gapped_metadata.loc[
                    (ints_gapped_metadata["file_index"]==i) &
                    (ints_gapped_metadata["int_index"]==j) & 
                    (ints_gapped_metadata["version"]==k) & 
                    (ints_gapped_metadata["gap_handling"]==gap_handling), 
                    "slope"] = slope

In [122]:
slope = np.polyfit(
    np.log(current_int.loc[(current_int["lag"] >= pwrl_range[0]) & (current_int["lag"] <= pwrl_range[1]), "lag"]),
    np.log(current_int.loc[(current_int["lag"] >= pwrl_range[0]) & (current_int["lag"] <= pwrl_range[1]), "sf_2"]),
    1,
)[0]

In [123]:
# Calculate slope errors
ints_gapped_metadata = pd.merge(ints_gapped_metadata, ints_metadata.drop(["int_start", "int_end"], axis=1), how="inner", on=["file_index", "int_index"], suffixes=("", "_orig"))

In [124]:
# maybe come back to this method of getting true slopes, could be fun

# # Create a dictionary from df2 with composite keys
# value2_dict = df2.set_index(['key1', 'key2'])['value2'].to_dict()

# # Create a composite key in df1 and map the values
# df1['composite_key'] = list(zip(df1['key1'], df1['key2']))
# df1['value2'] = df1['composite_key'].map(value2_dict)

In [125]:
ints_gapped_metadata["slope_pe"] = (ints_gapped_metadata["slope"] - ints_gapped_metadata["slope_orig"]) / ints_gapped_metadata["slope_orig"] * 100
ints_gapped_metadata["slope_ape"] = np.abs(ints_gapped_metadata["slope_pe"])

In [126]:
# Export the dataframes in one big pickle file
output_file_path = input_file_list[file_index_test].replace(".pkl", "_corrected.pkl")

with open(output_file_path, "wb") as f:
    pickle.dump(
        {
            "files_metadata": files_metadata,
            "ints_metadata": ints_metadata,
            "ints": ints,
            "ints_gapped_metadata": ints_gapped_metadata,
            "ints_gapped": ints_gapped,
            "sfs": sfs,
            "sfs_gapped": sfs_gapped_corrected,
        },
        f,
    )

In [127]:
sfs_gapped_corrected.gap_handling.unique()

array(['naive', 'lint', 'corrected_2d', 'corrected_3d'], dtype=object)