In [1]:
from pathlib import Path

import numpy as np
import polars as pl
from tqdm.auto import tqdm

In [2]:
# Root folder holding the competition CSVs and the raw_train* parquet files.
data_dir = "../../data/leap"
# Row budget: calculate_statistics stops reading further files once more than
# this many rows have been loaded for a column chunk.
N_ROWS = 500_000_000
# Path.glob yields entries in arbitrary (filesystem) order; sort so the file
# order, and therefore which files fall inside the row budget, is reproducible.
files = sorted(Path(data_dir).glob("raw_train*"))
print(f"# of train files: {len(files)}")

# of train files: 100


In [3]:
# Only the CSV headers are needed here, so a single data row is read from each file.
weight_df = pl.read_csv(Path(data_dir) / "sample_submission.csv", n_rows=1)
tmp_df = pl.read_csv(Path(data_dir) / "train.csv", n_rows=1)

# The leading column of each file is skipped (presumably the sample id — confirm).
# Labels are the submission columns; features are the remaining train.csv columns.
label_cols = weight_df.columns[1:]
feat_cols = [name for name in tmp_df.columns if name not in label_cols][1:]
len(feat_cols), len(label_cols)

(556, 368)

In [4]:
def calculate_statistics(cols, files, chunk_size=50, max_rows=None):
    """Compute per-column summary statistics over a set of parquet files.

    Columns are processed ``chunk_size`` at a time: for each chunk, only those
    columns are read from ``files`` and concatenated, so the full table never
    has to be held in memory at once.

    Args:
        cols: Names of the columns to summarise.
        files: Parquet files to read, in the order given.
        chunk_size: Number of columns loaded per pass over ``files``.
        max_rows: Row budget per chunk. Reading stops after the first file
            that pushes the running row count above this value (so the total
            may overshoot by up to one file). Defaults to the module-level
            ``N_ROWS`` when omitted.

    Returns:
        A polars DataFrame with a ``stats`` column naming each statistic
        (``mean``, ``std``, ``q1_4``, ``q2_4``, ``q3_4``) followed by one
        column per entry of ``cols``.

    Raises:
        ValueError: If ``cols`` or ``files`` is empty, or ``chunk_size`` < 1.
    """
    cols = list(cols)
    files = list(files)
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not cols:
        raise ValueError("cols is empty: nothing to compute statistics for")
    if not files:
        raise ValueError("files is empty: no parquet files to read")
    if max_rows is None:
        # Fall back to the notebook-wide row budget.
        max_rows = N_ROWS

    stats = []
    for start in tqdm(range(0, len(cols), chunk_size)):
        sub_cols = cols[start:start + chunk_size]
        dfs = []
        num_data = 0
        for filename in files:
            dfs.append(pl.read_parquet(filename, columns=sub_cols))
            num_data += len(dfs[-1])
            if num_data > max_rows:
                break
        df = pl.concat(dfs)
        for col in sub_cols:
            series = df[col]
            stats.append({
                "mean": series.mean(),
                "std": series.std(),
                "q1_4": series.quantile(0.25),
                "q2_4": series.quantile(0.5),
                "q3_4": series.quantile(0.75),
            })
    # `stats` holds one row per column; transpose so each statistic becomes a
    # row and each input column becomes a column.
    stats_df = (
        pl.from_dicts(stats)
        .transpose(include_header=True, header_name="stats", column_names=cols)
    )
    return stats_df


# Summarise the input columns first, then the target columns, over the same files.
feat_stats_df, label_stats_df = (
    calculate_statistics(column_group, files)
    for column_group in (feat_cols, label_cols)
)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [5]:
# Shape check: one row per statistic, and one column per summarised column
# plus the leading "stats" label column.
feat_stats_df.shape, label_stats_df.shape

((5, 557), (5, 369))

In [6]:
# Display the feature-statistics table (one row per statistic).
feat_stats_df

stats,state_t_0,state_t_1,state_t_2,state_t_3,state_t_4,state_t_5,state_t_6,state_t_7,state_t_8,state_t_9,state_t_10,state_t_11,state_t_12,state_t_13,state_t_14,state_t_15,state_t_16,state_t_17,state_t_18,state_t_19,state_t_20,state_t_21,state_t_22,state_t_23,state_t_24,state_t_25,state_t_26,state_t_27,state_t_28,state_t_29,state_t_30,state_t_31,state_t_32,state_t_33,state_t_34,state_t_35,…,pbuf_N2O_23,pbuf_N2O_24,pbuf_N2O_25,pbuf_N2O_26,pbuf_N2O_27,pbuf_N2O_28,pbuf_N2O_29,pbuf_N2O_30,pbuf_N2O_31,pbuf_N2O_32,pbuf_N2O_33,pbuf_N2O_34,pbuf_N2O_35,pbuf_N2O_36,pbuf_N2O_37,pbuf_N2O_38,pbuf_N2O_39,pbuf_N2O_40,pbuf_N2O_41,pbuf_N2O_42,pbuf_N2O_43,pbuf_N2O_44,pbuf_N2O_45,pbuf_N2O_46,pbuf_N2O_47,pbuf_N2O_48,pbuf_N2O_49,pbuf_N2O_50,pbuf_N2O_51,pbuf_N2O_52,pbuf_N2O_53,pbuf_N2O_54,pbuf_N2O_55,pbuf_N2O_56,pbuf_N2O_57,pbuf_N2O_58,pbuf_N2O_59
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""mean""",215.612015,227.878262,237.309279,247.912285,256.227916,259.451296,255.271369,246.698109,236.980836,230.274286,225.054863,220.928353,217.108561,213.894105,210.537094,207.052169,202.847109,200.041338,199.271441,201.226921,203.712135,206.900332,210.434994,214.236472,218.053071,221.900889,225.667783,229.347179,232.912757,236.379712,239.742185,243.019442,246.204743,249.292946,252.269269,255.123318,…,4.8292e-07,4.8788e-07,4.9026e-07,4.9079e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
"""std""",6.654266,8.653689,8.248617,6.786184,6.242818,8.249606,10.157319,10.112921,9.218096,8.81164,8.137254,7.43026,6.723265,6.321632,6.513009,7.6074,9.966565,11.792149,10.03546,8.244552,5.475349,4.073948,4.320332,5.478911,6.961327,8.309486,9.521203,10.517859,11.344936,12.00057,12.514359,12.893314,13.160995,13.32794,13.405421,13.407873,…,2.0731e-08,1.0981e-08,3.9816e-09,9.4472e-10,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22
"""q1_4""",211.795845,222.243162,231.759324,243.668977,252.799997,256.235228,251.635947,243.449935,234.437349,228.097416,223.197921,219.205218,215.187527,211.551624,207.162932,201.871982,194.857171,189.825833,190.678675,194.104774,199.497332,204.247093,208.499765,211.28707,213.509142,215.931538,218.53578,221.356252,224.305857,227.357786,230.46927,233.648802,236.847711,240.04982,243.221606,246.322576,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
"""q2_4""",215.558519,228.043463,236.547149,247.060054,255.854526,260.29541,257.143055,248.743841,239.033878,232.473792,227.008668,222.471784,217.942912,213.953813,209.621341,205.616373,201.811222,199.379132,198.640697,200.417245,203.053164,206.646559,210.628659,215.049439,219.470938,223.699178,227.787865,231.76676,235.647794,239.425994,243.096914,246.65641,250.090561,253.374147,256.474984,259.427914,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
"""q3_4""",219.287002,233.795692,242.07833,251.166304,259.263806,263.889139,261.154271,252.387389,242.001918,235.063319,229.312532,224.574035,220.025695,216.310257,213.453864,211.56516,210.132128,209.441833,206.956568,207.596944,207.551485,209.507191,212.932085,217.680374,222.951198,228.15249,233.138784,237.828987,242.221519,246.335236,250.199286,253.830744,257.231715,260.441533,263.444719,266.252868,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07


In [7]:
# Preview the first rows of the feature statistics.
feat_stats_df.head()

stats,state_t_0,state_t_1,state_t_2,state_t_3,state_t_4,state_t_5,state_t_6,state_t_7,state_t_8,state_t_9,state_t_10,state_t_11,state_t_12,state_t_13,state_t_14,state_t_15,state_t_16,state_t_17,state_t_18,state_t_19,state_t_20,state_t_21,state_t_22,state_t_23,state_t_24,state_t_25,state_t_26,state_t_27,state_t_28,state_t_29,state_t_30,state_t_31,state_t_32,state_t_33,state_t_34,state_t_35,…,pbuf_N2O_23,pbuf_N2O_24,pbuf_N2O_25,pbuf_N2O_26,pbuf_N2O_27,pbuf_N2O_28,pbuf_N2O_29,pbuf_N2O_30,pbuf_N2O_31,pbuf_N2O_32,pbuf_N2O_33,pbuf_N2O_34,pbuf_N2O_35,pbuf_N2O_36,pbuf_N2O_37,pbuf_N2O_38,pbuf_N2O_39,pbuf_N2O_40,pbuf_N2O_41,pbuf_N2O_42,pbuf_N2O_43,pbuf_N2O_44,pbuf_N2O_45,pbuf_N2O_46,pbuf_N2O_47,pbuf_N2O_48,pbuf_N2O_49,pbuf_N2O_50,pbuf_N2O_51,pbuf_N2O_52,pbuf_N2O_53,pbuf_N2O_54,pbuf_N2O_55,pbuf_N2O_56,pbuf_N2O_57,pbuf_N2O_58,pbuf_N2O_59
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""mean""",215.612015,227.878262,237.309279,247.912285,256.227916,259.451296,255.271369,246.698109,236.980836,230.274286,225.054863,220.928353,217.108561,213.894105,210.537094,207.052169,202.847109,200.041338,199.271441,201.226921,203.712135,206.900332,210.434994,214.236472,218.053071,221.900889,225.667783,229.347179,232.912757,236.379712,239.742185,243.019442,246.204743,249.292946,252.269269,255.123318,…,4.8292e-07,4.8788e-07,4.9026e-07,4.9079e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
"""std""",6.654266,8.653689,8.248617,6.786184,6.242818,8.249606,10.157319,10.112921,9.218096,8.81164,8.137254,7.43026,6.723265,6.321632,6.513009,7.6074,9.966565,11.792149,10.03546,8.244552,5.475349,4.073948,4.320332,5.478911,6.961327,8.309486,9.521203,10.517859,11.344936,12.00057,12.514359,12.893314,13.160995,13.32794,13.405421,13.407873,…,2.0731e-08,1.0981e-08,3.9816e-09,9.4472e-10,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22,2.1175999999999997e-22
"""q1_4""",211.795845,222.243162,231.759324,243.668977,252.799997,256.235228,251.635947,243.449935,234.437349,228.097416,223.197921,219.205218,215.187527,211.551624,207.162932,201.871982,194.857171,189.825833,190.678675,194.104774,199.497332,204.247093,208.499765,211.28707,213.509142,215.931538,218.53578,221.356252,224.305857,227.357786,230.46927,233.648802,236.847711,240.04982,243.221606,246.322576,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
"""q2_4""",215.558519,228.043463,236.547149,247.060054,255.854526,260.29541,257.143055,248.743841,239.033878,232.473792,227.008668,222.471784,217.942912,213.953813,209.621341,205.616373,201.811222,199.379132,198.640697,200.417245,203.053164,206.646559,210.628659,215.049439,219.470938,223.699178,227.787865,231.76676,235.647794,239.425994,243.096914,246.65641,250.090561,253.374147,256.474984,259.427914,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
"""q3_4""",219.287002,233.795692,242.07833,251.166304,259.263806,263.889139,261.154271,252.387389,242.001918,235.063319,229.312532,224.574035,220.025695,216.310257,213.453864,211.56516,210.132128,209.441833,206.956568,207.596944,207.551485,209.507191,212.932085,217.680374,222.951198,228.15249,233.138784,237.828987,242.221519,246.335236,250.199286,253.830744,257.231715,260.441533,263.444719,266.252868,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07


In [8]:
# Preview the first rows of the label statistics.
label_stats_df.head()

stats,ptend_t_0,ptend_t_1,ptend_t_2,ptend_t_3,ptend_t_4,ptend_t_5,ptend_t_6,ptend_t_7,ptend_t_8,ptend_t_9,ptend_t_10,ptend_t_11,ptend_t_12,ptend_t_13,ptend_t_14,ptend_t_15,ptend_t_16,ptend_t_17,ptend_t_18,ptend_t_19,ptend_t_20,ptend_t_21,ptend_t_22,ptend_t_23,ptend_t_24,ptend_t_25,ptend_t_26,ptend_t_27,ptend_t_28,ptend_t_29,ptend_t_30,ptend_t_31,ptend_t_32,ptend_t_33,ptend_t_34,ptend_t_35,…,ptend_v_31,ptend_v_32,ptend_v_33,ptend_v_34,ptend_v_35,ptend_v_36,ptend_v_37,ptend_v_38,ptend_v_39,ptend_v_40,ptend_v_41,ptend_v_42,ptend_v_43,ptend_v_44,ptend_v_45,ptend_v_46,ptend_v_47,ptend_v_48,ptend_v_49,ptend_v_50,ptend_v_51,ptend_v_52,ptend_v_53,ptend_v_54,ptend_v_55,ptend_v_56,ptend_v_57,ptend_v_58,ptend_v_59,cam_out_NETSW,cam_out_FLWDS,cam_out_PRECSC,cam_out_PRECC,cam_out_SOLS,cam_out_SOLL,cam_out_SOLSD,cam_out_SOLLD
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""mean""",1e-05,-7e-06,-3e-06,-3e-06,-4e-06,-2e-06,-3.1367e-07,3.9781e-07,1.838e-07,-1.6051e-08,1.3832e-07,1.8617e-07,2.9528e-07,3.7515e-07,5.674e-07,9.3168e-07,2e-06,2e-06,7.6125e-07,-3.897e-07,-3.5255e-07,-2e-06,-4e-06,-6e-06,-7e-06,-7e-06,-6e-06,-5e-06,-4e-06,-3e-06,-2e-06,-9.0574e-07,-2.7731e-07,1.9049e-07,4.8513e-07,6.5106e-07,…,-2.868e-09,2.0688e-08,3.4469e-08,3.1557e-08,7.1815e-09,-6.3038e-08,-1.5727e-07,-2.4531e-07,-3.0017e-07,-3.0094e-07,-2.2172e-07,-7.6118e-08,1.1783e-07,3.2319e-07,4.9466e-07,6.1731e-07,6.3501e-07,5.3735e-07,3.3996e-07,8.3744e-08,-2.3336e-07,-6.8066e-07,-1e-06,-1e-06,-1e-06,-5.333e-07,6.2734e-07,1e-06,6.5831e-07,158.311244,351.273302,2.7343e-09,2.9194e-08,61.803931,67.324685,33.451249,17.676368
"""std""",3.2e-05,4.4e-05,5.3e-05,6.9e-05,9.1e-05,0.00011,0.000103,7.9e-05,5e-05,3.9e-05,2.9e-05,2.3e-05,1.7e-05,1.3e-05,9e-06,7e-06,7e-06,8e-06,1.1e-05,1.4e-05,1.5e-05,1.6e-05,1.8e-05,2e-05,2.3e-05,2.7e-05,3.2e-05,3.7e-05,4.3e-05,4.9e-05,5.4e-05,5.8e-05,6.1e-05,6.3e-05,6.4e-05,6.5e-05,…,8e-06,8e-06,7e-06,7e-06,7e-06,7e-06,8e-06,9e-06,9e-06,1e-05,1.1e-05,1.1e-05,1.2e-05,1.3e-05,1.4e-05,1.5e-05,1.6e-05,1.7e-05,1.8e-05,2e-05,2.2e-05,2.5e-05,2.8e-05,2.9e-05,3e-05,2.9e-05,2.8e-05,2.4e-05,3.4e-05,246.661342,71.979277,7.39e-09,8.1856e-08,110.255081,116.424189,46.397329,29.70672
"""q1_4""",-1.3e-05,-3.9e-05,-4.3e-05,-6e-05,-8.5e-05,-9.9e-05,-8.8e-05,-6.7e-05,-4.4e-05,-3.4e-05,-2.6e-05,-1.9e-05,-1.4e-05,-1e-05,-7e-06,-5e-06,-3e-06,-3e-06,-3e-06,-5e-06,-5e-06,-6e-06,-9e-06,-1.2e-05,-1.5e-05,-1.7e-05,-1.8e-05,-1.9e-05,-1.9e-05,-2e-05,-2.1e-05,-2.2e-05,-2.3e-05,-2.3e-05,-2.4e-05,-2.5e-05,…,-5.5398e-07,-5.2138e-07,-4.9227e-07,-4.6982e-07,-4.4817e-07,-4.3488e-07,-4.3559e-07,-4.4466e-07,-4.6692e-07,-5.0584e-07,-5.6217e-07,-6.3387e-07,-7.3155e-07,-8.551e-07,-1e-06,-1e-06,-1e-06,-2e-06,-2e-06,-3e-06,-3e-06,-4e-06,-5e-06,-6e-06,-7e-06,-8e-06,-7e-06,-6e-06,-9e-06,0.0,311.199335,0.0,0.0,0.0,0.0,0.0,0.0
"""q2_4""",8e-06,-9e-06,-1.5e-05,-3.5e-05,-6e-05,-6.3e-05,-4.8e-05,-3.3e-05,-2e-05,-1.4e-05,-1.1e-05,-9e-06,-7e-06,-4e-06,-2e-06,4.9865e-07,2e-06,2e-06,2e-06,-6.1262e-07,-1e-06,-3e-06,-4e-06,-6e-06,-7e-06,-8e-06,-9e-06,-9e-06,-9e-06,-1e-05,-1.1e-05,-1.1e-05,-1.2e-05,-1.2e-05,-1.2e-05,-1.3e-05,…,1.6653e-18,4.4409e-18,5.181e-18,4.811e-18,2.9606e-18,1.1102e-18,-2.313e-18,-8.1416e-18,-1.2212e-17,-1.1842e-17,-3.7007e-18,3.7007e-19,5.9212e-18,5.9614e-16,1.5969e-14,2.1081e-13,4.4083e-13,1.6354e-13,8.4976e-14,4.3469e-15,5.4956e-17,-1.3878e-17,-3.6602e-15,-6.1598e-12,-5.9597e-10,-1.2688e-08,-6.2603e-10,5.262e-11,2.409e-07,0.016224,363.88266,0.0,3.4721e-09,4.9415e-18,1.3405e-15,0.890961,0.735684
"""q3_4""",3.6e-05,2.8e-05,4.7e-05,6.8e-05,9.6e-05,0.000115,9.4e-05,6.7e-05,4.3e-05,3.3e-05,2.5e-05,2e-05,1.6e-05,1.2e-05,9e-06,7e-06,7e-06,6e-06,5e-06,3e-06,3e-06,6.8904e-07,-5.8928e-07,-2e-06,-2e-06,-2e-06,-3e-06,-3e-06,-3e-06,-3e-06,-3e-06,-3e-06,-3e-06,-2e-06,-2e-06,-1e-06,…,5.2116e-07,5.0744e-07,4.8461e-07,4.5758e-07,4.3468e-07,4.1226e-07,3.985e-07,4.0108e-07,4.2253e-07,4.6338e-07,5.3128e-07,6.346e-07,7.7652e-07,9.7377e-07,1e-06,2e-06,2e-06,2e-06,2e-06,3e-06,3e-06,4e-06,4e-06,5e-06,6e-06,7e-06,7e-06,8e-06,1.2e-05,247.601915,409.656305,3.3229e-10,1.9853e-08,77.337254,91.911309,57.72635,24.646142


In [9]:
# Save both statistics tables under data_dir instead of repeating the
# hard-coded folder, so the location is configured in one place.
feat_stats_df.write_parquet(Path(data_dir, "feat_stats.parquet"))
label_stats_df.write_parquet(Path(data_dir, "label_stats.parquet"))

In [11]:
# feat_stats_df = pl.read_parquet("../../data/leap/feat_stats.parquet")
# label_stats_df = pl.read_parquet("../../data/leap/label_stats.parquet")

In [12]:
# import sys
# sys.path.append("..")
# from leap.utils import IN_SCALAR_COLUMNS, IN_VECTOR_COLUMNS, OUT_SCALAR_COLUMNS, OUT_VECTOR_COLUMNS

In [13]:
# for col in IN_VECTOR_COLUMNS:
#     feat_stats_df = feat_stats_df.with_columns(pl.concat_list(f"^{col}_\d+$").alias(col))
# feat_stats_df = feat_stats_df.select(["stats"] + IN_SCALAR_COLUMNS + IN_VECTOR_COLUMNS)
# for col in OUT_VECTOR_COLUMNS:
#     label_stats_df = label_stats_df.with_columns(pl.concat_list(f"^{col}_\d+$").alias(col))
# label_stats_df = label_stats_df.select(["stats"] + OUT_SCALAR_COLUMNS + OUT_VECTOR_COLUMNS)

In [14]:
# feat_stats_df

In [15]:
# label_stats_df

In [21]:
# import sys
# sys.path.append("..")
# import re
# from pathlib import Path

# import polars as pl

# from leap.utils import IN_SCALAR_COLUMNS, IN_VECTOR_COLUMNS, OUT_SCALAR_COLUMNS, OUT_VECTOR_COLUMNS

In [22]:
# COLUMNS = pl.read_csv("../../data/leap/train.csv", n_rows=1).columns[1:]
# len(COLUMNS)

In [23]:
# data_dir = "../ClimSim/dataset_statistics"
# in_scalar_files = sorted(list(Path(data_dir, "input2D").glob("*.txt")))
# in_vector_files = sorted(list(Path(data_dir, "input3D").glob("*.txt")))
# out_scalar_files = sorted(list(Path(data_dir, "output2D").glob("*.txt")))
# out_vector_files = sorted(list(Path(data_dir, "output3D").glob("*.txt")))
# len(in_scalar_files), len(in_vector_files), len(out_scalar_files), len(out_vector_files)

In [24]:
# in_scalar_files
# in_vector_files

In [25]:
# df["ptend_q0002_30"]

In [26]:
# (-df["state_q0002_30"] / 1200 - df["ptend_q0002_30"]).abs().describe()

In [27]:
# def get_col_name(filename):
#     filename = str(filename.stem)
#     filename = filename.replace("mli_", "")
#     filename = filename.replace("mlo_", "")
#     filename = filename.replace("lev", "")
#     no = re.findall(r"_\d{2}", filename)
#     assert len(no) < 2
#     if len(no) == 1:
#         no_str = no[0][1:]
#         no_int = int(no_str)
#         filename = filename.replace(f"_{no_str}", f"_{no_int-1}")
#     return filename
    
# def load_stats(files):
#     statistics = []
#     for filename in files:
#         col = get_col_name(filename)
#         try:
#             assert col in COLUMNS, col
#             f = open(filename, "r")
#             std = float(next(f).strip().split(":")[1])
#             statistics.append({
#                 "col": col,
#                 "std": std,
#             })
#             # print(type(std))
#         except:
#             print(filename.stem, col)
#     return pl.from_dicts(statistics)


# in_scalar_stats = load_stats(in_scalar_files)
# in_vector_stats = load_stats(in_vector_files)
# out_scalar_stats = load_stats(out_scalar_files)
# out_vector_stats = load_stats(out_vector_files)

In [28]:
# len(IN_SCALAR_COLUMNS), len(in_scalar_stats)

In [29]:
# len(IN_VECTOR_COLUMNS), len(in_vector_stats)

In [30]:
# len(OUT_SCALAR_COLUMNS), len(out_scalar_stats)

In [31]:
# OUT_SCALAR_COLUMNS

In [32]:
# out_scalar_stats["col"].to_list()

In [33]:
# len(OUT_VECTOR_COLUMNS), len(out_vector_stats)

In [34]:
# out_vector_stats["col"].unique().to_list()

In [35]:
# out_vector_stats.filter(pl.col(

In [36]:
# pl.read_parquet(Path(data_dir, "x_mean.parquet"))["state_t_8"]

In [37]:
# pl.read_parquet(Path(data_dir, "x_std.parquet"))["state_t_8"]

In [41]:
# %%time
# files = list(Path(data_dir).glob("processed_train*"))
# print(f"# of train files: {len(files)}")
# np.random.seed(seed)
# np.random.shuffle(files)
# dfs = []
# num_data = 0
# for filename in tqdm(files):
#     print(filename)
#     dfs.append(pl.read_parquet(filename, columns=feat_cols if target=="input" else label_cols))
#     num_data += len(dfs[-1])
#     if num_data > N_ROWS:
#         break
# df = pl.concat(dfs)
# df.shape

In [42]:
# %%time
# test_df = pl.read_parquet(Path(data_dir, "processed_test.parquet"), columns=feat_cols)
# test_df = pl.read_csv(Path(data_dir, "test.csv"), columns=feat_cols)
# test_df.shape

In [43]:
# df = pl.concat([train_df, test_df])

In [44]:
# df.shape

In [8]:
# NOTE(review): `target` is not assigned by any uncommented cell above, and
# `df` is only assigned further down — this cell appears to rely on kernel
# state from an earlier session; confirm before running on a fresh kernel.
mean_df = df.mean()

# Inputs use the standard deviation; any other target uses sqrt(mean(x^2)),
# i.e. the root-mean-square about zero rather than about the mean.
std_df = (
    df.std()
    if target == "input"
    else (df * df).mean().with_columns(pl.all().sqrt())
)

# Despite the names, q2/q3/q4 hold the 0.25, 0.5 and 0.75 quantiles.
q2_df, q3_df, q4_df = (df.quantile(level) for level in (0.25, 0.5, 0.75))

In [9]:
# Inspect the scale frame computed in the previous cell.
std_df

ptend_t_0,ptend_t_1,ptend_t_2,ptend_t_3,ptend_t_4,ptend_t_5,ptend_t_6,ptend_t_7,ptend_t_8,ptend_t_9,ptend_t_10,ptend_t_11,ptend_t_12,ptend_t_13,ptend_t_14,ptend_t_15,ptend_t_16,ptend_t_17,ptend_t_18,ptend_t_19,ptend_t_20,ptend_t_21,ptend_t_22,ptend_t_23,ptend_t_24,ptend_t_25,ptend_t_26,ptend_t_27,ptend_t_28,ptend_t_29,ptend_t_30,ptend_t_31,ptend_t_32,ptend_t_33,ptend_t_34,ptend_t_35,ptend_t_36,…,ptend_v_31,ptend_v_32,ptend_v_33,ptend_v_34,ptend_v_35,ptend_v_36,ptend_v_37,ptend_v_38,ptend_v_39,ptend_v_40,ptend_v_41,ptend_v_42,ptend_v_43,ptend_v_44,ptend_v_45,ptend_v_46,ptend_v_47,ptend_v_48,ptend_v_49,ptend_v_50,ptend_v_51,ptend_v_52,ptend_v_53,ptend_v_54,ptend_v_55,ptend_v_56,ptend_v_57,ptend_v_58,ptend_v_59,cam_out_NETSW,cam_out_FLWDS,cam_out_PRECSC,cam_out_PRECC,cam_out_SOLS,cam_out_SOLL,cam_out_SOLSD,cam_out_SOLLD
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.046274,1.007117,1.011479,1.005761,0.99895,0.995492,0.995733,0.996317,0.99605,0.995646,0.995601,0.995819,0.996633,0.996386,0.996923,1.003405,1.040239,1.056278,1.027009,1.019531,1.010373,1.021547,1.036911,1.047699,1.048465,1.037275,1.02597,1.016581,1.011718,1.009276,1.007124,1.005025,1.004665,1.0043,1.004898,1.005166,1.004857,…,1.010913,1.014106,1.015672,1.016246,1.011444,1.009567,1.014182,1.015702,1.017372,1.017778,1.019712,1.019894,1.020911,1.021947,1.022461,1.020504,1.018023,1.015712,1.013176,1.011682,1.013197,1.020688,1.028576,1.0238,1.015366,1.009482,1.008214,1.009326,1.017949,1.182756,4.982298,1.060937,1.067328,1.142551,1.150654,1.229466,1.160102


In [10]:
# Inputs are stored under an "x_" file prefix, everything else under "y_".
prefix = "x_" if target == "input" else "y_"

# Write each statistic frame to "<data_dir>/<prefix><name>.parquet".
for _stat_name, _stat_frame in (
    ("mean", mean_df),
    ("std", std_df),
    ("q2", q2_df),
    ("q3", q3_df),
    ("q4", q4_df),
):
    _stat_frame.write_parquet(f"{data_dir}/{prefix}{_stat_name}.parquet")

In [27]:
# Inspect feat_stats_df.
# NOTE(review): the recorded output has list-valued columns, which no
# uncommented cell produces — presumably left over from a run of the
# commented-out concat_list cell; confirm.
feat_stats_df

stats,state_ps,pbuf_SOLIN,pbuf_LHFLX,pbuf_SHFLX,pbuf_TAUX,pbuf_TAUY,pbuf_COSZRS,cam_in_ALDIF,cam_in_ALDIR,cam_in_ASDIF,cam_in_ASDIR,cam_in_LWUP,cam_in_ICEFRAC,cam_in_LANDFRAC,cam_in_OCNFRAC,cam_in_SNOWHLAND,state_t,state_q0001,state_q0002,state_q0003,state_u,state_v,pbuf_ozone,pbuf_CH4,pbuf_N2O
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[f64],list[f64]
"""mean""",98596.753302,349.574107,70.217865,19.635538,-0.001617,0.006534,0.249722,0.563013,0.578684,0.560561,0.577075,394.053242,0.039658,0.299548,0.660795,0.052935,"[217.624071, 230.399096, … 286.524676]","[0.000001, 0.000001, … 0.009108]","[4.3187e-34, 4.2736e-34, … 0.000004]","[4.6433e-13, 4.8620e-13, … 0.000002]","[5.643097, 4.856036, … -0.032662]","[5.87136, 2.346457, … -0.413235]","[2.8092e-7, 5.0594e-7, … 4.4878e-8]","[1.3595e-7, 1.5698e-7, … 9.9861e-7]","[2.4710e-8, 3.0938e-8, … 4.9086e-7]"
"""std""",4281.44446,451.500068,80.06469,52.737542,0.073231,0.066127,0.322532,0.430416,0.417756,0.449322,0.436952,76.000813,0.14721,0.389728,0.413036,0.208154,"[5.911469, 7.881941, … 14.423179]","[9.1574e-9, 1.0014e-8, … 0.005518]","[1.7905e-35, 2.3907e-35, … 0.000014]","[2.9706e-14, 4.0427e-14, … 0.000007]","[39.261531, 38.160153, … 4.899528]","[22.313432, 16.557431, … 4.196683]","[2.9091e-8, 5.2395e-8, … 2.0310e-8]","[6.7360e-8, 7.6307e-8, … 2.1176e-22]","[1.2311e-8, 1.4957e-8, … 0.0]"
"""q1_4""",97644.601762,0.0,7.708939,-1.869708,-0.024017,-0.014355,0.0,0.061828,0.128667,0.06,0.081895,341.146872,0.0,0.0,0.169668,0.0,"[214.466538, 225.190072, … 277.886281]","[0.000001, 0.000001, … 0.004103]","[4.2023e-34, 4.0893e-34, … 1.2309e-8]","[4.4197e-13, 4.5914e-13, … 0.0]","[-24.595596, -25.329931, … -3.593738]","[-7.697528, -7.378908, … -3.0184]","[2.5644e-7, 4.6186e-7, … 2.6914e-8]","[8.3876e-8, 1.0119e-7, … 9.9861e-7]","[1.4966e-8, 1.9497e-8, … 4.9086e-7]"
"""q2_4""",100052.912057,9.01463,44.678269,5.372208,0.000891,0.001966,0.006446,0.6789,0.673715,0.82385,0.836097,416.213761,0.0,0.031226,0.94313,0.0,"[216.921706, 230.500589, … 291.490732]","[0.000001, 0.000001, … 0.008628]","[4.3825e-34, 4.3400e-34, … 1.0297e-7]","[4.6582e-13, 4.8649e-13, … 1.1918e-11]","[3.857327, 0.627279, … -0.221262]","[4.700998, 2.219141, … -0.461416]","[2.9452e-7, 5.3045e-7, … 3.8275e-8]","[1.7047e-7, 1.9578e-7, … 9.9861e-7]","[2.7090e-8, 3.3989e-8, … 4.9086e-7]"
"""q3_4""",101171.068774,693.719003,108.966804,17.634344,0.027608,0.023579,0.495651,1.0,1.0,1.0,1.0,456.510835,0.0,0.584406,1.0,0.0,"[220.26758, 235.729563, … 298.013045]","[0.000001, 0.000001, … 0.014505]","[4.4589e-34, 4.4848e-34, … 0.000001]","[4.8625e-13, 5.1613e-13, … 7.9558e-8]","[33.068731, 31.326169, … 3.231524]","[18.362083, 11.952884, … 2.312402]","[3.0137e-7, 5.4278e-7, … 6.4872e-8]","[1.8082e-7, 2.0767e-7, … 9.9861e-7]","[3.4248e-8, 4.2493e-8, … 4.9086e-7]"


In [68]:
def _load_center_and_scale(method, path, prefix, cols, eps):
    """Read the per-column offset and eps-floored divisor for one column group."""
    def read_stat(stat):
        return (
            pl.read_parquet(Path(path, f"{prefix}_{stat}.parquet"))
            .select(cols)
            .to_numpy()
        )

    if method == "standard":
        center = read_stat("mean")
        scale = read_stat("std")
    else:  # "robust": centre on the q3 file, scale by (q4 file - q2 file)
        lower = read_stat("q2")
        center = read_stat("q3")
        scale = read_stat("q4") - lower
    # Floor the divisor once so the forward and reverse transforms share it.
    return center, np.maximum(scale, eps)


def normalize(df, feat_cols, label_cols, method, path, reverse=False, eps=1e-8):
    """Scale (or un-scale) feature and label columns using saved statistics.

    Statistics are read from parquet files in ``path``: ``x_*.parquet`` for
    ``feat_cols`` and ``y_*.parquet`` for ``label_cols``.

    * ``"standard"``: ``(value - mean) / std`` using ``*_mean`` / ``*_std``.
    * ``"robust"``: ``(value - q3) / (q4 - q2)`` using ``*_q2`` / ``*_q3`` /
      ``*_q4``.

    In both methods the divisor is floored at ``eps``. The same floored
    divisor is used when ``reverse=True``, so the reverse transform is the
    exact inverse of the forward one even for (near-)constant columns.
    Previously the reverse pass multiplied by the un-floored value and did not
    undo the forward pass for such columns.

    Args:
        df: Frame containing the columns to transform.
        feat_cols: Feature column names, or ``None`` to leave features as-is.
        label_cols: Label column names, or ``None`` to leave labels as-is.
        method: ``"standard"`` or ``"robust"``.
        path: Directory containing the statistics parquet files.
        reverse: If true, map normalised values back to the original scale.
        eps: Lower bound applied to the divisor.

    Returns:
        ``df`` with the selected columns replaced by their transformed values.

    Raises:
        NotImplementedError: If ``method`` is not a supported method.
    """
    if method not in ("standard", "robust"):
        raise NotImplementedError(f"unknown normalization method: {method!r}")

    replacements = []
    for prefix, cols in (("x", feat_cols), ("y", label_cols)):
        if cols is None:
            continue
        center, scale = _load_center_and_scale(method, path, prefix, cols, eps)
        # Always read from the caller's frame so the label transform never
        # sees already-transformed feature values.
        mat = df.select(cols).to_numpy()
        mat = mat * scale + center if reverse else (mat - center) / scale
        replacements.append(
            [pl.lit(mat[:, i]).alias(col) for i, col in enumerate(cols)]
        )

    for exprs in replacements:
        df = df.with_columns(exprs)
    return df

In [69]:
# Load a single raw training shard from data_dir (rather than a second
# hard-coded copy of the folder path) to try the normalisation on.
df = pl.read_parquet(Path(data_dir, "raw_train_0_100041.parquet"))

In [70]:
# Standardise inputs and targets. Uses the column lists and data_dir defined
# at the top of the notebook: the IN_*/OUT_* constants are only imported in
# commented-out cells, so they are undefined on a fresh kernel.
df2 = normalize(df, feat_cols, label_cols, "standard", data_dir, reverse=False)

shape: (100_042, 925)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ sample_id ┆ state_t_0 ┆ state_t_1 ┆ state_t_2 ┆ … ┆ cam_out_S ┆ cam_out_S ┆ cam_out_S ┆ cam_out_ │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ OLS       ┆ OLL       ┆ OLSD      ┆ SOLLD    │
│ str       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ train_0   ┆ 213.80611 ┆ 222.24445 ┆ 229.25987 ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│           ┆ 7         ┆ 4         ┆ 9         ┆   ┆           ┆           ┆           ┆          │
│ train_1   ┆ 213.17743 ┆ 225.85106 ┆ 229.66361 ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│           ┆           ┆ 4         ┆ 8         ┆   ┆           ┆    

ValueError: can only call `.item()` if the dataframe is of shape (1, 1), or if explicit row/col values are provided; frame has shape (1, 24)

In [None]:
# Undo the scaling applied to produce df2. df2 was built with the "standard"
# method, so the reverse pass must use the same method for df3 to recover df.
df3 = normalize(df2, feat_cols, label_cols, "standard", data_dir, reverse=True)

In [98]:
# Preview the first rows of df.
df.head()

sample_id,state_t_0,state_t_1,state_t_2,state_t_3,state_t_4,state_t_5,state_t_6,state_t_7,state_t_8,state_t_9,state_t_10,state_t_11,state_t_12,state_t_13,state_t_14,state_t_15,state_t_16,state_t_17,state_t_18,state_t_19,state_t_20,state_t_21,state_t_22,state_t_23,state_t_24,state_t_25,state_t_26,state_t_27,state_t_28,state_t_29,state_t_30,state_t_31,state_t_32,state_t_33,state_t_34,state_t_35,…,ptend_v_31,ptend_v_32,ptend_v_33,ptend_v_34,ptend_v_35,ptend_v_36,ptend_v_37,ptend_v_38,ptend_v_39,ptend_v_40,ptend_v_41,ptend_v_42,ptend_v_43,ptend_v_44,ptend_v_45,ptend_v_46,ptend_v_47,ptend_v_48,ptend_v_49,ptend_v_50,ptend_v_51,ptend_v_52,ptend_v_53,ptend_v_54,ptend_v_55,ptend_v_56,ptend_v_57,ptend_v_58,ptend_v_59,cam_out_NETSW,cam_out_FLWDS,cam_out_PRECSC,cam_out_PRECC,cam_out_SOLS,cam_out_SOLL,cam_out_SOLSD,cam_out_SOLLD
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""train_0""",213.806117,222.244454,229.259879,245.743959,258.337482,265.772467,263.978456,254.879872,243.946147,236.70699,230.934476,226.459626,222.014096,216.91297,210.688514,205.181518,201.649949,199.354256,196.582987,198.08732,200.30267,204.5107,209.294262,214.299322,219.180883,223.644692,227.899269,231.883467,235.859155,239.75449,243.663599,247.494148,251.145756,254.734832,258.118795,261.322883,…,0.077274,0.050239,0.003729,-0.010824,-0.014926,-0.021633,0.00233,0.021499,-0.002053,0.019015,-0.003287,-0.022154,-0.029056,0.002452,-0.073034,-0.156892,-0.047535,-0.382321,-0.357464,-1.05782,-0.368499,0.981896,0.725763,0.257836,0.099358,0.007303,-0.041045,-0.025254,0.040925,0.0,4.8528,0.0,3.6e-05,0.0,0.0,0.0,0.0
"""train_1""",213.17743,225.851064,229.663618,246.828333,261.026415,269.078431,267.736565,256.896227,244.169421,236.844423,231.586369,227.873491,224.125186,219.864133,214.768666,209.773682,206.593758,205.144601,202.21968,203.165579,203.691769,206.691885,210.018967,213.651746,217.22671,220.732834,224.271241,227.879259,231.523113,235.206556,238.911192,242.569836,246.095623,249.661886,253.170195,256.545214,…,0.028818,-0.248265,0.067292,-0.067687,-0.069926,0.047825,0.019616,-0.042126,0.051219,0.026741,-0.017223,-0.132325,-0.002659,0.120503,-0.11757,-0.306287,-0.354408,-0.138098,-0.476167,-0.764134,-0.376146,0.677455,0.838971,0.430475,0.134241,0.025993,-0.031359,0.014665,0.02915,0.0,4.653445,0.0,0.03835,0.0,0.0,0.0,0.0
"""train_2""",217.105685,220.448106,234.653398,244.422951,254.023818,259.651472,257.709514,251.064513,241.8796,234.487318,228.294373,223.660529,219.108751,214.820508,208.639566,201.055995,193.462408,190.267832,188.690119,189.957238,196.482059,203.461783,210.517854,217.264497,223.298206,228.871994,234.369621,239.208271,243.596733,247.688112,251.639426,255.233656,258.531051,261.569665,264.470169,267.068563,…,-0.235248,-0.08398,-0.191327,-0.086259,-0.099767,0.019407,-0.023919,-0.041256,0.063648,-0.054575,-0.038737,-0.06728,-0.030177,0.040633,0.11985,0.414091,0.795404,0.704266,0.872761,0.239491,-0.227583,-0.804225,-1.448928,-1.190546,-0.677759,-0.430936,-0.129996,0.06448,1.806126,0.0,5.576699,0.0,0.090722,0.0,0.0,0.0,0.0
"""train_3""",217.773994,225.611775,234.104091,247.745365,257.411402,263.470947,261.131775,253.30325,242.316814,234.396266,227.95502,223.999858,219.658845,215.24492,210.214695,204.137721,196.509274,191.893671,189.929401,190.806367,196.69688,203.68075,210.684974,217.256992,223.168849,228.660408,233.845497,238.535216,242.74732,246.688901,250.429055,253.907015,257.260424,260.436627,263.363255,266.154815,…,-0.208491,-0.300406,-0.113838,-0.010512,-0.002416,-0.023076,0.039271,0.002193,-0.014658,0.020225,-0.022797,0.022293,0.099048,-0.006137,0.169554,0.333702,0.166173,-0.105038,-0.176849,-0.075022,0.202431,0.540401,-0.051315,-0.306277,-0.27705,-0.199926,-0.199002,-0.373422,0.297932,0.0,5.556165,0.0,0.309659,0.0,0.0,0.0,0.0
"""train_4""",216.349337,230.526083,233.650252,248.196013,262.50073,270.055663,268.863606,258.161645,244.44262,236.779096,231.508378,227.968412,224.863747,221.415977,217.325766,213.113691,209.607872,207.829591,204.866553,205.55398,205.036638,207.604275,210.501407,213.802681,217.053629,220.416544,223.803482,227.157857,230.464429,233.723924,236.938842,240.118405,243.26515,246.338046,249.61741,253.012313,…,-0.068307,-0.02148,0.012922,0.021751,-0.053205,-0.206038,0.143681,-0.007349,-0.005018,0.010547,-0.024656,0.022587,-0.003921,0.02003,-0.005201,0.037001,0.049513,0.08375,0.044469,-0.106598,-0.221449,-0.242982,0.355647,0.309179,-0.036999,-0.077653,-0.045229,0.069254,-0.186223,0.0,4.469592,0.0,0.004127,0.0,0.0,0.0,0.0


In [99]:
# Show the first five rows of df2, the frame that is fed to
# normalize(..., reverse=True) to produce df3.
df2.head(5)

sample_id,state_t_0,state_t_1,state_t_2,state_t_3,state_t_4,state_t_5,state_t_6,state_t_7,state_t_8,state_t_9,state_t_10,state_t_11,state_t_12,state_t_13,state_t_14,state_t_15,state_t_16,state_t_17,state_t_18,state_t_19,state_t_20,state_t_21,state_t_22,state_t_23,state_t_24,state_t_25,state_t_26,state_t_27,state_t_28,state_t_29,state_t_30,state_t_31,state_t_32,state_t_33,state_t_34,state_t_35,…,ptend_v_31,ptend_v_32,ptend_v_33,ptend_v_34,ptend_v_35,ptend_v_36,ptend_v_37,ptend_v_38,ptend_v_39,ptend_v_40,ptend_v_41,ptend_v_42,ptend_v_43,ptend_v_44,ptend_v_45,ptend_v_46,ptend_v_47,ptend_v_48,ptend_v_49,ptend_v_50,ptend_v_51,ptend_v_52,ptend_v_53,ptend_v_54,ptend_v_55,ptend_v_56,ptend_v_57,ptend_v_58,ptend_v_59,cam_out_NETSW,cam_out_FLWDS,cam_out_PRECSC,cam_out_PRECC,cam_out_SOLS,cam_out_SOLL,cam_out_SOLSD,cam_out_SOLLD
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""train_0""",-0.713707,-0.752541,-1.055099,-0.327045,0.233787,0.686094,0.798901,0.781238,0.757878,0.736784,0.727926,0.716836,0.681739,0.508096,0.259737,0.076422,0.052921,0.03848,-0.071003,-0.111658,-0.232328,-0.347658,-0.533177,-0.205376,-0.068896,-0.017096,0.004964,0.016984,0.033833,0.049833,0.067759,0.084918,0.101094,0.119905,0.136837,0.151588,…,0.74192,0.47553,0.034847,-0.101688,-0.147458,-0.240753,0.028156,0.288896,-0.029141,0.258235,-0.04196,-0.251296,-0.283825,0.020978,-0.539086,-1.021146,-0.277952,-2.027258,-1.714103,-4.517254,-1.406403,3.34451,2.076213,0.617423,0.202631,0.012774,-0.086667,-0.050205,0.076582,-4.3e-05,-0.078884,0.0,-0.205722,-1.8982999999999999e-22,-9.0914e-20,-0.014807,-0.027857
"""train_1""",-0.835263,-0.42127,-1.00876,-0.173034,0.666733,1.193321,1.269759,1.029124,0.787481,0.75581,0.818148,0.905947,0.933945,0.828958,0.631217,0.396814,0.302135,0.279706,0.20511,0.188219,0.112203,0.106871,-0.267726,-0.325049,-0.285604,-0.254113,-0.236872,-0.216655,-0.196311,-0.176892,-0.159134,-0.143924,-0.130515,-0.111664,-0.090103,-0.069783,…,0.27668,-2.349927,0.628888,-0.635902,-0.690831,0.532246,0.237049,-0.566073,0.727144,0.363152,-0.219879,-1.50101,-0.025976,1.031099,-0.867826,-1.99351,-2.073169,-0.732086,-2.283334,-3.263112,-1.435591,2.306751,2.402319,1.042922,0.279261,0.052405,-0.066243,0.025775,0.055748,-4.3e-05,-0.21236,0.0,-0.03209,-1.8982999999999999e-22,-9.0914e-20,-0.014807,-0.027857
"""train_2""",-0.075743,-0.917538,-0.436065,-0.514665,-0.460758,-0.253036,0.013458,0.312187,0.483888,0.42949,0.362534,0.342445,0.334645,0.280596,0.073189,-0.211412,-0.359809,-0.340061,-0.457635,-0.591748,-0.620726,-0.566238,-0.084989,0.342593,0.387695,0.408391,0.436265,0.444375,0.444522,0.445341,0.448547,0.444588,0.439797,0.431902,0.428106,0.41781,…,-2.258656,-0.794907,-1.788087,-0.810387,-0.985648,0.215979,-0.289047,-0.554387,0.903598,-0.741143,-0.494533,-0.763178,-0.294772,0.347683,0.884656,2.695192,4.653256,3.735176,4.185367,1.022708,-0.868588,-2.743905,-4.188164,-2.952371,-1.504519,-0.916448,-0.274234,0.12059,3.199785,-4.3e-05,0.405792,0.0,0.205251,-1.8982999999999999e-22,-9.0914e-20,-0.014807,-0.027857
"""train_3""",0.053474,-0.443249,-0.499111,-0.04279,0.084678,0.332977,0.442237,0.587412,0.541855,0.416885,0.315567,0.387832,0.400363,0.326739,0.216598,0.003597,-0.206218,-0.272328,-0.396929,-0.541606,-0.598887,-0.520608,-0.023775,0.341206,0.37335,0.391169,0.401328,0.405103,0.399437,0.395528,0.390761,0.382936,0.381524,0.380181,0.377344,0.375472,…,-2.001763,-2.843454,-1.063903,-0.098755,-0.023872,-0.256815,0.474554,0.029473,-0.208089,0.274656,-0.291036,0.252878,0.967539,-0.052511,1.251532,2.171962,0.972242,-0.556759,-0.847973,-0.320371,0.772594,1.83957,-0.162223,-0.772935,-0.624252,-0.426623,-0.419742,-0.712891,0.531308,-4.3e-05,0.392044,0.0,1.197437,-1.8982999999999999e-22,-9.0914e-20,-0.014807,-0.027857
"""train_4""",-0.221981,0.008135,-0.551199,0.021215,0.904113,1.343255,1.410967,1.184691,0.823702,0.746766,0.807354,0.918643,1.022179,0.99768,0.86403,0.629844,0.454075,0.391563,0.334767,0.329256,0.24892,0.297,-0.091014,-0.297156,-0.304798,-0.279858,-0.268052,-0.258748,-0.252503,-0.250804,-0.253299,-0.257846,-0.260325,-0.263391,-0.253031,-0.233477,…,-0.655832,-0.203317,0.120762,0.204348,-0.525639,-2.293022,1.736272,-0.098751,-0.071245,0.143234,-0.314777,0.256216,-0.038297,0.171387,-0.038391,0.24084,0.289778,0.44443,0.213341,-0.455211,-0.845176,-0.830775,1.010063,0.743965,-0.096915,-0.167361,-0.09549,0.129677,-0.325315,-4.3e-05,-0.335456,0.0,-0.187182,-1.8982999999999999e-22,-9.0914e-20,-0.014807,-0.027857


In [100]:
# Show the first five rows of df3 (the result of the reverse normalize call)
# so they can be compared by eye against df.head().
df3.head(5)

sample_id,state_t_0,state_t_1,state_t_2,state_t_3,state_t_4,state_t_5,state_t_6,state_t_7,state_t_8,state_t_9,state_t_10,state_t_11,state_t_12,state_t_13,state_t_14,state_t_15,state_t_16,state_t_17,state_t_18,state_t_19,state_t_20,state_t_21,state_t_22,state_t_23,state_t_24,state_t_25,state_t_26,state_t_27,state_t_28,state_t_29,state_t_30,state_t_31,state_t_32,state_t_33,state_t_34,state_t_35,…,ptend_v_31,ptend_v_32,ptend_v_33,ptend_v_34,ptend_v_35,ptend_v_36,ptend_v_37,ptend_v_38,ptend_v_39,ptend_v_40,ptend_v_41,ptend_v_42,ptend_v_43,ptend_v_44,ptend_v_45,ptend_v_46,ptend_v_47,ptend_v_48,ptend_v_49,ptend_v_50,ptend_v_51,ptend_v_52,ptend_v_53,ptend_v_54,ptend_v_55,ptend_v_56,ptend_v_57,ptend_v_58,ptend_v_59,cam_out_NETSW,cam_out_FLWDS,cam_out_PRECSC,cam_out_PRECC,cam_out_SOLS,cam_out_SOLL,cam_out_SOLSD,cam_out_SOLLD
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""train_0""",213.806117,222.244454,229.259879,245.743959,258.337482,265.772467,263.978456,254.879872,243.946147,236.70699,230.934476,226.459626,222.014096,216.91297,210.688514,205.181518,201.649949,199.354256,196.582987,198.08732,200.30267,204.5107,209.294262,214.299322,219.180883,223.644692,227.899269,231.883467,235.859155,239.75449,243.663599,247.494148,251.145756,254.734832,258.118795,261.322883,…,0.077274,0.050239,0.003729,-0.010824,-0.014926,-0.021633,0.00233,0.021499,-0.002053,0.019015,-0.003287,-0.022154,-0.029056,0.002452,-0.073034,-0.156892,-0.047535,-0.382321,-0.357464,-1.05782,-0.368499,0.981896,0.725763,0.257836,0.099358,0.007303,-0.041045,-0.025254,0.040925,0.0,4.8528,0.0,3.6e-05,0.0,0.0,0.0,0.0
"""train_1""",213.17743,225.851064,229.663618,246.828333,261.026415,269.078431,267.736565,256.896227,244.169421,236.844423,231.586369,227.873491,224.125186,219.864133,214.768666,209.773682,206.593758,205.144601,202.21968,203.165579,203.691769,206.691885,210.018967,213.651746,217.22671,220.732834,224.271241,227.879259,231.523113,235.206556,238.911192,242.569836,246.095623,249.661886,253.170195,256.545214,…,0.028818,-0.248265,0.067292,-0.067687,-0.069926,0.047825,0.019616,-0.042126,0.051219,0.026741,-0.017223,-0.132325,-0.002659,0.120503,-0.11757,-0.306287,-0.354408,-0.138098,-0.476167,-0.764134,-0.376146,0.677455,0.838971,0.430475,0.134241,0.025993,-0.031359,0.014665,0.02915,0.0,4.653445,0.0,0.03835,0.0,0.0,0.0,0.0
"""train_2""",217.105685,220.448106,234.653398,244.422951,254.023818,259.651472,257.709514,251.064513,241.8796,234.487318,228.294373,223.660529,219.108751,214.820508,208.639566,201.055995,193.462408,190.267832,188.690119,189.957238,196.482059,203.461783,210.517854,217.264497,223.298206,228.871994,234.369621,239.208271,243.596733,247.688112,251.639426,255.233656,258.531051,261.569665,264.470169,267.068563,…,-0.235248,-0.08398,-0.191327,-0.086259,-0.099767,0.019407,-0.023919,-0.041256,0.063648,-0.054575,-0.038737,-0.06728,-0.030177,0.040633,0.11985,0.414091,0.795404,0.704266,0.872761,0.239491,-0.227583,-0.804225,-1.448928,-1.190546,-0.677759,-0.430936,-0.129996,0.06448,1.806126,0.0,5.576699,0.0,0.090722,0.0,0.0,0.0,0.0
"""train_3""",217.773994,225.611775,234.104091,247.745365,257.411402,263.470947,261.131775,253.30325,242.316814,234.396266,227.95502,223.999858,219.658845,215.24492,210.214695,204.137721,196.509274,191.893671,189.929401,190.806367,196.69688,203.68075,210.684974,217.256992,223.168849,228.660408,233.845497,238.535216,242.74732,246.688901,250.429055,253.907015,257.260424,260.436627,263.363255,266.154815,…,-0.208491,-0.300406,-0.113838,-0.010512,-0.002416,-0.023076,0.039271,0.002193,-0.014658,0.020225,-0.022797,0.022293,0.099048,-0.006137,0.169554,0.333702,0.166173,-0.105038,-0.176849,-0.075022,0.202431,0.540401,-0.051315,-0.306277,-0.27705,-0.199926,-0.199002,-0.373422,0.297932,0.0,5.556165,0.0,0.309659,0.0,0.0,0.0,0.0
"""train_4""",216.349337,230.526083,233.650252,248.196013,262.50073,270.055663,268.863606,258.161645,244.44262,236.779096,231.508378,227.968412,224.863747,221.415977,217.325766,213.113691,209.607872,207.829591,204.866553,205.55398,205.036638,207.604275,210.501407,213.802681,217.053629,220.416544,223.803482,227.157857,230.464429,233.723924,236.938842,240.118405,243.26515,246.338046,249.61741,253.012313,…,-0.068307,-0.02148,0.012922,0.021751,-0.053205,-0.206038,0.143681,-0.007349,-0.005018,0.010547,-0.024656,0.022587,-0.003921,0.02003,-0.005201,0.037001,0.049513,0.08375,0.044469,-0.106598,-0.221449,-0.242982,0.355647,0.309179,-0.036999,-0.077653,-0.045229,0.069254,-0.186223,0.0,4.469592,0.0,0.004127,0.0,0.0,0.0,0.0


In [23]:
# df["cam_out_SOLS"].describe()

In [24]:
# df.filter(pl.col("cam_out_SOLS") > 0).sort("cam_out_SOLS")#[0, "cam_out_SOLS"]

In [25]:
# df.filter(pl.col("sample_id") == "train_85360")

In [49]:
# Directory the *_mean.parquet files in the following cells are read from
# (a relative path, resolved against the notebook's working directory).
data_dir = "../../data/leap/"

In [52]:
# Read x_mean.parquet from data_dir and display its "pbuf_CH4_10" column.
pl.read_parquet(Path(data_dir) / "x_mean.parquet").get_column("pbuf_CH4_10")

pbuf_CH4_10
f64
4.8321e-07


In [122]:
# Read y_mean.parquet from data_dir and display its "ptend_t_59" column.
pl.read_parquet(Path(data_dir) / "y_mean.parquet").get_column("ptend_t_59")

ptend_t_59
f64
-0.856929
