In [1]:
from utils import *
import os.path
from scipy.stats import qmc
from tqdm import tqdm
from pymoo.operators.sampling.lhs import LHS
import polars as pl

In [2]:
def get_functions(seed=42, n_transforms=300):
    """Yield ``(name, function)`` pairs for randomly composed summary features.

    Every feature has the form ``agg(transform(scalar * y))`` where the
    scalar, the element-wise transform and the aggregation are each drawn at
    random from a fixed pool. Names follow the pattern
    ``s_<scalar>__t_<transform>__a_<agg>`` and are unique within one call.

    Parameters
    ----------
    seed : int
        Seed of the private random generator; the same seed always yields the
        same sequence of features.
    n_transforms : int
        Number of distinct features to generate.

    Yields
    ------
    tuple[str, callable]
        The feature name and a function taking a numeric NumPy array ``y``
        and returning the aggregated value.

    Raises
    ------
    ValueError
        If ``n_transforms`` is larger than the number of distinct
        scalar/transform/aggregation combinations. Because this is a
        generator, the error surfaces when iteration starts.
    """
    def power16(y):
        return np.power(y, 1/6)

    def power13(y):
        return np.power(y, 1/3)

    def power12(y):
        return np.power(y, 1/2)

    def power2(y):
        return np.power(y, 2)

    def quantile_25(y):
        return np.quantile(y, 0.25)

    def quantile_75(y):
        return np.quantile(y, 0.75)

    def quantile_05(y):
        return np.quantile(y, 0.05)

    def quantile_95(y):
        return np.quantile(y, 0.95)

    # The content and order of each pool is part of the seeded sequence:
    # reordering or extending a pool changes which features a seed produces.
    scalars = [0.2, 0.3, 0.5, 0.7, 1, 2, 3, 5, 7, 9]
    transforms = [np.sin, np.cos, power16, power13, power12, power2, np.log1p]
    aggs = [np.mean, np.median, np.std, quantile_25, quantile_75, quantile_05, quantile_95]

    # Names are rejection-sampled until unseen, so asking for more names than
    # there are combinations would spin forever; fail loudly instead.
    n_combinations = len(scalars) * len(transforms) * len(aggs)
    if n_transforms > n_combinations:
        raise ValueError(
            f'n_transforms={n_transforms} exceeds the {n_combinations} distinct '
            'scalar/transform/aggregation combinations'
        )

    # A private legacy RandomState draws the same stream as
    # np.random.seed(seed) + np.random.choice, but leaves the process-wide
    # generator untouched.
    rng = np.random.RandomState(seed)
    generated_names = set()

    for _ in range(n_transforms):
        while True:
            scalar = rng.choice(scalars)
            transform = rng.choice(transforms)
            agg = rng.choice(aggs)

            name = f's_{scalar}__t_{transform.__name__}__a_{agg.__name__}'

            if name not in generated_names:
                generated_names.add(name)
                break

        # Bind the current draw through default arguments so every yielded
        # function keeps its own combination after the loop moves on.
        def return_f(y, scalar=scalar, transform=transform, agg=agg):
            return agg(transform(scalar * y))

        yield name, return_f
        
        
#for name, fun in get_functions():
#    print(name)

In [3]:
# Base directory under which this notebook builds its sample and feature paths.
save_dir = 'lio_problem'
# Not defined in this notebook; presumably provided by `from utils import *`
# and creates the directory when it is missing — TODO confirm.
create_directory_if_not_exist(save_dir)

In [4]:
# Both values are only interpolated into the sample file names in this notebook.
sample_size = 250 #times dimension
problem_dim = 5

In [5]:
# Path of the `.norm` sample parquet for the configured dimension and sample size
# (presumably the variant with normalised `y` — TODO confirm).
sample_file_norm = f'{save_dir}/samples__dim_{problem_dim}__samplesize_{sample_size}.norm.parquet'

In [6]:
# Read the full sample table into memory (lazy scan, collected immediately).
df = pl.scan_parquet(sample_file_norm).collect()

In [7]:
# Show a preview of the loaded samples (columns: problem, instance, y, optimum, x_0..x_4).
df

problem,instance,y,optimum,x_0,x_1,x_2,x_3,x_4
i64,i64,f64,f64,f64,f64,f64,f64,f64
1,1,0.577262,79.48,-3.149596,1.601646,4.805492,-2.859105,2.913973
1,1,0.318008,79.48,0.78446,1.718528,3.697858,2.556515,2.751843
1,1,0.169762,79.48,-0.651181,-0.647084,-1.350508,-3.267599,-4.495132
1,1,0.458729,79.48,-3.057518,-2.772905,4.3904,-4.642849,-2.411301
1,1,0.116464,79.48,1.708115,1.237669,-0.079004,2.538148,1.013913
…,…,…,…,…,…,…,…,…
24,15,0.705769,149.81,3.550095,-2.349579,0.610341,-4.475076,-4.671802
24,15,0.460143,149.81,-2.14731,-4.21336,-0.549176,-1.346791,-0.135708
24,15,0.485692,149.81,-4.200217,2.243036,-3.145434,-0.55592,-1.302764
24,15,0.396352,149.81,-3.610818,-0.501484,-2.538735,4.095798,0.147821


In [8]:
# Directory that receives one feature parquet per (problem, instance) pair.
ela_save_dir = f'{save_dir}/dummy'
# Helper is not defined in this notebook; presumably creates the directory
# when it is missing — TODO confirm.
create_directory_if_not_exist(ela_save_dir)

In [9]:
# Materialise the generator once so the same (name, function) pairs can be
# reused for every (problem, instance) group.
methods = list(get_functions())
#methods

In [10]:
# Compute one row of features per (problem, instance) pair and cache it as its
# own parquet file, so an interrupted run can be resumed.
# Sorting by both keys makes the processing order deterministic.
group_keys = df.select(['problem', 'instance']).unique().sort(['problem', 'instance'])

for r in tqdm(group_keys.rows(named=True)):
    instance = r['instance']
    problem = r['problem']
    save_file = f'{ela_save_dir}/p_{problem}__i_{instance}.parquet'

    # Resume support: a finished file is never recomputed.
    if os.path.exists(save_file):
        continue

    sdf = df.filter((pl.col('problem') == problem) & (pl.col('instance') == instance))
    y = sdf['y'].to_numpy()

    # Feature values first, group keys last (this fixes the column order).
    row = {mname: method(y) for mname, method in methods}
    row['problem'] = problem
    row['instance'] = instance

    features = pl.DataFrame([row]).with_columns([
        pl.exclude(['problem', 'instance']).cast(pl.Float64),
        pl.col(['problem', 'instance']).cast(pl.Int64),
    ])

    # Write to a temporary name and rename afterwards: the existence check
    # above must never see a half-written file from an interrupted run.
    tmp_file = f'{save_file}.tmp'
    features.write_parquet(tmp_file)
    os.replace(tmp_file, save_file)

100%|██████████| 360/360 [00:00<00:00, 79655.49it/s]


In [11]:
# NOTE(review): `Sdsd` is not defined anywhere in this notebook, so this line
# raises NameError. It looks like a deliberate "stop here" guard that keeps
# Run All from reaching the cells below — confirm, and prefer an explicit
# `raise` with a message if so.
sdsdf=Sdsd

NameError: name 'Sdsd' is not defined

In [None]:
def dummy_features(df):
    """Compute the generated ``y`` summary features for every (problem, instance) group.

    One output column is produced per ``(name, function)`` pair yielded by
    ``get_functions()``; each function is evaluated on the group's ``y``
    values.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing at least the columns ``problem``, ``instance`` and
        ``y``.

    Returns
    -------
    pl.DataFrame
        One row per (problem, instance) pair, sorted by those keys, with one
        Float64 column per generated feature.
    """
    def on_group_values(func):
        # In a group_by/agg context polars passes the group's values to the
        # callable as a Series, while the generated functions operate on a
        # NumPy array; convert before delegating.
        def apply(group):
            return func(group.to_numpy())
        return apply

    opps = [
        pl.col("y").map_elements(on_group_values(f), return_dtype=pl.Float64).alias(n)
        for n, f in get_functions()
    ]

    features = df.group_by(["problem", "instance"]).agg(opps).sort(['problem', 'instance'])
    return features

In [None]:
# Same base directory as configured near the top of the notebook; repeated
# here so the cells below can be run on their own.
save_dir = 'lio_problem'
# Helper is not defined in this notebook; presumably creates the directory
# when it is missing — TODO confirm.
create_directory_if_not_exist(save_dir)

In [None]:
# Same values as the configuration cell near the top of the notebook; they are
# only interpolated into file names.
sample_size = 250 #times dimension
problem_dim = 5

In [None]:
# Input: the `.norm` sample parquet for the configured dimension and sample size.
sample_file_norm = f'{save_dir}/samples__dim_{problem_dim}__samplesize_{sample_size}.norm.parquet'
# Output: where the features computed from that file are written.
sample_file_norm_dummy = f'{save_dir}/samples__dim_{problem_dim}__samplesize_{sample_size}.dummy.norm.parquet'

In [None]:
# Load the `.norm` samples, keeping only rows whose instance id is at most 100.
df = (
    pl.scan_parquet(sample_file_norm)
    .filter(pl.col('instance') <= 100)
    .collect()
)
df

In [None]:
#sf=sdfs

In [None]:
# Build the per-(problem, instance) feature table and persist it as parquet.
features = dummy_features(df)
features.write_parquet(sample_file_norm_dummy)

In [None]:
# NOTE(review): looks like a "stop here" guard, but it only raises NameError
# when `sdf` is undefined. The per-instance feature loop earlier in this
# notebook assigns `sdf`, in which case this line is a silent no-op — confirm
# the intent.
sdf=sdf

In [None]:
# Input: the sample parquet without the `.norm` suffix
# (presumably the un-normalised samples — TODO confirm).
sample_file = f'{save_dir}/samples__dim_{problem_dim}__samplesize_{sample_size}.parquet'
# Output: where the features computed from that file are written.
sample_file_dummy = f'{save_dir}/samples__dim_{problem_dim}__samplesize_{sample_size}.dummy.parquet'

In [None]:
# Load the samples from `sample_file` into memory and show a preview.
# Ending the cell with the frame uses the notebook's rich table display
# instead of the plain-text dump produced by print().
df = pl.scan_parquet(sample_file).collect()
df

In [None]:
# Build the per-(problem, instance) feature table and persist it as parquet.
features = dummy_features(df)
features.write_parquet(sample_file_dummy)

In [None]:
# NOTE(review): `asdf` is not assigned anywhere else in this notebook, so this
# line raises NameError. It looks like a deliberate "stop here" guard before
# the modelling cells — confirm, and prefer an explicit `raise` with a message
# if so.
asdf=asdf

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Feature matrix: every column of the feature table except the two group keys.
# (Selecting by the group keys avoids depending on a separate list of feature
# names, which is not defined at notebook scope.)
X = features.drop(['problem', 'instance']).to_numpy()
# Target: the problem id of each (problem, instance) row.
y = features['problem'].to_numpy()

In [None]:
# Hold out 20% of the rows for evaluation. The split and the forest are both
# seeded so the reported accuracy is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier (n_jobs=-1 uses all available cores)
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Train the RandomForest model
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"Model accuracy: {accuracy:.2f}")