In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
import sys

# Put ../src (relative to the notebook's working directory) on the module
# search path; presumably this is where the `dataset` module imported in the
# next cell lives.
# The membership guard keeps the cell idempotent: re-running it does not stack
# duplicate entries on sys.path.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [3]:
from dataset import load_simple_trig_synth, load_simple_linear_synth, load_exponential_interaction_synth, load_summed_squares_exponential_synth, load_trigonometric_polynomial_synth

In [9]:
# Baseline benchmark: fit a default XGBoost classifier on every synthetic
# dataset and record its test accuracy once per seed.
# `results` maps dataset name -> list of accuracies (one entry per seed).
results = {}

# Dispatch table instead of an if/elif chain: a name without a loader can no
# longer fall through and silently reuse the previous dataset's X, y.
DATASET_LOADERS = {
    'simple_trig_synth': load_simple_trig_synth,
    'simple_linear_synth': load_simple_linear_synth,
    'exponential_interaction_synth': load_exponential_interaction_synth,
    'summed_squares_exponential_synth': load_summed_squares_exponential_synth,
    'trigonometric_polynomial_synth': load_trigonometric_polynomial_synth,
}
# Kept under its original name in case later cells refer to it.
datasets = list(DATASET_LOADERS)

N_SEEDS = 5  # number of repetitions per dataset

for dataset, load_dataset in DATASET_LOADERS.items():
    results[dataset] = []
    X, y = load_dataset()

    # One fixed 80/20 split per dataset, shared by every seed below.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for seed in range(N_SEEDS):
        # Seed NumPy's global RNG for reproducibility
        np.random.seed(seed)

        # NOTE(review): the split is fixed and the classifier uses its default
        # settings, and the accuracies recorded below this cell are identical
        # for all seeds of a dataset. The seed therefore appears to have no
        # effect here -- confirm whether seed-to-seed variance was intended
        # (e.g. by seeding the split or enabling subsampling).
        classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed)

        # Fit on the training split
        classifier.fit(X_train, y_train)

        # Predict the labels of the test set
        y_pred = classifier.predict(X_test)

        acc_score = accuracy_score(y_test, y_pred)
        print('Accuracy', acc_score)
        results[dataset].append(acc_score)

Accuracy 0.9785
Accuracy 0.9785
Accuracy 0.9785
Accuracy 0.9785
Accuracy 0.9785
Accuracy 0.995
Accuracy 0.995
Accuracy 0.995
Accuracy 0.995
Accuracy 0.995
Accuracy 0.721
Accuracy 0.721
Accuracy 0.721
Accuracy 0.721
Accuracy 0.721
Accuracy 0.6345
Accuracy 0.6345
Accuracy 0.6345
Accuracy 0.6345
Accuracy 0.6345
Accuracy 0.996
Accuracy 0.996
Accuracy 0.996
Accuracy 0.996
Accuracy 0.996


In [10]:
# Tabulate the per-seed accuracies: one row per dataset, one `seed_<i>` column
# per repetition. The column count is derived from `results` so it stays in
# step with however many seeds were actually run.
n_seeds = max((len(scores) for scores in results.values()), default=0)
df = (
    pd.DataFrame.from_dict(results, orient='index', columns=[f'seed_{i}' for i in range(n_seeds)])
    .reset_index()
    .rename(columns={'index': 'dataset'})
)

# Save to CSV.
# The path is relative rather than an absolute home-directory path so the
# notebook runs on other machines. It assumes the working directory sits one
# level below the project root (the same assumption the '../src' path setup
# makes) and that `baseline_results/` already exists -- TODO confirm.
csv_filename = '../baseline_results/xgboost_results.csv'
df.to_csv(csv_filename, index=False)

In [11]:
# Row-wise mean accuracy over the per-seed columns (label slice is inclusive
# on both ends, so it covers seed_0 through seed_4), stored as `avg_seed`.
per_seed_scores = df.loc[:, 'seed_0':'seed_4']
df['avg_seed'] = per_seed_scores.mean(axis=1)
df

Unnamed: 0,dataset,seed_0,seed_1,seed_2,seed_3,seed_4,avg_seed
0,simple_trig_synth,0.9785,0.9785,0.9785,0.9785,0.9785,0.9785
1,simple_linear_synth,0.995,0.995,0.995,0.995,0.995,0.995
2,exponential_interaction_synth,0.721,0.721,0.721,0.721,0.721,0.721
3,summed_squares_exponential_synth,0.6345,0.6345,0.6345,0.6345,0.6345,0.6345
4,trigonometric_polynomial_synth,0.996,0.996,0.996,0.996,0.996,0.996
