In [46]:
import random

import pandas as pd
from scipy.io import arff
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

from treecombiner import TreeCombiner

In [47]:
# Seed handed to the train/validation split and to the random forest.
SEED = 14208

# Maps each supported dataset name to the name of its target column.
DATASETS_COLUMNS = dict(
    Height='childHeight',
    salary_football='Wage',
)

def process_dataset(filepath, extension, dataset_name):
    """
    Load a dataset from disk and split it into training and validation sets.

    The rows are split 80/20 with ``train_test_split`` (seeded with ``SEED``),
    the target column registered for ``dataset_name`` in ``DATASETS_COLUMNS``
    is separated from the features, and the features are expanded with
    ``pd.get_dummies``.

    Parameters:
        filepath (str): The path to the dataset file.
        extension (str): The file type, with or without the leading dot and
            in any letter case (e.g., '.arff', 'arff', '.csv', 'CSV').
        dataset_name (str): Key into ``DATASETS_COLUMNS`` selecting the
            target column.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid)``.

    Raises:
        ValueError: If ``extension`` is neither ARFF nor CSV.
        KeyError: If ``dataset_name`` is not a key of ``DATASETS_COLUMNS``.
    """
    # Data loading. The extension is normalised so that both the dotted form
    # ('.csv') and the bare form named in the error message ('csv') work.
    file_type = str(extension).lower().lstrip('.')
    if file_type == 'arff':
        data = arff.loadarff(filepath)
        df = pd.DataFrame(data[0])
    elif file_type == 'csv':
        df = pd.read_csv(filepath)
    else:
        raise ValueError("Unsupported file type. Use 'arff' or 'csv'.")

    # Data preprocessing.
    target_column = DATASETS_COLUMNS[dataset_name]
    train_df, validation_df = train_test_split(df, test_size=0.2, random_state=SEED)
    y_train, y_valid = train_df[target_column], validation_df[target_column]

    # NOTE(review): the original author marked this encoding step as
    # "to be checked"; each split is one-hot encoded independently.
    X_train = pd.get_dummies(train_df.drop(columns=target_column))
    X_valid = pd.get_dummies(validation_df.drop(columns=target_column))

    # Force the validation features onto the training columns: columns absent
    # from validation are filled with 0.
    X_train, X_valid = X_train.align(X_valid, join='left', axis=1, fill_value=0)

    return X_train, X_valid, y_train, y_valid

# Build the train/validation splits for the Height dataset.
X_train, X_valid, y_train, y_valid = process_dataset(
    filepath='Height.csv',
    extension='.csv',
    dataset_name='Height',
)

In [48]:
# Forest of depth-1 trees; the number of trees is left at the library default.
rf = RandomForestRegressor(
    max_depth=1,
    random_state=SEED,
)
rf.fit(X_train, y_train)

In [49]:
# Shuffle a *copy* of the fitted trees so the forest's own `estimators_` list
# keeps its order, and draw the permutation from a dedicated RNG seeded with
# SEED so the grouping of trees (and every result derived from it) is the same
# on each Restart & Run All.
trees = list(rf.estimators_)
random.Random(SEED).shuffle(trees)

In [50]:
# Per-estimator sample indices exposed by the fitted forest.
samples_used = rf.estimators_samples_

# Union of every index used by at least one estimator, without duplicates.
samplesidx = list({
    sampleidx
    for samples in samples_used
    for sampleidx in samples
})

# Rows of the training data selected by that union, as plain arrays.
X_union = X_train.iloc[samplesidx].to_numpy()
y_union = y_train.iloc[samplesidx].to_numpy()

In [51]:
# Consecutive slice boundaries into `trees`; group k covers
# trees[bounds[k]:bounds[k + 1]] and the final `None` leaves the last slice
# open-ended (i.e. trees[93:]).
_group_bounds = [0, 5, 11, 17, 23, 29, 35, 41, 46, 52, 58, 64, 69, 75, 81, 87, 93, None]

# One TreeCombiner per group, all sharing the same union of training rows.
(
    test_combiner_g1,
    test_combiner_g2,
    test_combiner_g3,
    test_combiner_g4,
    test_combiner_g5,
    test_combiner_g6,
    test_combiner_g7,
    test_combiner_g8,
    test_combiner_g9,
    test_combiner_g10,
    test_combiner_g11,
    test_combiner_g12,
    test_combiner_g13,
    test_combiner_g14,
    test_combiner_g15,
    test_combiner_g16,
    test_combiner_g17,
) = [
    TreeCombiner(trees[start:stop], X_union, y_union)
    for start, stop in zip(_group_bounds[:-1], _group_bounds[1:])
]

In [52]:
# Run the combination step on every group, in group order.
for _combiner in (
    test_combiner_g1,
    test_combiner_g2,
    test_combiner_g3,
    test_combiner_g4,
    test_combiner_g5,
    test_combiner_g6,
    test_combiner_g7,
    test_combiner_g8,
    test_combiner_g9,
    test_combiner_g10,
    test_combiner_g11,
    test_combiner_g12,
    test_combiner_g13,
    test_combiner_g14,
    test_combiner_g15,
    test_combiner_g16,
    test_combiner_g17,
):
    _combiner.combine_trees()

In [54]:
# NOTE(review): `mse_sum` is never read in this cell; it is kept only so that
# the name stays defined for any other cell that might reference it.
mse_sum = 0

# Every combined group takes part in the averaged prediction.
_combiners = (
    test_combiner_g1,
    test_combiner_g2,
    test_combiner_g3,
    test_combiner_g4,
    test_combiner_g5,
    test_combiner_g6,
    test_combiner_g7,
    test_combiner_g8,
    test_combiner_g9,
    test_combiner_g10,
    test_combiner_g11,
    test_combiner_g12,
    test_combiner_g13,
    test_combiner_g14,
    test_combiner_g15,
    test_combiner_g16,
    test_combiner_g17,
)

pred_new = []
# Walk over every validation row instead of a hard-coded row count, so the
# cell keeps working (and `pred_new` stays the same length as `y_valid`) when
# the dataset or the split size changes.
for test_value, test_value_correct in zip(X_valid.values, y_valid.values):
    # Mean of the per-group predictions; the divisor follows the number of
    # combiners rather than a literal.
    sum_predict = sum(
        combiner.predict(test_value) for combiner in _combiners
    ) / len(_combiners)
    pred_new.append(sum_predict)
    print(f"Prediction: {sum_predict} | Value: {test_value_correct}")


Prediction: 64.11197771587752 | Value: 63.0
Prediction: 64.11197771587752 | Value: 69.0
Prediction: 69.12783505154641 | Value: 68.0
Prediction: 69.12783505154641 | Value: 69.0
Prediction: 69.12783505154641 | Value: 68.5
Prediction: 64.11197771587752 | Value: 68.5
Prediction: 64.11197771587752 | Value: 63.0
Prediction: 64.11197771587752 | Value: 65.5
Prediction: 64.11197771587752 | Value: 64.7
Prediction: 69.12783505154641 | Value: 69.2
Prediction: 69.12783505154641 | Value: 67.0
Prediction: 64.11197771587752 | Value: 63.5
Prediction: 69.12783505154641 | Value: 72.0
Prediction: 64.11197771587752 | Value: 61.0
Prediction: 64.11197771587752 | Value: 62.5
Prediction: 64.11197771587752 | Value: 62.5
Prediction: 69.12783505154641 | Value: 66.5
Prediction: 69.12783505154641 | Value: 66.0
Prediction: 64.11197771587752 | Value: 63.0
Prediction: 64.11197771587752 | Value: 62.5
Prediction: 69.12783505154641 | Value: 74.2
Prediction: 69.12783505154641 | Value: 70.0
Prediction: 64.11197771587752 | 

In [55]:
# Compare the original forest against the averaged combined-tree predictions
# on the validation set. Arguments follow the (y_true, y_pred) order of
# `mean_squared_error`, imported as `mse` in the notebook's import cell.
pred = rf.predict(X_valid)
print(f'MSE OLD = {mse(y_valid, pred)}')
print(f'MSE NEW = {mse(y_valid, pred_new)}')

MSE OLD = 5.062796691552399
MSE NEW = 5.067785803720847
