# Oneshot import

In [1]:
import os
import numpy as np
import pandas as pd
import json
from tabeb.interface import get_model, get_leaderboard
from tabeb.tasks import TabEBRegressionTask, TabEBClassificationTask
from sklearn.model_selection import GroupShuffleSplit
# Package directory: the "tabeb" folder that sits next to the current working directory.
ROOT_DIR = os.path.join(os.path.split(os.getcwd())[0], "tabeb")

# Default value of get_splits' num_train argument.
NUM_TRAIN = 16

In [2]:
def load_data_tmp(data_name, data_dir):
    """Read one dataset and its metadata from ``<data_dir>/<data_name>/``.

    The table is read from ``<data_name>.parquet`` and the metadata from
    ``<data_name>_metadata.json``. Missing values in the table are filled
    with ``np.nan`` before it is returned.

    Parameters
    ----------
    data_name : str
        Name of the dataset; used both as the sub-directory name and as the
        file-name stem.
    data_dir : str
        Directory that contains one sub-directory per dataset.

    Returns
    -------
    tuple
        ``(table, metadata)``: the loaded DataFrame and the parsed JSON
        metadata.
    """
    dataset_dir = os.path.join(data_dir, data_name)
    parquet_path = os.path.join(dataset_dir, f"{data_name}.parquet")
    metadata_path = os.path.join(dataset_dir, f"{data_name}_metadata.json")

    table = pd.read_parquet(parquet_path)
    table.fillna(value=np.nan, inplace=True)

    with open(metadata_path) as metadata_file:
        metadata = json.load(metadata_file)

    return table, metadata

def get_splits(data, data_metadata, num_train=NUM_TRAIN, random_state=42):
    """Split ``data`` into train/test parts without splitting an entity.

    Rows are assigned to groups: one group per entity when the metadata marks
    the dataset as ``"repeated"``, otherwise one group per row. A single
    ``GroupShuffleSplit`` then keeps every group entirely on one side.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, including the target column.
    data_metadata : dict
        Must contain ``"target_name"``. When ``"repeated"`` is truthy it must
        also contain ``"entity_name"``, the key passed to ``data.groupby``.
    num_train : int or None, default NUM_TRAIN
        Number of *groups* placed in the training set. ``None`` means 80% of
        the groups (for non-repeated data this equals 80% of the rows).
    random_state : int, default 42
        Seed forwarded to ``GroupShuffleSplit``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature rows (target column dropped) of each side.
    y_train, y_test : numpy.ndarray
        Target values of each side.

    Raises
    ------
    ValueError
        If ``num_train`` does not leave at least one group on each side.
    """
    target_name = data_metadata["target_name"]
    X = data.drop(columns=target_name)
    y = np.array(data[target_name])

    if data_metadata.get("repeated", False):
        entity_name = data_metadata["entity_name"]
    else:
        # No repeated entities: every row is its own group.
        entity_name = np.arange(len(y))

    groups = np.array(data.groupby(entity_name).ngroup())
    num_groups = len(np.unique(groups))

    # The split is counted in groups, so the default must be derived from the
    # number of groups; using the row count could exceed it for repeated data.
    if num_train is None:
        num_train = int(num_groups * 0.8)
    if not 0 < num_train < num_groups:
        raise ValueError(
            f"num_train={num_train} must be between 1 and {num_groups - 1} "
            f"(the dataset has {num_groups} groups)."
        )

    gss = GroupShuffleSplit(
        n_splits=1,
        test_size=int(num_groups - num_train),
        random_state=random_state,
    )
    # Only the length of X matters to the splitter; groups drive the split.
    idx_train, idx_test = next(iter(gss.split(X=X, groups=groups)))

    X_train, X_test = X.iloc[idx_train], X.iloc[idx_test]
    y_train, y_test = y[idx_train], y[idx_test]

    return X_train, X_test, y_train, y_test

# Data

Load the toy datasets provided with the package

In [3]:
ROOT_DIR = os.path.join(os.path.dirname(os.getcwd()), "tabeb")
data_dir = os.path.join(ROOT_DIR, "data")
data_name = "wine_pl"

# Load every toy dataset as a (dataframe, metadata) pair, keyed by dataset name.
data_dict = {
    dataset: load_data_tmp(dataset, data_dir)
    for dataset in ("wine_pl", "wine_vivino_price", "spotify")
}

# Show the metadata and the first three rows of each dataset.
for name in data_dict:
    df_data, data_metadata = data_dict[name]
    print(data_metadata)
    display(df_data.head(3))
    print()

{'name': 'wine_pl', 'task': 'regression', 'target_name': 'price', 'entity_name': 'name', 'repeated': False, 'dtype': {'name': 'object', 'price': 'float64', 'country': 'object', 'region': 'object', 'winery': 'object', 'variety': 'object', 'alcohol': 'float64', 'acidity': 'float64', 'sugar': 'float64', 'ph': 'float64', 'sulphates': 'float64', 'quality': 'float64', 'rating': 'float64', 'taste': 'float64', 'color': 'object', 'type': 'object', 'vintage': 'int64', 'harvest': 'object', 'age': 'int64', 'temperature': 'float64', 'grape': 'object', 'food': 'object', 'description': 'object', 'image': 'object'}}


Unnamed: 0,name,price,country,region,appellation,vineyard,vintage,volume,ABV,serving_temperature,wine_type,taste,style,vegan,natural,grapes
0,Szampan Moet & Chandon Brut Imperial Magnum w ...,2.823474,France,Champagne,Champagne AOC,Moet & Chandon,,1500.0,,9,,dry,average,False,False,Chardonnay
1,11 Filari Primitivo di Manduria San Marzano,2.037426,Italy,Puglia,Primitivo di Manduria DOP,,2017.0,500.0,17.5,18,red,sweet,full,False,False,Primitivo
2,Szampan Moet & Chandon Brut Imperial Jeroboam ...,3.300813,France,Champagne,Champagne AOC,Moet & Chandon,,3000.0,,9,,dry,average,False,False,Chardonnay



{'name': 'wine_vivino_price', 'task': 'regression', 'target_name': 'Price', 'entity_name': 'Name', 'repeated': False}


Unnamed: 0,Name,Region,Winery,Rating,Number_Of_Ratings,Price,Year,Wine_Type
0,Pomerol 2011,"Pomerol, France",Château La Providence,4.2,100.0,4.553877,2011,red
1,Lirac 2017,"Lirac, France",Château Mont-Redon,4.3,100.0,2.74084,2017,red
2,Erta e China Rosso di Toscana 2015,"Toscana, Italy",Renzo Masi,3.9,100.0,2.008214,2015,red



{'name': 'spotify', 'task': 'classification', 'target_name': 'popularity', 'entity_name': 'track', 'repeated': False}


Unnamed: 0,track,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,popularity,decade
0,Jealous Kind Of Fella,Garland Green,0.417,0.62,3,-7.727,Major,0.0403,0.49,0.0,0.0779,0.845,185.655,173533.0,3,32.94975,9,1,60s
1,Initials B.B.,Serge Gainsbourg,0.498,0.505,3,-12.475,Major,0.0337,0.018,0.107,0.176,0.797,101.801,213613.0,4,48.8251,10,0,60s
2,Melody Twist,Lord Melody,0.657,0.649,5,-13.392,Major,0.038,0.846,4e-06,0.119,0.908,115.94,223960.0,4,37.22663,12,0,60s





# Main

## Encoders

In [4]:
import os

# Expected layout:
#   ROOT_DIR -> "<prefix>/tabeb/tabeb"
#   data_dir -> "<prefix>/tabeb/tabeb/data"
ROOT_DIR = os.path.join(os.path.split(os.getcwd())[0], "tabeb")
data_dir = os.path.join(ROOT_DIR, "data")

Currently, there are 3 models available

In [5]:
# Build the three available encoders in one statement, in the same order as
# before; CARTE is the only one that is given the task type explicitly.
random_encoder, carte_encoder, skrub_encoder = (
    get_model("random_encoder"),
    get_model("carte_encoder", task="regression"),
    get_model("skrub_encoder"),
)

  from .autonotebook import tqdm as notebook_tqdm


Let's test the encoders

In [6]:
# Split the wine_vivino_price dataset into train/test parts; the stored
# (dataframe, metadata) pair is unpacked straight into get_splits.
X_train, X_test, y_train, y_test = get_splits(*data_dict['wine_vivino_price'])

In [7]:
# Fit each encoder on the training split and keep the resulting embedding.
random_embedding = random_encoder.fit_transform(X_train, y_train)
carte_embedding = carte_encoder.fit_transform(X_train, y_train)
skrub_embedding = skrub_encoder.fit_transform(X_train, y_train)

# Report the embedding shapes, one encoder per line.
print(
    f"Random embedding shape: {random_embedding.shape}",
    f"Carte embedding shape: {carte_embedding.shape}",
    f"Skrub embedding shape: {skrub_embedding.shape}",
    sep="\n",
)

  pretrain_model_dict = torch.load(
  scaler = amp.GradScaler()
  with amp.autocast():  # Enable autocasting
  with amp.autocast():  # Enable autocasting
Model No. xx:   9%|▉         | 46/500 [00:02<00:23, 19.47it/s]


Random embedding shape: (16, 128)
Carte embedding shape: (16, 75)
Skrub embedding shape: (16, 61)


## Tasks

To perform a task (e.g. regression, classification) in TabEB, first we get the corresponding TabEBBaseTask object: 

In [8]:
# Create one task object per task type, both reading from the same data directory.
task_regression, task_classification = (
    task_cls(data_dir=data_dir)
    for task_cls in (TabEBRegressionTask, TabEBClassificationTask)
)

These objects contain what is necessary to run the task:

- The datasets
- The evaluation protocol

Let's run the evaluations

**Attention:** in this version, for demonstration purposes, TabEB (implicitly) limits the number of observations for each dataset to 500. The results shown below are hence meaningless.

In [9]:
# Encoders to benchmark, and the directory the scores are saved to.
model_list = [random_encoder, carte_encoder, skrub_encoder]
result_dir = os.path.join(ROOT_DIR, "results")

# Run the regression evaluation first, then the classification one.
df_regression_scores, df_classification_scores = (
    task.evaluate(model_list, save_dir=result_dir)
    for task in (task_regression, task_classification)
)

  pretrain_model_dict = torch.load(
  scaler = amp.GradScaler()
  with amp.autocast():  # Enable autocasting
  with amp.autocast():  # Enable autocasting
Model No. xx:  10%|█         | 52/500 [00:01<00:13, 32.74it/s]
  pretrain_model_dict = torch.load(
  scaler = amp.GradScaler()
  with amp.autocast():  # Enable autocasting
  with amp.autocast():  # Enable autocasting
Model No. xx:  21%|██        | 104/500 [00:02<00:08, 47.89it/s]
  pretrain_model_dict = torch.load(
  scaler = amp.GradScaler()
  with amp.autocast():  # Enable autocasting
  with amp.autocast():  # Enable autocasting
Model No. xx:   9%|▉         | 47/500 [00:01<00:13, 32.83it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i

In [10]:
# Show the per-dataset scores of every encoder, one table per task type.
print("****Regression Task****")
display(df_regression_scores)
print("****Classification Task****")
display(df_classification_scores)

****Regresion Task****


Unnamed: 0,encoder,wine_pl,wine_vivino_price,regression_average
0,random_encoder,0.403011,0.984584,0.693798
1,carte_encoder,0.618307,1.430602,1.024455
2,skrub_encoder,1.888127,1.726246,1.807186


****Classification Task****


Unnamed: 0,encoder,spotify,classification_average
0,random_encoder,0.490325,0.490325
1,carte_encoder,0.557282,0.557282
2,skrub_encoder,0.564957,0.564957


## Leaderboard

In [11]:
# Build the leaderboard for both task types from the results directory.
df_leaderboard = get_leaderboard(
    ["regression", "classification"],
    result_dir,
)
display(df_leaderboard)

Unnamed: 0,encoder,regression_average,classification_average
0,carte_encoder,1.024455,0.557282
1,random_encoder,0.693798,0.490325
2,skrub_encoder,1.807186,0.564957
