<a href="https://colab.research.google.com/github/dimitarpg13/transformer_examples/blob/main/notebooks/bert/Wine_Prediction_with_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Load Libraries and Raw Datasets

## Load the parquet files with the training and test data


In [None]:
import numpy as np
import pandas as pd
import pprint
from IPython.display import display
from google.colab import drive
drive.mount('/content/drive')

#%env CUDA_LAUNCH_BLOCKING=1

TRAIN_DATASET_PATH = '/content/drive/MyDrive/prediction-with-assortment/wines/data/click_train.parquet'
TEST_DATASET_PATH = '/content/drive/MyDrive/prediction-with-assortment/wines/data/click_test.parquet'
CATALOG_DATASET_PATH = '/content/drive/MyDrive/prediction-with-assortment/wines/data/catalog.parquet'

click_train = pd.read_parquet(TRAIN_DATASET_PATH)
click_test = pd.read_parquet(TEST_DATASET_PATH)
catalog = pd.read_parquet(CATALOG_DATASET_PATH)

click_all = pd.concat([click_train, click_test], ignore_index=True)
display(click_all.head())
print(f"Number of rows in the training dataset: {len(click_train)}")
print(f"Number of rows in the test dataset: {len(click_test)}")
print(f"Number of rows in the combined dataset: {len(click_all)}")


# split into 60% training, 20% test and 20% validation
click_train_new, click_validate_new, click_test_new = \
              np.split(click_all.sample(frac=1, random_state=42),
                       [int(.6*len(click_all)), int(.8*len(click_all))])

#TODO: experiment with the new split datasets
print(f"Number of rows in the resampled training dataset: {len(click_train_new)}")
print(f"Number of rows in the resampled validation dataset: {len(click_validate_new)}")
print(f"Number of rows in the resampled test dataset: {len(click_test_new)}")



Mounted at /content/drive


Unnamed: 0,query,results,new_labels
0,italian red wine pairings for game meat,"[135856, 135834, 135833, 136365, 135999, 13761...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,vinho de portugal red wine,"[155465, 102122, 102127, 101576, 101682]","[0, 0, 0, 0, 1]"
2,white wines from venica & venica winery,"[101886, 193520, 174403, 137037, 137392]","[0, 0, 0, 1, 0]"
3,$50-$75 red wine for special occasion,"[174211, 179528, 112116, 112007, 111614]","[0, 0, 0, 0, 1]"
4,yalumba cabernet sauvignon shiraz,"[174195, 111413, 174186, 174295, 180111, 174279]","[0, 0, 0, 0, 0, 1]"


Number of rows in the training dataset: 12932
Number of rows in the test dataset: 3233
Number of rows in the combined dataset: 16165
Number of rows in the resampled training dataset: 9699
Number of rows in the resampled validation dataset: 3233
Number of rows in the resampled test dataset: 3233


  return bound(*args, **kwds)


In [None]:
!pip install transformers datasets huggingface_hub tensorboard

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

### Show the first few items of the product catalog

In [None]:

display(catalog.head())

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,...,image,url,snippet,scrape_match,full_response,size,price,priceCurrency,VintageSummary,product_embed_description
0,111654,Bandol,Red,Varietal/100%,[Mourvedre],[Beef],14.0,Medium-bodied,High,FR,...,https://cdn.klwines.com/images/skus/1269107x.jpg,https://shop.klwines.com/products/details/1269107,"More approachable, yet still with classic Mour...",0.926921,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,39.99,USD,1972-2021,Wine Name: Bandol; Wine Type: Red; Wine Elabor...
1,111791,Bandol Rouge,Red,Varietal/100%,[Mourvedre],[Beef],13.6,Full-bodied,Medium,FR,...,https://cdn.klwines.com/images/skus/1088800x.jpg,https://shop.klwines.com/products/details/1088800,"... Bandol Blanc, or the distinctive cuvées of...",0.917125,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,39.99,USD,1969-2021,Wine Name: Bandol Rouge; Wine Type: Red; Wine ...
2,112029,Bandol,Red,Varietal/100%,[Mourvedre],[Beef],13.5,Full-bodied,Medium,FR,...,https://cdn.klwines.com/images/skus/1546145x.jpg,https://shop.klwines.com/products/details/1546145,In the early '70s Michel and Louis Bronzo acqu...,0.888006,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,24.99,USD,1969-2021,Wine Name: Bandol; Wine Type: Red; Wine Elabor...
3,112169,Madiran,Red,Varietal/100%,[Tannat],[Beef],14.0,Medium-bodied,Medium,FR,...,https://cdn.klwines.com/images/skus/1323833x.jpg,https://shop.klwines.com/products/details/1323833,This is the home estate of Madiran superstar A...,0.933378,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,19.99,USD,"1972-2022, N.V.",Wine Name: Madiran; Wine Type: Red; Wine Elabo...
4,112740,Cuvée Prestige Madiran,Red,Varietal/100%,[Tannat],[Beef],15.0,Full-bodied,Medium,FR,...,https://cdn.klwines.com/images/skus/1713664x.jpg,https://shop.klwines.com/products/details/1713664,2015 Château Montus Pacherenc du Vic-Bilh Blan...,0.856064,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,17.99,USD,1968-2020,Wine Name: Cuvée Prestige Madiran; Wine Type: ...


### Show all columns in the product catalog

In [None]:

pprint.pp(catalog.columns.tolist())

['WineID',
 'WineName',
 'Type',
 'Elaborate',
 'Grapes',
 'Harmonize',
 'ABV',
 'Body',
 'Acidity',
 'Code',
 'Country',
 'RegionID',
 'RegionName',
 'WineryID',
 'WineryName',
 'Website',
 'Vintages',
 'AverageReviewScore',
 'TotalReviews',
 'query',
 'title',
 'review',
 'image',
 'url',
 'snippet',
 'scrape_match',
 'full_response',
 'size',
 'price',
 'priceCurrency',
 'VintageSummary',
 'product_embed_description']


In [None]:
pprint.pp(click_train.columns.tolist())

['query', 'results', 'new_labels']


In [None]:
display(click_train.head())

Unnamed: 0,query,results,new_labels
12039,italian red wine pairings for game meat,"[135856, 135834, 135833, 136365, 135999, 13761...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7939,vinho de portugal red wine,"[155465, 102122, 102127, 101576, 101682]","[0, 0, 0, 0, 1]"
1604,white wines from venica & venica winery,"[101886, 193520, 174403, 137037, 137392]","[0, 0, 0, 1, 0]"
4283,$50-$75 red wine for special occasion,"[174211, 179528, 112116, 112007, 111614]","[0, 0, 0, 0, 1]"
321,yalumba cabernet sauvignon shiraz,"[174195, 111413, 174186, 174295, 180111, 174279]","[0, 0, 0, 0, 0, 1]"


In [None]:
pprint.pp(f"Number of items in the new_labels column for row 0: {len(click_train.iloc[[0]]['new_labels'].to_list()[0])}")

pprint.pp(f"Number of items in the results column for row 0: {len(click_train.iloc[[0]]['results'].to_list()[0])}")

def or_update(row, offending_rows):
    if len(row.new_labels) != len(row.results):
        offending_rows.append(row)
    elif sum(row.new_labels) != 1:
        offending_rows.append(row)

offending_rows_tr = list()

click_train.apply(lambda row: or_update(row, offending_rows_tr), axis=1)
pprint.pp(f'Offending rows in the training dataset: {offending_rows_tr}')



offending_rows_ts = list()
click_test.apply(lambda row: or_update(row, offending_rows_ts), axis=1)
pprint.pp(f'Offending rows in the test dataset: {offending_rows_ts}')


offending_rows_tr_new = list()

click_train_new.apply(lambda row: or_update(row, offending_rows_tr_new), axis=1)
pprint.pp(f'Offending rows in the training dataset: {offending_rows_tr_new}')


offending_rows_ts_new = list()
click_test.apply(lambda row: or_update(row, offending_rows_ts_new), axis=1)
pprint.pp(f'Offending rows in the test dataset: {offending_rows_ts_new}')


'Number of items in the new_labels column for row 0: 31'
'Number of items in the results column for row 0: 31'
'Offending rows in the training dataset: []'
'Offending rows in the test dataset: []'
'Offending rows in the training dataset: []'
'Offending rows in the test dataset: []'


In [None]:
def click_update(row, clicked_wine_ids, wine_ids_per_query, query_results_counts, query_strings):
    indices = np.where(row.new_labels == 1)
    #print(row.results[indices[0][0]])
    wine_id = int(row.results[indices[0][0]])
    if wine_id in clicked_wine_ids:
        clicked_wine_ids[wine_id] += 1
    else:
        clicked_wine_ids[wine_id] = 1
    wine_ids_per_query.append(len(row.results))
    result_set = frozenset(row.results)
    if result_set in query_results_counts:
        query_results_counts[result_set] += 1
    else:
        query_results_counts[result_set] = 1
    query_strings.add(row.query)

print('Details on the original dataset (train + test):')
query_strings_tr = set()
query_results_counts_tr = dict()
wine_ids_per_query_tr = list()
clicked_wine_ids_tr = dict()
click_train.apply(lambda row: click_update(row, clicked_wine_ids_tr, wine_ids_per_query_tr, query_results_counts_tr, query_strings_tr), axis=1)
pprint.pp(f'Number of clicked unique wine IDs in the training dataset: {len(clicked_wine_ids_tr)}')
pprint.pp(f'Number of unique query results in the training dataset: {len(query_results_counts_tr)}')
pprint.pp(f'Number of unique query strings in the training dataset: {len(query_strings_tr)}')

query_strings_ts = set()
query_results_counts_ts = dict()
wine_ids_per_query_ts = list()
clicked_wine_ids_ts = dict()
click_test.apply(lambda row: click_update(row, clicked_wine_ids_ts, wine_ids_per_query_ts, query_results_counts_ts, query_strings_ts), axis=1)
pprint.pp(f'Number of clicked unique wine IDs in the test dataset: {len(clicked_wine_ids_ts)}')
pprint.pp(f'Number of unique query results in the testing dataset: {len(query_results_counts_ts)}')
pprint.pp(f'Number of unique query strings in the testing dataset: {len(query_strings_ts)}')

cnt_tr_query_str_not_in_ts =0
for query_str in query_strings_tr:
    if query_str not in query_strings_ts:
        cnt_tr_query_str_not_in_ts += 1
pprint.pp(f'Number of unique query strings in the training dataset that are not in the test dataset: {cnt_tr_query_str_not_in_ts}')

cnt_ts_query_str_not_in_tr =0
for query_str in query_strings_ts:
    if query_str not in query_strings_tr:
        cnt_ts_query_str_not_in_tr += 1
pprint.pp(f'Number of unique query strings in the testing dataset that are not in the training dataset: {cnt_ts_query_str_not_in_tr}')



print('\nDetails on the newly resampled dataset (train_new + test_new):')
query_strings_tr_new = set()
query_results_counts_tr_new = dict()
wine_ids_per_query_tr_new = list()
clicked_wine_ids_tr_new = dict()
click_train_new.apply(lambda row: click_update(row, clicked_wine_ids_tr_new, wine_ids_per_query_tr_new, query_results_counts_tr_new, query_strings_tr_new), axis=1)
pprint.pp(f'Number of clicked unique wine IDs in the newly resampled training dataset: {len(clicked_wine_ids_tr_new)}')
pprint.pp(f'Number of unique query results in the newly resampled training dataset: {len(query_results_counts_tr_new)}')
pprint.pp(f'Number of unique query strings in the newly resampled training dataset: {len(query_strings_tr_new)}')

query_strings_ts_new = set()
query_results_counts_ts_new = dict()
wine_ids_per_query_ts_new = list()
clicked_wine_ids_ts_new = dict()
click_test_new.apply(lambda row: click_update(row, clicked_wine_ids_ts_new, wine_ids_per_query_ts_new, query_results_counts_ts_new, query_strings_ts_new), axis=1)
pprint.pp(f'Number of clicked unique wine IDs in the newly resampled test dataset: {len(clicked_wine_ids_ts_new)}')
pprint.pp(f'Number of unique query results in the newly resampled testing dataset: {len(query_results_counts_ts_new)}')
pprint.pp(f'Number of unique query strings in the newly resampled testing dataset: {len(query_strings_ts_new)}')


Details on the original dataset (train + test):
'Number of clicked unique wine IDs in the training dataset: 2543'
'Number of unique query results in the training dataset: 12913'
'Number of unique query strings in the training dataset: 12325'
'Number of clicked unique wine IDs in the test dataset: 1812'
'Number of unique query results in the testing dataset: 3232'
'Number of unique query strings in the testing dataset: 3163'
('Number of unique query strings in the training dataset that are not in the '
 'test dataset: 12144')
('Number of unique query strings in the testing dataset that are not in the '
 'training dataset: 2982')

Details on the newly resampled dataset (train_new + test_new):
('Number of clicked unique wine IDs in the newly resampled training dataset: '
 '2499')
'Number of unique query results in the newly resampled training dataset: 9691'
'Number of unique query strings in the newly resampled training dataset: 9285'
'Number of clicked unique wine IDs in the newly resamp

In [None]:
import statistics

def calculate_stats(data):
    """Calculates the mean, standard deviation, and variance of a list of numbers.

    Args:
        data: A list of numerical data.

    Returns:
        A dictionary containing the mean, standard deviation, and variance.
    """
    if not data:
        return {"mean": None, "stdev": None, "variance": None}

    mean = statistics.mean(data)
    stdev = statistics.stdev(data)
    variance = statistics.variance(data)

    return {"mean": mean, "stdev": stdev, "variance": variance}

# stats about the clicked wine id
stats_clicked_wine_id_tr = calculate_stats(list(clicked_wine_ids_tr.values()))
pprint.pp(f'Average number of clicks for a wine id in the training dataset: {stats_clicked_wine_id_tr["mean"]}')
pprint.pp(f'Standard Deviation of the number of clicks for a wine id in the training dataset: {stats_clicked_wine_id_tr["stdev"]}')
pprint.pp(f'Variance of the number of clicks for a wine id in the training dataset: {stats_clicked_wine_id_tr["variance"]}')

stats_clicked_wine_id_ts = calculate_stats(list(clicked_wine_ids_ts.values()))
pprint.pp(f'Average number of clicks for a wine id in the test dataset: {stats_clicked_wine_id_ts["mean"]}')
pprint.pp(f'Standard Deviation of the number of clicks for a wine id in the test dataset: {stats_clicked_wine_id_ts["stdev"]}')
pprint.pp(f'Variance of the number of clicks for a wine id in the test dataset: {stats_clicked_wine_id_ts["variance"]}')

stats_clicked_wine_id_tr_new = calculate_stats(list(clicked_wine_ids_tr_new.values()))
pprint.pp(f'Average number of clicks for a wine id in the newly resampled training dataset: {stats_clicked_wine_id_tr_new["mean"]}')
pprint.pp(f'Standard Deviation of the number of clicks for a wine id in the newly resampled training dataset: {stats_clicked_wine_id_tr_new["stdev"]}')
pprint.pp(f'Variance of the number of clicks for a wine id in the newly resampled training dataset: {stats_clicked_wine_id_tr_new["variance"]}')

stats_clicked_wine_id_ts_new = calculate_stats(list(clicked_wine_ids_ts_new.values()))
pprint.pp(f'Average number of clicks for a wine id in the newly resampled test dataset: {stats_clicked_wine_id_ts_new["mean"]}')
pprint.pp(f'Standard Deviation of the number of clicks for a wine id in the newly resampled test dataset: {stats_clicked_wine_id_ts_new["stdev"]}')
pprint.pp(f'Variance of the number of clicks for a wine id in the newly resampled test dataset: {stats_clicked_wine_id_ts_new["variance"]}')

# stats about the number of wine ids per query
stats_wine_ids_per_query_tr = calculate_stats(list(wine_ids_per_query_tr))
pprint.pp(f'Average number of wine ids per query in the training dataset: {stats_wine_ids_per_query_tr["mean"]}')
pprint.pp(f'Standard Deviation of the number of wine ids per query in the training dataset: {stats_wine_ids_per_query_tr["stdev"]}')
pprint.pp(f'Variance of the number of wine ids per query in the training dataset: {stats_wine_ids_per_query_tr["variance"]}')

stats_wine_ids_per_query_ts = calculate_stats(list(wine_ids_per_query_ts))
pprint.pp(f'Average number of wine ids per query in the test dataset: {stats_wine_ids_per_query_ts["mean"]}')
pprint.pp(f'Standard Deviation of the number of wine ids per query in the test dataset: {stats_wine_ids_per_query_ts["stdev"]}')
pprint.pp(f'Variance of the number of wine ids per query in the test dataset: {stats_wine_ids_per_query_ts["variance"]}')

stats_wine_ids_per_query_tr_new = calculate_stats(list(wine_ids_per_query_tr_new))
pprint.pp(f'Average number of wine ids per query in the newly resampled training dataset: {stats_wine_ids_per_query_tr_new["mean"]}')
pprint.pp(f'Standard Deviation of the number of wine ids per query in the newly resampled training dataset: {stats_wine_ids_per_query_tr_new["stdev"]}')
pprint.pp(f'Variance of the number of wine ids per query in the newly resampled training dataset: {stats_wine_ids_per_query_tr_new["variance"]}')

stats_wine_ids_per_query_ts_new = calculate_stats(list(wine_ids_per_query_ts_new))
pprint.pp(f'Average number of wine ids per query in the newly resampled test dataset: {stats_wine_ids_per_query_ts_new["mean"]}')
pprint.pp(f'Standard Deviation of the number of wine ids per query in the newly resampled test dataset: {stats_wine_ids_per_query_ts_new["stdev"]}')
pprint.pp(f'Variance of the number of wine ids per query in the newly resampled test dataset: {stats_wine_ids_per_query_ts_new["variance"]}')


('Average number of clicks for a wine id in the training dataset: '
 '5.085332284703107')
('Standard Deviation of the number of clicks for a wine id in the training '
 'dataset: 2.935745334066074')
('Variance of the number of clicks for a wine id in the training dataset: '
 '8.618600666490726')
'Average number of clicks for a wine id in the test dataset: 1.784216335540839'
('Standard Deviation of the number of clicks for a wine id in the test '
 'dataset: 1.0687645393811398')
('Variance of the number of clicks for a wine id in the test dataset: '
 '1.1422576406385798')
('Average number of clicks for a wine id in the newly resampled training '
 'dataset: 3.881152460984394')
('Standard Deviation of the number of clicks for a wine id in the newly '
 'resampled training dataset: 2.365323808221432')
('Variance of the number of clicks for a wine id in the newly resampled '
 'training dataset: 5.594756717739137')
('Average number of clicks for a wine id in the newly resampled test dataset: '


In [None]:
wine_names = catalog['WineName'].unique().tolist()
wine_ids = catalog['WineID'].unique().tolist()
NUM_OF_WINE_IDS = len(wine_ids)
NUM_OF_WINE_NAMES = len(wine_names)
pprint.pp(f"Number of unique labels: {NUM_OF_WINE_NAMES}")
pprint.pp(f"Number of unique wine ids: {len(wine_ids)}")
pprint.pp(f"Wine names: {wine_names[1:6]}")
pprint.pp(f"Wine IDs: {wine_ids[1:6]}")


'Number of unique labels: 1553'
'Number of unique wine ids: 2600'
("Wine names: ['Bandol Rouge', 'Madiran', 'Cuvée Prestige Madiran', 'Palette "
 "Rouge', '10 Year Old Tawny Port']")
'Wine IDs: [111791, 112029, 112169, 112740, 112763]'


In [None]:
from itertools import islice

def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))


id2wineid={id:wine_id for id,wine_id in enumerate(wine_ids)}

wineid2id={wine_id:id for id,wine_id in enumerate(wine_ids)}

pprint.pp(take(5,wineid2id.items()))

[(111654, 0), (111791, 1), (112029, 2), (112169, 3), (112740, 4)]


In [None]:
df=catalog[['WineName',"WineID", "product_embed_description"]]
def update(row, wineid2winename, wineid2ped):
    wineid2winename.update({row.WineID: row.WineName})
    wineid2ped.update({row.WineID: row.product_embed_description})

# create wineid2winename and wineid2ped dict

wineid2winename = dict()
wineid2ped = dict()

df.apply(lambda row: update(row, wineid2winename, wineid2ped), axis=1)
pprint.pp(f"WineID to WineName dict (5 items): {take(5, wineid2winename.items())}")

pprint.pp(f"Number of unique WineIDs in the catalog: {len(set(list(wineid2ped.keys())))}")
pprint.pp(f"Number of unique product_embed_descriptions in the catalog: {len(set(list(wineid2ped.values())))}")
pprint.pp(f"Number of unique wine names in the catalog: {len(set(list(wineid2winename.values())))}")

wine_ids_from_data = list(wineid2winename.keys())

NUM_OF_WINE_IDS_FROM_DATA = len(wine_ids_from_data)
pprint.pp(f"Number of wine ids from data: {NUM_OF_WINE_IDS_FROM_DATA}")


("WineID to WineName dict (5 items): [(111654, 'Bandol'), (111791, 'Bandol "
 "Rouge'), (112029, 'Bandol'), (112169, 'Madiran'), (112740, 'Cuvée Prestige "
 "Madiran')]")
'Number of unique WineIDs in the catalog: 2600'
'Number of unique product_embed_descriptions in the catalog: 2600'
'Number of unique wine names in the catalog: 1553'
'Number of wine ids from data: 2600'


In [None]:
# use wine id of the clicked wine as label in the training , test and validation data

def add_label(row):
    indices = np.where(row.new_labels == 1)
    wine_id = int(row.results[indices[0][0]])
    row['label'] = wineid2id[wine_id]
    return row

click_train = click_train.apply(lambda row: add_label(row), axis=1)
click_test = click_test.apply(lambda row: add_label(row), axis=1)


click_train_new = click_train_new.apply(lambda row: add_label(row), axis=1)
click_test_new = click_test_new.apply(lambda row: add_label(row), axis=1)
click_validate_new = click_validate_new.apply(lambda row: add_label(row), axis=1)

### import the necessary components for the fine tuning of RoBERTa base model

In [None]:
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from datasets import Dataset as DDataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding
)
from huggingface_hub import HfFolder, notebook_login

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

Alternative Solution - load the RoBERTa base directly

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_id = "roberta-base"
dataset_id = "wines"
# relace the value with your model: ex <hugging-face-user>/<model-name>
repository_id = "dimitarpg13/roberta-finetuned-wines"


In [None]:

# Load dataset


train_dataset = DDataset.from_pandas(click_train)
test_dataset = DDataset.from_pandas(click_test)
val_dataset = DDataset.from_pandas(click_validate_new)


train_dataset



Dataset({
    features: ['query', 'results', 'new_labels', 'label', '__index_level_0__'],
    num_rows: 12932
})

In [None]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = RobertaForSequenceClassification.from_pretrained(model_id, output_hidden_states=True, num_labels=NUM_OF_WINE_IDS, id2label=id2wineid, label2id=wineid2id)
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
train_texts = list(click_train['query'])
val_texts = list(click_validate_new['query'])
test_texts = list(click_test['query'])

train_labels = list(click_train['label'])
val_labels = list(click_validate_new['label'])
test_labels = list(click_test['label'])

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings  = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)


In [None]:
class DataLoader(Dataset):
    """
    Custom Dataset class for handling tokenized text data and corresponding labels.
    Inherits from torch.utils.data.Dataset.
    """
    def __init__(self, encodings, labels):
        """
        Initializes the DataLoader class with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized input text data
                              (e.g., 'input_ids', 'token_type_ids', 'attention_mask').
            labels (list): A list of integer labels for the input text data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a dictionary containing tokenized data and the corresponding label for a given index.

        Args:
            idx (int): The index of the data item to retrieve.

        Returns:
            item (dict): A dictionary containing the tokenized data and the corresponding label.
        """
        # Retrieve tokenized data for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the label for the given index to the item dictionary
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Returns the number of data items in the dataset.

        Returns:
            (int): The number of data items in the dataset.
        """
        return len(self.labels)


In [None]:
train_dataloader = DataLoader(train_encodings, train_labels)

val_dataloader = DataLoader(val_encodings, val_labels)

test_dataset = DataLoader(test_encodings, test_labels)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.

    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of
              that observation belonging to a certain class.

    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids


    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions[0].argmax(-1)

    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=1)

    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)
    #from IPython.core.debugger import Pdb; Pdb().set_trace()

    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


In [None]:

# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=150,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="epoch",
    logging_steps=1,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=5,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    fp16=True,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)


def custom_loss_fct(model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.get("logits") # the output embeddings
    #TODO: finish the expression for the loss and return loss, outputs tuple

def nll_loss(logits, labels):
    return torch.nn.functional.nll_loss(logits, labels)

# Define a custom Trainer with the loss function
class CustomTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            #from IPython.core.debugger import Pdb; Pdb().set_trace()
            labels = inputs.get("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.get("logits") # the output embeddings

            # TODO: compute custom loss by using weights for the label set
            loss_fct = torch.nn.CrossEntropyLoss(weight=None)
            #loss_fct = nll_loss
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

            return (loss, outputs) if return_outputs else loss


# Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)





In [None]:
# fine tune the model
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,7.8629,7.852528,0.000928,2e-06,0.998345,0.000414
2,7.7872,7.798197,0.00464,0.000165,0.979756,0.00481
3,7.6725,7.696985,0.007114,0.000326,0.971621,0.006832
4,7.5418,7.58613,0.012063,0.001003,0.959039,0.012725
5,7.3997,7.468987,0.016084,0.002633,0.947328,0.01923
6,7.2604,7.349889,0.021033,0.00355,0.944539,0.021705
7,7.1155,7.236616,0.023198,0.004224,0.936378,0.028672
8,6.9688,7.121903,0.028147,0.005537,0.934138,0.031457
9,6.824,7.015456,0.028457,0.005551,0.945043,0.027902
10,6.6814,6.904967,0.032168,0.007164,0.93299,0.033892


TrainOutput(global_step=60750, training_loss=3.1731432693544237, metrics={'train_runtime': 4505.7533, 'train_samples_per_second': 430.516, 'train_steps_per_second': 13.483, 'total_flos': 3.46832018374464e+16, 'train_loss': 3.1731432693544237, 'epoch': 150.0})

In [None]:
trainer.save_model(repository_id)

events.out.tfevents.1745467038.a382b1389962.5684.0:   0%|          | 0.00/211k [00:00<?, ?B/s]

In [None]:
# evaluate the fine tuning model with the validation and test datasets

q=[trainer.evaluate(eval_dataset=df_org) for df_org in [train_dataloader, val_dataloader, test_dataset]]

pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]

Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,2.270546,0.668883,0.572447,0.883947,0.567345
val,2.896829,0.554593,0.445551,0.8004,0.539623
test,5.338786,0.096814,0.061375,0.606485,0.197435


In [None]:
def predict(text):
    """
    Predicts the class label for a given input text

    Args:
        text (str): The input text for which the class label needs to be predicted.

    Returns:
        probs (torch.Tensor): Class probabilities for the input text.
        pred_label_idx (torch.Tensor): The index of the predicted class label.
        pred_label (str): The predicted class label.
    """
    # Tokenize the input text and move tensors to the GPU if available
    # TODO: check if max_length should be really 514
    inputs = tokenizer(text, padding=True, truncation=True, max_length=514, return_tensors="pt").to("cuda")

    # Get model output (logits)
    outputs = model(**inputs)

    probs = outputs[0].softmax(1)
    """ Explanation outputs: The BERT model returns a tuple containing the output logits (and possibly other elements depending on the model configuration). In this case, the output logits are the first element in the tuple, which is why we access it using outputs[0].

    outputs[0]: This is a tensor containing the raw output logits for each class. The shape of the tensor is (batch_size, num_classes) where batch_size is the number of input samples (in this case, 1, as we are predicting for a single input text) and num_classes is the number of target classes.

    softmax(1): The softmax function is applied along dimension 1 (the class dimension) to convert the raw logits into class probabilities. Softmax normalizes the logits so that they sum to 1, making them interpretable as probabilities. """

    # Get the index of the class with the highest probability
    # argmax() finds the index of the maximum value in the tensor along a specified dimension.
    # By default, if no dimension is specified, it returns the index of the maximum value in the flattened tensor.
    pred_label_idx = probs.argmax()

    # Now map the predicted class index to the actual class label
    # Since pred_label_idx is a tensor containing a single value (the predicted class index),
    # the .item() method is used to extract the value as a scalar
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [None]:
# Test with an example query 1 (see the document)
text = "cabernet sauvignon shiraz"

res=predict(text)

In [None]:
res[0].argmax(-1)

tensor([447], device='cuda:0')

In [None]:
id2wineid[447]

174264

In [None]:
catalog[catalog['WineID'] == 174264]

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,...,image,url,snippet,scrape_match,full_response,size,price,priceCurrency,VintageSummary,product_embed_description
447,174264,Max's Shiraz-Cabernet,Red,Varietal/100%,[Syrah/Shiraz],"[Beef, Lamb, Poultry]",14.5,Very full-bodied,High,AU,...,https://cdn.klwines.com/images/skus/1497364x.jpg,https://shop.klwines.com/products/details/1497364,A blend of 57% Cabernet Sauvignon and 43% Shir...,0.908263,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,69.99,USD,1951-2021,Wine Name: Max's Shiraz-Cabernet; Wine Type: R...


In [None]:
s=catalog[catalog['WineID'] == 174264]['product_embed_description'].iloc[0]
pprint.pp(s)

("Wine Name: Max's Shiraz-Cabernet; Wine Type: Red; Wine Elaborate: "
 'Varietal/100%; Grape Sources: Syrah/Shiraz; Wine Harmonize: Beef, Lamb, '
 'Poultry; Alcohol By Volume: 14.5; Wine Body: Very full-bodied; Wine Acidity: '
 'High; Bottle Size: 750ml Wine; Price Currency: USD; Price: 69.99; Country of '
 'Origin: Australia; Region of Origin: South Australia; Winery Name: Penfolds; '
 'Vintage Options: 1951-2021; Average Review Score: 3.948; Professional '
 'Review: A blend of 57% Cabernet Sauvignon and 43% Shiraz, this has a very '
 'impressively complete feel, a hallmark of the 2018 vintage wines, and '
 'there’s a myriad of characters with cabernet’s cedary and gently herbal '
 'notes sitting atop a core of rich red-plum and dark-berry Shiraz fruit '
 'aromas. So integrated. The palate has a very silky texture, so plush and '
 'polished with a wealth of rich and intense dark-plum, dark-berry and '
 'blackcurrant flavors. The oak is completely soaked with ripe, fresh fruit. '
 'Thi

In [None]:
s=catalog[catalog['WineID'] == 174264]['full_response'].iloc[0]
pprint.pp(s)

("{'kind': 'customsearch#search', 'url': {'type': 'application/json', "
 "'template': "
 "'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, "
 '\'queries\': {\'request\': [{\'title\': "Google Custom Search - '
 'site:https://www.klwines.com/ Penfolds Max\'s Shiraz-Cabernet", '
 '\'totalResults\': \'44\', \'searchTerms\': "site:https://www.klwines.com/ '
 'Penfolds Max\'s Shiraz-Cabernet

In [None]:
# Test with an example query 2 (see the document)
text = "$5-$10 italian red wine"


In [None]:
res=predict(text)

In [None]:
res

(tensor([[4.3280e-05, 1.3565e-05, 5.7085e-05,  ..., 9.3430e-05, 1.5527e-04,
          1.2872e-03]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 tensor(230, device='cuda:0'),
 135863)

In [None]:
id2wineid[230]

135863

In [None]:
catalog[catalog['WineID'] == 135863]

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,...,image,url,snippet,scrape_match,full_response,size,price,priceCurrency,VintageSummary,product_embed_description
230,135863,Lucente,Red,Varietal/100%,[Merlot],"[Beef, Lamb, Veal, Poultry, Cured Meat]",14.5,Medium-bodied,Medium,IT,...,https://cdn.klwines.com/images/skus/genericred...,https://shop.klwines.com/products/details/1699339,"Both rich and salty, this red expresses black ...",0.904812,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,14.99,USD,1950-2021,Wine Name: Lucente; Wine Type: Red; Wine Elabo...


In [None]:
# Test with an example query 3 (see the document)
text = "very full-bodied red wines"
res=predict(text)

In [None]:
res

(tensor([[1.8362e-03, 2.2105e-03, 2.0396e-04,  ..., 2.7422e-05, 2.0719e-05,
          6.1856e-05]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 tensor(579, device='cuda:0'),
 111547)

In [None]:
catalog[catalog['WineID'] == 111547]

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,...,image,url,snippet,scrape_match,full_response,size,price,priceCurrency,VintageSummary,product_embed_description
579,111547,Margaux,Red,Assemblage/Blend,"[Cabernet Sauvignon, Cabernet Franc, Merlot, P...","[Beef, Lamb, Game Meat, Poultry]",14.0,Full-bodied,High,FR,...,https://cdn.klwines.com/images/skus/1388690x.jpg,https://shop.klwines.com/products/details/1388690,"2015 Labégorce, Margaux. 95WE. 94JD ... A bloc...",0.905995,"{'kind': 'customsearch#search', 'url': {'type'...",750ml Wine,49.99,USD,1959-2021,Wine Name: Margaux; Wine Type: Red; Wine Elabo...


In [None]:
s=catalog[catalog['WineID'] == 111547]['product_embed_description'].iloc[0]
pprint.pp(s)

('Wine Name: Margaux; Wine Type: Red; Wine Elaborate: Assemblage/Blend; Grape '
 'Sources: Cabernet Sauvignon, Cabernet Franc, Merlot, Petit Verdot; Wine '
 'Harmonize: Beef, Lamb, Game Meat, Poultry; Alcohol By Volume: 14; Wine Body: '
 'Full-bodied; Wine Acidity: High; Bottle Size: 750ml Wine; Price Currency: '
 'USD; Price: 49.99; Country of Origin: France; Region of Origin: Margaux; '
 'Winery Name: Château Labégorce; Vintage Options: 1959-2021; Average Review '
 'Score: 4.075; Professional Review: There is a tough, tannic core to this '
 'otherwise ripe and fruity wine. All to the good, as this will allow the '
 'bold, ripe wine to age well. Black-currant fruits are already showing their '
 "hand and will intensify, bringing in richness and a dense texture. *Editors' "
 'Choice* (RV)')


In [None]:
s=catalog[catalog['WineID'] == 111547]['snippet'].iloc[0]
print(s)

2015 Labégorce, Margaux. 95WE. 94JD ... A blockbuster from Margaux is the 2015 Château Labégorce and I was blown away by this beauty.


In [None]:
s=catalog[catalog['WineID'] == 111547]['full_response'].iloc[0]
pprint.pp(s)

("{'kind': 'customsearch#search', 'url': {'type': 'application/json', "
 "'template': "
 "'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'}, "
 "'queries': {'request': [{'title': 'Google Custom Search - "
 "site:https://www.klwines.com/ Château Labégorce Margaux', 'totalResults': "
 "'748', 'searchTerms': 'site:https://www.klwines.com/ Château Labégorce "
 "Margaux', 'count': 1, 'startIn

In [None]:
# evaluate the model loading it from my HuggingFace account

model2 = AutoModelForSequenceClassification.from_pretrained(repository_id, output_hidden_states=True, num_labels=NUM_OF_WINE_IDS, id2label=id2wineid, label2id=wineid2id)
model2.to(device)

trainer = CustomTrainer(
    model=model2,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics=compute_metrics,
)


In [None]:
q=[trainer.evaluate(eval_dataset=df_org) for df_org in [train_dataloader, val_dataloader, test_dataset]]

pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]

Unnamed: 0,eval_loss,eval_model_preparation_time,eval_Accuracy,eval_F1,eval_Precision
train,2.270546,0.0032,0.668883,0.572447,0.883947
val,2.896829,0.0032,0.554593,0.445551,0.8004
test,5.338786,0.0032,0.096814,0.061375,0.606485
