# Requirements

In [21]:
!pip install transformers[torch]

Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.2.1-py3-none-any.whl (336 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.2.1


In [4]:
# Add as many imports as you need.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch




In [5]:
from datasets import load_dataset, Dataset

# Laboratory Exercise - Run Mode (8 points)

## Introduction
This laboratory assignment's primary objective is to fine-tune a pre-trained language model for binary classification on a dataset consisting of wine reviews. The dataset contains two attributes: **description** and **points**. The description is a brief text describing the wine and the points represent a quality metric ranging from 1 to 100. If some wine has at least 90 points it is considered **exceptional**. Your task involves predicting if some wine is **exceptional** based on its review.

## The Wine Reviews Dataset

Load the dataset using the `datasets` library.

In [6]:
# Write your code here. Add as many boxes as you need.
wine_reviews = load_dataset('csv', data_files='./data/wine-reviews.csv')

In [7]:
wine_reviews

DatasetDict({
    train: Dataset({
        features: ['description', 'points'],
        num_rows: 10000
    })
})

In [8]:
df = wine_reviews["train"].to_pandas()
df

Unnamed: 0,description,points
0,"Translucent in color, silky in the mouth, this...",85
1,"On the palate, this wine is rich and complex, ...",92
2,The producer blends 57% Chardonnay from the Ma...,92
3,"Pure Baga in all its glory, packed with dry an...",93
4,Think of Subsídio as a contribution rather tha...,89
...,...,...
9995,"From the folks at Merryvale, this is a blend o...",92
9996,"An easy, versatile food wine or cocktail sippe...",84
9997,Black-fruit aromas blend nicely into chocolate...,93
9998,"This combines 61% Cabernet Sauvignon, 14% Syra...",90


## Target Extraction
Extract the target **exceptional** for each wine review. If some wine has at least 90 points it is considered **exceptional**.

In [9]:
# Write your code here. Add as many boxes as you need.
df['exceptional'] = (df['points'] >= 90).astype(int)
df

Unnamed: 0,description,points,exceptional
0,"Translucent in color, silky in the mouth, this...",85,0
1,"On the palate, this wine is rich and complex, ...",92,1
2,The producer blends 57% Chardonnay from the Ma...,92,1
3,"Pure Baga in all its glory, packed with dry an...",93,1
4,Think of Subsídio as a contribution rather tha...,89,0
...,...,...,...
9995,"From the folks at Merryvale, this is a blend o...",92,1
9996,"An easy, versatile food wine or cocktail sippe...",84,0
9997,Black-fruit aromas blend nicely into chocolate...,93,1
9998,"This combines 61% Cabernet Sauvignon, 14% Syra...",90,1


In [10]:
wine_reviews["train"] = wine_reviews["train"].add_column("label", df['exceptional'])

In [11]:
wine_reviews

DatasetDict({
    train: Dataset({
        features: ['description', 'points', 'label'],
        num_rows: 10000
    })
})

## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.


In [12]:
wine_reviews = wine_reviews['train'].train_test_split(test_size=0.2)

In [13]:
wine_reviews

DatasetDict({
    train: Dataset({
        features: ['description', 'points', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['description', 'points', 'label'],
        num_rows: 2000
    })
})

## Tokenization
Tokenize the texts using the `AutoTokenizer` class.

In [14]:
# Write your code here. Add as many boxes as you need.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_function(examples):
    return tokenizer(
        examples["description"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

train_dataset = wine_reviews['train'].map(tokenize_function, batched=True)
test_dataset = wine_reviews['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [15]:
test_dataset

Dataset({
    features: ['description', 'points', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2000
})

## Fine-tuning a Pre-trained Language Model for Classification
Fine-tune a pre-trained language model for classification on the given dataset.

Define the model using the `AutoModelForSequenceClassification` class.

In [16]:
import torch

In [17]:
# Write your code here. Add as many boxes as you need.
# import torch
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define the traning parameters using the `TrainingArguments` class.

In [24]:
# Write your code here. Add as many boxes as you need.
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    metric_for_best_model="f1",
    report_to="none",
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

Define the training using the `Trainer` class.

In [None]:
# Write your code here. Add as many boxes as you need.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


Fine-tune (train) the pre-trained lanugage model.

In [None]:
# Write your code here. Add as many boxes as you need.
trainer.train()

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

Use the trained model to make predictions for the test set.

In [None]:
# Write your code here. Add as many boxes as you need.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [None]:
# Write your code here. Add as many boxes as you need.
predictions = trainer.predict(test_dataset)
predicted_labels = torch.argmax(torch.tensor(predictions.predictions), dim=1)

In [None]:
y_true = test_dataset["exceptional"]
print(classification_report(y_true, predicted_labels, target_names=["Not Exceptional", "Exceptional"]))

# Laboratory Exercise - Bonus Task (+ 2 points)

Implement a simple machine learning pipeline to classify wine reviews as **exceptional** or not. Use TF-IDF vectorization to convert text into numerical features and train a logistic regression. Split the dataset into training and testing sets, fit the pipeline on the training data, and evaluate its performance using metrics such as precision, recall, and F1-score. Analyze the texts to find the most influential words or phrases associated with the **exceptional** wines. Use the coefficients from the logistic regression trained on TF-IDF features to identify the top positive and negative keywords for **exceptional** wines. Present these keywords in a simple table or visualization (e.g., bar chart).

In [None]:
# Write your code here. Add as many boxes as you need.