In [None]:
import numpy as np
import pandas as pd
import sagemaker
import torch

from datasets import Dataset
from datasets.filesystems import S3FileSystem
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TextClassificationPipeline
)
from sagemaker.huggingface import HuggingFace
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# -------------------------------------
# Display
# -------------------------------------
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Do not truncate long cell contents when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

# Load the data and split into a training and test dataset

In [None]:
# Score thresholds and the rating band assigned to each interval.
# pd.cut closes intervals on the right: (0, 87], (87, 94], (94, inf].
bins = [0, 87, 94, np.inf]
names = ["neutral", "good", "excellent"]

# Read the raw reviews and attach the categorical "rating" column in one step.
wine_df = pd.read_csv("data/wine_data.csv")
wine_df = wine_df.assign(
    rating=pd.cut(wine_df["points"], bins=bins, labels=names)
)

In [None]:
# Column roles used by the rest of the notebook.
TEXT_FEATURE = "description"
NUMERICAL_FEATURE = "price"
CATEGORICAL_FEATURE = "variety"
TARGET = "rating"

# The order here is the order in which the columns are later joined into text.
FEATURES = [TEXT_FEATURE, NUMERICAL_FEATURE, CATEGORICAL_FEATURE]

# Keep only the model inputs and the target column.
wine_df = wine_df.loc[:, [*FEATURES, TARGET]]

In [None]:
# Fixed seed so the 80/20 split (and therefore the reported accuracy) is
# reproducible across kernel restarts.
RANDOM_STATE = 42
train_df, test_df = train_test_split(
    wine_df,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Preprocessing
- Generate the text input by joining the feature columns with `[SEP]`
- Tokenize the text data

In [None]:
# Integer-encode the rating bands for the classifier.
# LabelEncoder sorts its classes, so the ids are alphabetical:
# excellent -> 0, good -> 1, neutral -> 2.
TARGET_CATEGORIES = ["neutral", "good", "excellent"]
le = LabelEncoder().fit(TARGET_CATEGORIES)

# .assign builds a new frame instead of writing into the slice returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(labels=le.transform(train_df[TARGET]))

def generate_text_input(df, features=None):
    """Return a copy of ``df`` with a ``text`` column built from ``features``.

    Each feature column is converted to strings, missing values become empty
    strings, and the columns are joined row-wise with ``" [SEP] "`` in the
    order given by ``features``.

    Args:
        df: Frame containing every column named in ``features``.
        features: Column names to combine. Defaults to the notebook-level
            ``FEATURES`` list when omitted.

    Returns:
        A new DataFrame; the input frame is left untouched.

    Raises:
        ValueError: If ``features`` is empty.
    """
    features = list(FEATURES if features is None else features)
    if not features:
        raise ValueError("features must contain at least one column name")

    frame = df.copy()

    # Record missing cells *before* the string cast: astype(str) would turn
    # NaN/None into the literal text "nan"/"None", which fillna cannot undo.
    missing = frame[features].isna()
    as_text = frame[features].astype(str).mask(missing, "")

    for name in features:
        frame[name] = as_text[name]

    # Vectorised row-wise join; also works for empty frames.
    text = as_text[features[0]]
    for name in features[1:]:
        text = text + " [SEP] " + as_text[name]
    frame["text"] = text

    return frame

# Build the combined "text" column for the training rows.
train_df = generate_text_input(train_df)


In [None]:
MODEL_NAME = "distilbert-base-uncased"

def tokenized_pytorch_tensors(
        df: pd.DataFrame,
        column_list: list,
        text_column: str = "text",
        max_length: int = 120,
    ) -> Dataset:
    """Tokenize a text column and return a torch-formatted ``Dataset``.

    Args:
        df: Frame holding the text to tokenize plus any columns (e.g.
            ``labels``) that should be carried through.
        column_list: Columns to keep in the returned dataset and expose as
            torch tensors (tokenizer outputs and/or columns of ``df``).
        text_column: Name of the column in ``df`` that holds the text.
        max_length: Every sequence is truncated/padded to this many tokens.

    Returns:
        A ``Dataset`` containing only ``column_list``, formatted as torch
        tensors.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    if text_column not in df.columns:
        raise KeyError(
            f"text column {text_column!r} not found in {list(df.columns)}"
        )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    transformers_dataset = Dataset.from_pandas(df)

    def tokenize(model_inputs_batch: dict) -> dict:
        # padding="max_length" gives every row the same length. With
        # padding=True each map batch is padded to its own longest row, so
        # rows from different batches could not be stacked into one tensor.
        return tokenizer(
            model_inputs_batch[text_column],
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )

    tokenized_dataset = transformers_dataset.map(
        tokenize,
        batched=True,
        batch_size=128
    )

    tokenized_dataset.set_format(
        "torch",
        columns=column_list
    )

    # Drop everything the caller did not ask for (raw text, pandas index, ...).
    columns_to_remove = set(tokenized_dataset.column_names) - set(column_list)

    tokenized_dataset = tokenized_dataset.remove_columns(list(columns_to_remove))

    return tokenized_dataset

print("Tokenize text in Dataset of Pytorch tensors")

# Filesystem handle used to write the dataset to S3.
s3 = S3FileSystem()

# Only the combined text and the encoded labels go in; the tokenizer outputs
# and the labels come out.
tokenized_train_df = tokenized_pytorch_tensors(
    df=train_df[["text", "labels"]],
    column_list=["input_ids", "attention_mask", "labels"],
)

# Persist the tokenized dataset to the S3 location.
tokenized_train_df.save_to_disk("s3://path_to_training_data", fs=s3)


# Fine-tune DistilBERT

In [None]:
# IAM role the training job runs under.
ROLE = sagemaker.get_execution_role()

# Passed to the entry point (train.py).
hyperparameters = {
    "epochs": 1,
    "train_batch_size": 128,
    # Same checkpoint that was used to tokenize the training data.
    "model_name": MODEL_NAME,
}

huggingface_estimator = HuggingFace(
    entry_point="train.py",
    # The original string ended with a stray typographic quote, which made
    # the S3 URI invalid.
    source_dir="s3://path_to_training.tar.gz",
    output_path="s3://path_to_outputs",
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
    hyperparameters=hyperparameters,
    role=ROLE,
)

# Train on the dataset under the "train" channel.
huggingface_estimator.fit(
    {"train": "s3://path_to_training_data"}
)


# Evaluate the model

In [None]:
# Build the combined "text" column for the held-out rows (once is enough).
test_df = generate_text_input(test_df)

# Fine-tuned weights plus the tokenizer of the base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("path_to_model")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=False,  # keep only the top label per text
)

actual = test_df[TARGET].values
predictions = [
    pipe(text)
    for text in test_df["text"].values
]

# Assumes the model config uses the default "LABEL_<id>" label names -
# TODO confirm against the trained model's id2label mapping.
prediction_labels = [
    int(prediction[0]["label"].rsplit("_", 1)[-1])
    for prediction in predictions
]

# Map the integer ids back to "neutral" / "good" / "excellent".
decoded_predictions = le.inverse_transform(prediction_labels)
accuracy_score(actual, decoded_predictions)