In [None]:
# Install the two third-party packages imported below (lightning, transformers).
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases — consider pinning.
!pip -q install lightning transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m109.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.9/68.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive into the Colab runtime; model_path further down points inside this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import itertools
from multiprocessing import Pool
from typing import List, Tuple

import lightning as pl
import pandas as pd
import torch
import torch.nn.functional as F
import transformers
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm


# DATA

In [None]:
# Read the product catalogue and the category list from their raw GitHub URLs into DataFrames.
# NOTE(review): df_categories is not referenced again in the visible part of the notebook.
df_products = pd.read_json("https://raw.githubusercontent.com/anyoneai/e-commerce-open-data-set/master/products.json")
df_categories = pd.read_json("https://raw.githubusercontent.com/anyoneai/e-commerce-open-data-set/master/categories.json")

In [None]:
# Keep only the columns used downstream. .copy() gives an independent frame so the
# column assignment at the end of this cell never writes into a slice of another frame.
df_products = df_products[['name', 'category', 'description']].copy()

# Each product's 'category' is a list of dicts; count how often every category name occurs.
category_counts = df_products['category'].explode().apply(lambda x: x['name']).value_counts()

# Category names that occur fewer than `threshold` times are collapsed into 'Other'.
threshold = 100
filtered_category_counts = category_counts[category_counts >= threshold]
filtered_categories = filtered_category_counts.index.tolist()

# A set makes the membership test below constant-time; the list version rescanned
# every kept name for every category of every product.
frequent_category_names = set(filtered_categories)
df_products['category'] = df_products['category'].apply(
    lambda x: [cat['name'] if cat['name'] in frequent_category_names else 'Other' for cat in x]
)


In [None]:
# Spread each product's category path over fixed columns level_1 .. level_7;
# a path that is too short for a given depth gets the placeholder 'NA' there.
for depth in range(1, 8):
    df_products[f'level_{depth}'] = df_products['category'].apply(
        lambda path, pos=depth - 1: path[pos] if len(path) > pos else 'NA'
    )
df_products.drop(columns='category', inplace=True)

In [None]:
# For every level column, map each distinct label to an integer class index
# (indices follow the order in which labels first appear in the column).
categories = {}
for depth in range(1, 8):
    column = f"level_{depth}"
    distinct_labels = df_products[column].unique().tolist()
    categories[column] = {label: position for position, label in enumerate(distinct_labels)}

In [None]:
# Model input text: product name and description joined by a single space.
# tokenize_dataset later drops rows whose "text" is missing.
df_products["text"]= df_products["name"]+" "+df_products["description"] 

# MODEL



## Definition

In [None]:
# Width of the encoder output that feeds the classifier heads in BertModel.
BERT_EMBEDDING_SIZE = 768
# Number of processes in the multiprocessing Pool used by tokenize_dataset.
WORKERS = 8
# Hugging Face checkpoint name, used for both the tokenizer and the encoder.
NLP_MODEL_NAME = "bert-base-cased"
# Shared tokenizer instance used by encode().
TOKENIZER = transformers.AutoTokenizer.from_pretrained(NLP_MODEL_NAME)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
class BertModel(pl.LightningModule):
    """Hierarchical text classifier on top of a pretrained BERT encoder.

    The encoder output feeds seven linear heads, one per category level.
    The level-1 head sees only the text embedding; every deeper head sees the
    embedding concatenated with the softmax probabilities of the level above.

    Args:
        nlp_model (str): name or path given to ``transformers.BertModel.from_pretrained``.
        level_1_labels .. level_7_labels (int): number of classes at each level.
    """

    # Depth of the category hierarchy; one classifier head exists per level.
    NUM_LEVELS = 7

    def __init__(
        self,
        nlp_model: str,
        level_1_labels: int,
        level_2_labels: int,
        level_3_labels: int,
        level_4_labels: int,
        level_5_labels: int,
        level_6_labels: int,
        level_7_labels: int,
    ):
        super().__init__()

        self.text_encoder = transformers.BertModel.from_pretrained(nlp_model)

        label_counts = [
            level_1_labels,
            level_2_labels,
            level_3_labels,
            level_4_labels,
            level_5_labels,
            level_6_labels,
            level_7_labels,
        ]
        # Extra input width each head receives from its parent level's probabilities
        # (none for level 1).
        parent_widths = [0] + label_counts[:-1]

        # Heads are registered as level_<n>_classifier, in level order, so parameter
        # names in the state_dict are "level_<n>_classifier.0.weight" / ".bias".
        for depth, (parent_width, n_labels) in enumerate(
            zip(parent_widths, label_counts), start=1
        ):
            head = nn.Sequential(
                nn.Linear(BERT_EMBEDDING_SIZE + parent_width, n_labels),
            )
            setattr(self, f"level_{depth}_classifier", head)

        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        """Return a tuple with one logits tensor per level, from level 1 to level 7."""
        # With return_dict=False the encoder returns a tuple; element [1] is the
        # pooled sequence representation in Hugging Face's BertModel.
        x = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=False
        )[1]

        level_logits = []
        for depth in range(1, self.NUM_LEVELS + 1):
            if level_logits:
                # Condition this level on the previous level's class probabilities.
                head_input = torch.cat([x, F.softmax(level_logits[-1], dim=1)], dim=1)
            else:
                head_input = x
            level_logits.append(getattr(self, f"level_{depth}_classifier")(head_input))
        return tuple(level_logits)

    def training_step(self, batch, batch_idx):
        """Sum the cross-entropy of all levels and log per-level batch accuracy.

        ``batch`` is ``(input_ids, attention_mask, level_1_labels, ..., level_7_labels)``.
        """
        input_ids, attention_mask, *level_labels = batch
        level_logits = self(input_ids, attention_mask)

        # Every level contributes equally to the training objective.
        loss = sum(
            self.loss(logits, labels)
            for logits, labels in zip(level_logits, level_labels)
        )

        # Log loss and accuracy
        self.log("train_loss", loss)
        for depth, (logits, labels) in enumerate(
            zip(level_logits, level_labels), start=1
        ):
            # argmax on the raw logits, as eval_model does; softmax would not
            # change which class is largest.
            level_acc = (logits.argmax(dim=1) == labels).float().mean()
            self.log(
                f"l{depth}_acc",
                level_acc,
                prog_bar=True,
                on_step=True,
                on_epoch=False,
            )

        return loss

    def configure_optimizers(self):
        """AdamW over all parameters (encoder and heads) with a single learning rate."""
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=2e-5,
            weight_decay=0.01,
            eps=1e-8,
        )

        return optimizer

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenized DataFrame.

    Each item is ``(input_ids, attention_mask, level_1, ..., level_7)``: the two
    token columns as tensors, followed by the class index of the row's label at
    every level, looked up in that level's label -> index mapping.
    """

    def __init__(
        self,
        dataset,
        categories_level_1,
        categories_level_2,
        categories_level_3,
        categories_level_4,
        categories_level_5,
        categories_level_6,
        categories_level_7
    ):
        self.dataset = dataset

        per_level_mappings = (
            categories_level_1,
            categories_level_2,
            categories_level_3,
            categories_level_4,
            categories_level_5,
            categories_level_6,
            categories_level_7,
        )
        # Exposed as self.categories_level_1 .. self.categories_level_7.
        for depth, mapping in enumerate(per_level_mappings, start=1):
            setattr(self, f"categories_level_{depth}", mapping)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        record = self.dataset.iloc[idx]
        features = (
            torch.tensor(record["input_ids"]),
            torch.tensor(record["attention_mask"]),
        )
        # Translate the row's label at each level into its integer class index.
        targets = tuple(
            getattr(self, f"categories_level_{depth}")[record[f"level_{depth}"]]
            for depth in range(1, 8)
        )
        return features + targets

In [None]:
def tokenize_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    """Tokenize the "text" column and attach the result as two new columns.

    Rows whose "text" is missing are dropped first. The remaining texts are
    encoded in parallel with ``encode`` across ``WORKERS`` processes.

    Args:
        dataset (pd.DataFrame): frame with a "text" column

    Returns:
        pd.DataFrame: a new frame (the input is not modified) holding the rows
        with non-missing text plus the columns "input_ids" and "attention_mask"
    """
    # .copy() detaches the filtered rows from the caller's frame, so the column
    # assignments below do not raise pandas' SettingWithCopyWarning.
    dataset = dataset[dataset.text.notna()].copy()
    with Pool(WORKERS) as p:
        tokens_pairs = p.map(encode, dataset["text"].to_list())

    dataset["input_ids"] = [x[0] for x in tokens_pairs]
    dataset["attention_mask"] = [x[1] for x in tokens_pairs]

    return dataset

In [None]:
def encode(text: str) -> Tuple[List[int], List[int]]:
    """Tokenize one text with the module-level TOKENIZER.

    The text is padded or truncated to exactly 512 tokens.

    Args:
        text (str): text to encode

    Returns:
        Tuple[List[int], List[int]]: ``(input_ids, attention_mask)`` as plain lists
    """
    tokens = TOKENIZER(
        text, padding="max_length", max_length=512, truncation=True, return_tensors="pt"
    )
    # Only the two fields below are read; anything else the tokenizer emits
    # (e.g. token_type_ids, when present) is ignored rather than deleted, so
    # tokenizers that do not produce that field work too.
    # return_tensors="pt" yields a batch of one sequence, hence the [0].
    return tokens["input_ids"][0].tolist(), tokens["attention_mask"][0].tolist()

## Train and load

In [None]:
# Replace df_products with its tokenized version (rows without text removed,
# "input_ids" and "attention_mask" columns added).
df_products=tokenize_dataset(df_products)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["input_ids"] = [x[0] for x in tokens_pairs]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["attention_mask"] = [x[1] for x in tokens_pairs]


In [None]:
# Output size of each head = number of distinct labels recorded for that level.
labels_per_level = [len(categories[f"level_{depth}"]) for depth in range(1, 8)]
# Build the classifier and move it to the GPU.
model = BertModel(NLP_MODEL_NAME, *labels_per_level).cuda()

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Positional split: the first 48000 rows are used for training, the rest for testing.
# NOTE(review): the frame is not shuffled before this cut, so the test set is whatever
# sits at the end of the source file — confirm the row order is not grouped by category.
df_train= df_products.iloc[:48000]
df_test= df_products.iloc[48000:]

In [None]:
# Both splits share the same label -> index mappings, one per level.
level_mappings = [categories[f"level_{depth}"] for depth in range(1, 8)]
dataset_train = Dataset(df_train, *level_mappings)
dataset_test = Dataset(df_test, *level_mappings)

In [None]:
# Batches of 8. Only the training loader shuffles; the test loader keeps row order,
# which the evaluation cells rely on when comparing predictions with df_test labels.
dataloader_train = DataLoader(dataset_train, batch_size=8, shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=8, shuffle=False)

In [None]:
%%script false --no-raise-error
# The cell magic above hands this cell to the shell command `false`, so the training
# code below is not executed by the Python kernel. Remove the magic to retrain.
trainer = pl.Trainer(
        max_epochs=3,  # Train for 3 epochs
    )

# Train the model
trainer.fit(model, dataloader_train)

In [None]:
# Location of the saved state_dict on the mounted Drive (target of torch.save / torch.load below).
model_path="/content/gdrive/MyDrive/AnyoneAI/Proyecto final grupo 1/bert_model_v1.pt"

In [None]:
%%script false --no-raise-error
# Disabled by the cell magic above (the cell is handed to `false`); remove it to write a new checkpoint.
torch.save(model.state_dict(), model_path)

In [None]:
# Load the saved weights into the freshly constructed model.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, i.e. no file
# existed at model_path in that run; every later cell then evaluates untrained heads.
model.load_state_dict(torch.load(model_path))

FileNotFoundError: ignored

In [None]:
# Device that eval_model moves every batch to.
DEVICE="cuda:0"

In [None]:
def eval_model(model, dataloader):
    """Run the model over a dataloader and collect per-level predictions and probabilities.

    Args:
        model: module whose call ``model(input_ids, attention_mask)`` returns one
            logits tensor per hierarchy level (seven in this notebook).
        dataloader: yields batches of
            ``(input_ids, attention_mask, level_1, ..., level_7)``.

    Returns:
        tuple: the seven per-level lists of predicted class indices, followed by
        the seven per-level lists of softmax probability rows (14 lists in total).
    """
    model.eval()

    num_levels = 7
    predictions = [[] for _ in range(num_levels)]
    probas = [[] for _ in range(num_levels)]

    with torch.no_grad():
        # The label tensors of each batch are not needed for inference, so they
        # are not moved to DEVICE.
        for input_ids, attention_mask, *_labels in tqdm(dataloader):
            level_logits = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            for level_idx, logits in enumerate(level_logits):
                predictions[level_idx].extend(logits.argmax(1).tolist())
                # Use softmax to get probabilities
                probas[level_idx].extend(torch.softmax(logits, dim=1).tolist())

    return (*predictions, *probas)

In [None]:
# Inference over the test loader; yields 7 prediction lists followed by 7 probability lists.
evaluation= eval_model(model, dataloader_test)

In [None]:
# eval_model returns the seven prediction lists first, then the seven probability lists.
(
    level_1_predictions, level_2_predictions, level_3_predictions, level_4_predictions,
    level_5_predictions, level_6_predictions, level_7_predictions,
    level_1_probas, level_2_probas, level_3_probas, level_4_probas,
    level_5_probas, level_6_probas, level_7_probas,
) = evaluation

In [None]:
# Ground-truth class indices of the test rows, one list per level, in row order.
# Each level column is walked directly instead of materialising every row with
# iterrows(); an unknown label still raises KeyError.
test_frame = dataset_test.dataset
(
    labels_level_1,
    labels_level_2,
    labels_level_3,
    labels_level_4,
    labels_level_5,
    labels_level_6,
    labels_level_7,
) = (
    [categories[f"level_{depth}"][label] for label in test_frame[f"level_{depth}"]]
    for depth in range(1, 8)
)

In [None]:
def invert_dict(dictionary):
    """Swap the nesting of a two-level dict.

    Turns ``{outer_key: {inner_key: value}}`` into ``{value: {outer_key: inner_key}}``.
    When one inner dict holds the same value under several keys, the last key wins.
    """
    flipped = {}
    for outer_key, mapping in dictionary.items():
        for inner_key, inner_value in mapping.items():
            flipped.setdefault(inner_value, {})[outer_key] = inner_key
    return flipped

In [None]:
# class index -> {level name: category name}; used to turn indices back into names.
search_categories=invert_dict(categories)

In [None]:
# Decode the ground-truth category path of one test sample back into names,
# leaving out the 'NA' padding levels.
i = 450
true_indices = [
    labels_level_1[i],
    labels_level_2[i],
    labels_level_3[i],
    labels_level_4[i],
    labels_level_5[i],
    labels_level_6[i],
    labels_level_7[i],
]
result = [
    search_categories[class_idx][f"level_{depth}"]
    for depth, class_idx in enumerate(true_indices, start=1)
]
result = [name for name in result if name != 'NA']
result

# METRICS

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [None]:
# Accuracy of the predictions at every level, printed one line per level.
truth_by_level = [
    labels_level_1, labels_level_2, labels_level_3, labels_level_4,
    labels_level_5, labels_level_6, labels_level_7,
]
preds_by_level = [
    level_1_predictions, level_2_predictions, level_3_predictions, level_4_predictions,
    level_5_predictions, level_6_predictions, level_7_predictions,
]
for level, (y_true, y_pred) in enumerate(zip(truth_by_level, preds_by_level), start=1):
    print("Model 1 Level {} Accuracy: {:.2f}".format(level, accuracy_score(y_true, y_pred)))

In [None]:
# Weighted-average F1 at every level. The printed label now names the metric
# actually computed (these lines were previously labelled "Accuracy").
f1_truth_by_level = [
    labels_level_1, labels_level_2, labels_level_3, labels_level_4,
    labels_level_5, labels_level_6, labels_level_7,
]
f1_preds_by_level = [
    level_1_predictions, level_2_predictions, level_3_predictions, level_4_predictions,
    level_5_predictions, level_6_predictions, level_7_predictions,
]
for level, (y_true, y_pred) in enumerate(zip(f1_truth_by_level, f1_preds_by_level), start=1):
    print("Model 1 Level {} F1 Score (weighted): {:.2f}".format(
        level, f1_score(y_true, y_pred, average="weighted")))

In [None]:
# Micro-average recall at every level. The printed label now names the metric
# actually computed (these lines were previously labelled "Accuracy").
recall_truth_by_level = [
    labels_level_1, labels_level_2, labels_level_3, labels_level_4,
    labels_level_5, labels_level_6, labels_level_7,
]
recall_preds_by_level = [
    level_1_predictions, level_2_predictions, level_3_predictions, level_4_predictions,
    level_5_predictions, level_6_predictions, level_7_predictions,
]
for level, (y_true, y_pred) in enumerate(zip(recall_truth_by_level, recall_preds_by_level), start=1):
    print("Model 1 Level {} Recall (micro): {:.2f}".format(
        level, recall_score(y_true, y_pred, average="micro")))

# Predict

In [None]:
def predict(input):
    """Classify one piece of text and return the raw logits for every level.

    Uses the module-level ``model`` and ``encode``.

    Args:
        input (str): text to classify.

    Returns:
        tuple: seven logits tensors (level 1 to level 7), each with a batch
        dimension of 1.
    """
    model.eval()
    # Tokenize directly: going through tokenize_dataset would start a whole
    # process pool just to encode a single string.
    input_ids, attention_mask = encode(input)
    # Send the inputs to wherever the model's weights live instead of assuming CUDA.
    device = next(model.parameters()).device
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
    attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)
    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        l1,l2,l3,l4,l5,l6,l7 = model(input_ids, attention_mask)
    return l1,l2,l3,l4,l5,l6,l7

In [None]:
def parse_predictions(l1,l2,l3,l4,l5,l6,l7):
    """Turn per-level logits into the predicted category names.

    Args:
        l1 .. l7: logits tensors for a single sample (batch dimension of 1),
            one per hierarchy level.

    Returns:
        list: predicted names ordered from level 1 downwards, with levels
        predicted as 'NA' left out. Order and repeats are preserved so the list
        lines up position by position with per-level probabilities.
    """
    search_categories = invert_dict(categories)
    result = []
    for depth, logits in enumerate((l1, l2, l3, l4, l5, l6, l7), start=1):
        name = search_categories[logits.argmax(1).item()][f"level_{depth}"]
        # 'NA' is the padding label for paths shorter than this depth.
        if name != 'NA':
            result.append(name)
    return result

In [None]:
def parse_probabilities(l1,l2,l3,l4,l5,l6,l7):
    """Return the top softmax probability of every level whose prediction is not 'NA'.

    Args:
        l1 .. l7: logits tensors for a single sample (batch dimension of 1),
            one per hierarchy level.

    Returns:
        list: probabilities rounded to two decimals, ordered from level 1
        downwards, skipping levels predicted as 'NA'.
    """
    search_categories = invert_dict(categories)
    result = []
    for depth, logits in enumerate((l1, l2, l3, l4, l5, l6, l7), start=1):
        best = F.softmax(logits, dim=1).max(1)
        # Decide by the predicted label's name: class indices follow first
        # appearance in the data, so 'NA' is not guaranteed to be index 0.
        if search_categories[best.indices.item()][f"level_{depth}"] != 'NA':
            result.append(round(best.values.item(), 2))
    return result

In [None]:
def combine_labels_with_probabilities(labels, probabilities):
    """Pair every label with its probability, rendered as a percentage.

    Args:
        labels: iterable of label strings.
        probabilities: iterable of fractions in [0, 1], aligned with ``labels``.

    Returns:
        list: strings such as ``"Audio 93%"``; as long as the shorter input.
    """
    # The '%' presentation type multiplies by 100, so 0.93 is shown as 93%
    # rather than the misleading "0.93%".
    combined = [f'{label} {prob:.0%}' for label, prob in zip(labels, probabilities)]
    return combined

In [None]:
# Example query for the prediction demo below.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the session.
input="home theatre sound system for tv" 

In [None]:
# Raw logits for each of the seven levels.
l1,l2,l3,l4,l5,l6,l7=predict(input)

In [None]:
# Predicted category names, with 'NA' levels removed.
labels =parse_predictions(l1,l2,l3,l4,l5,l6,l7)

In [None]:
# Top softmax probability for each level kept by parse_probabilities.
probabilities=parse_probabilities(l1,l2,l3,l4,l5,l6,l7)

In [None]:
# Show each predicted label next to its probability (cell output).
combine_labels_with_probabilities(labels, probabilities)

In [None]:
import matplotlib.pyplot as plt

def plot_metric(levels, metric_values, metric_name):
    """Show a bar chart of one metric across levels, annotating each bar with its value.

    Args:
        levels: x-axis labels, one per bar.
        metric_values: bar heights, aligned with ``levels``.
        metric_name: used for the y-axis label and in the title.
    """
    plt.figure(figsize=(10,6))
    plt.bar(levels, metric_values, color='steelblue')
    # Write every value, with two decimals, at the height of its bar.
    for position, value in enumerate(metric_values):
        plt.text(position, value, f'{value:.2f}', ha = 'center')
    plt.xlabel('Levels')
    plt.ylabel(metric_name)
    plt.title(f'Model 1 {metric_name} Across Levels')
    plt.show()


Lista de etiquetas de nivel

In [None]:
# Display names of the seven hierarchy levels, used as x-axis labels.
level_labels = [f'Level {depth}' for depth in range(1, 8)]

Lista de exactitud (accuracy) por nivel

In [None]:
# Accuracy of the predictions at every level, in level order.
accuracy_scores = [
    accuracy_score(y_true, y_pred)
    for y_true, y_pred in zip(
        [labels_level_1, labels_level_2, labels_level_3, labels_level_4,
         labels_level_5, labels_level_6, labels_level_7],
        [level_1_predictions, level_2_predictions, level_3_predictions, level_4_predictions,
         level_5_predictions, level_6_predictions, level_7_predictions],
    )
]

Crear el gráfico de barras

In [None]:
# Bar chart of per-level accuracy.
plt.bar(level_labels, accuracy_scores)

Etiquetas y título del gráfico

In [None]:
import matplotlib.pyplot as plt

levels = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5', 'Level 6', 'Level 7']

# (true labels, predicted labels) for every level, in level order.
level_results = list(zip(
    [labels_level_1, labels_level_2, labels_level_3, labels_level_4,
     labels_level_5, labels_level_6, labels_level_7],
    [level_1_predictions, level_2_predictions, level_3_predictions, level_4_predictions,
     level_5_predictions, level_6_predictions, level_7_predictions],
))

#Accuracy values
# Computed with accuracy_score so the chart shows the metric its title names.
accuracy_values = [accuracy_score(y_true, y_pred) for y_true, y_pred in level_results]
plt.figure(figsize=(8, 5))
plt.bar(levels, accuracy_values)
plt.xlabel('Level')
plt.ylabel('Accuracy')
plt.title('Model Accuracy per Level')
plt.show()

#F1 score values
# Computed with f1_score (this chart previously plotted recall_score values).
# Weighted averaging matches the F1 figures printed in the metrics section.
f1_values = [f1_score(y_true, y_pred, average="weighted") for y_true, y_pred in level_results]
plt.figure(figsize=(8, 5))
plt.bar(levels, f1_values)
plt.xlabel('Level')
plt.ylabel('F1 Score')
plt.title('Model F1 Score per Level')
plt.show()

#Recall values
recall_values = [recall_score(y_true, y_pred, average="micro") for y_true, y_pred in level_results]
plt.figure(figsize=(8, 5))
plt.bar(levels, recall_values)
plt.xlabel('Level')
plt.ylabel('Recall')
plt.title('Model Recall per Level')
plt.show()

In [None]:
import numpy as np

# Per-level metrics: accuracy, micro-averaged F1 and micro-averaged recall.
levels = [f'Level {depth}' for depth in range(1, 8)]
level_pairs = [
    (labels_level_1, level_1_predictions),
    (labels_level_2, level_2_predictions),
    (labels_level_3, level_3_predictions),
    (labels_level_4, level_4_predictions),
    (labels_level_5, level_5_predictions),
    (labels_level_6, level_6_predictions),
    (labels_level_7, level_7_predictions),
]
accuracy_values = [accuracy_score(y_true, y_pred) for y_true, y_pred in level_pairs]
f1_values = [f1_score(y_true, y_pred, average="micro") for y_true, y_pred in level_pairs]
recall_values = [recall_score(y_true, y_pred, average="micro") for y_true, y_pred in level_pairs]

# Bar width, and the x positions of the three bars inside each level's group.
barWidth = 0.25
r1 = np.arange(len(accuracy_values))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]

# One figure holding all three series side by side.
plt.figure(figsize=(10,6))

bar_series = [
    (r1, accuracy_values, 'b', 'Accuracy'),
    (r2, f1_values, 'r', 'F1 Score'),
    (r3, recall_values, 'g', 'Recall'),
]
for positions, values, colour, series_name in bar_series:
    plt.bar(positions, values, color=colour, width=barWidth, edgecolor='grey', label=series_name)

# Tick labels sit under the middle bar of every group.
plt.xlabel('Levels')
plt.xticks([r + barWidth for r in range(len(accuracy_values))], levels)

# Axis label and title
plt.ylabel('Score')
plt.title('Model Performance Metrics per Level')

# Legend, then render.
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Imported in this cell so it does not depend on an earlier cell having run.
import numpy as np

# Level names
levels = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5', 'Level 6', 'Level 7']

# Per-level micro-averaged recall (kept for reference; not an input to the ROC curves).
recall_values = [recall_score(labels_level_1, level_1_predictions, average="micro"),
                 recall_score(labels_level_2, level_2_predictions, average="micro"),
                 recall_score(labels_level_3, level_3_predictions, average="micro"),
                 recall_score(labels_level_4, level_4_predictions, average="micro"),
                 recall_score(labels_level_5, level_5_predictions, average="micro"),
                 recall_score(labels_level_6, level_6_predictions, average="micro"),
                 recall_score(labels_level_7, level_7_predictions, average="micro")]

# A ROC curve needs true labels and predicted scores per sample. It cannot be
# built from level names and one recall number per level, which is what this
# cell passed to roc_curve before.
level_truths = [labels_level_1, labels_level_2, labels_level_3, labels_level_4,
                labels_level_5, labels_level_6, labels_level_7]
level_probas = [level_1_probas, level_2_probas, level_3_probas, level_4_probas,
                level_5_probas, level_6_probas, level_7_probas]

# One micro-averaged one-vs-rest ROC curve per level.
plt.figure()
for level_name, y_true, y_proba in zip(levels, level_truths, level_probas):
    y_proba = np.asarray(y_proba)
    # A curve needs samples and at least two classes at this level.
    if y_proba.ndim != 2 or y_proba.shape[1] < 2:
        continue
    n_classes = y_proba.shape[1]
    # One-hot encode the true class indices; flattening both matrices turns the
    # multiclass problem into a single binary one (micro-averaging).
    y_onehot = np.eye(n_classes, dtype=int)[np.asarray(y_true)]
    fpr, tpr, thresholds = roc_curve(y_onehot.ravel(), y_proba.ravel())

    # Area under the curve (AUC)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='{} ROC curve (area = {:.2f})'.format(level_name, roc_auc))

plt.plot([0, 1], [0, 1], 'k--')  # Diagonal reference line (random classifier)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")

# Show the figure
plt.show()
