# NUBI NLP / ML Engineering Challenge
# ======================================

In [14]:
import pandas as pd
import numpy as np
import os.path as osp
from tqdm.notebook import tqdm

## Dataset

In [2]:
# Folder the labelled training CSV is read from
INPUT_PATH = "./training_data/"
# Folder the fine-tuned model and tokenizer are written to
OUTPUT_PATH = "./models/"
# Seed used for NumPy's global RNG and for the train/test split
RANDOM_SEED = 42

np.random.seed(seed=RANDOM_SEED)

In [3]:
# Read the labelled titles and expose the 'Title' column under the name 'text'
train = (
    pd.read_csv(INPUT_PATH + 'textcat_smart_vs_tv.csv')
    .rename(columns={'Title': 'text'})
)
train.head()

Unnamed: 0.1,Unnamed: 0,Encrypted_Seller,Encrypted_Id,text,label
0,0.0,b10532ee783f9545d7bddd8f9da20d2a8d1642a486caf5...,63c875becd6c78649539497bdd134c2c762f610b3616ae...,Smart Tv Noblex Di43x5100x Led Full Hd 43 220v,1
1,1.0,aa4dd7ba40c80cf15a859c19f2500e328797e2251322d4...,490e4b75cfa66087c95b20fb6546e4dfdc952f7537a8e3...,Smart Tv Tcl L42s6500 Led Full Hd 42 220v,1
2,2.0,e48526711c7cd386af46750540bf107acac9b5988b515a...,eab7301d49cd3bbe3bcd3c196da0d61713de45b44523c1...,Tv Samsung Smart Tv Hd 32 Para Repuestos,1
3,3.0,9e527f7876123f50719291aed05c8351bab1ae9abacadb...,daf900de69964f2cf241ed5395ac74edd1f2deb6577356...,Tv Led Samsung 46 Smart Pantalla Rota,1
4,4.0,cf3021aa458178431fa2f22daa52ee17f005d6728d9b35...,b244c68806fab7d7d523a3927a4eb6381610c40313e8bb...,Tv 29 Sanyo Vizon,0


In [4]:
# Summary statistics of the numeric columns (in the recorded output, label ranges from 0 to 2)
train.describe()

Unnamed: 0.1,Unnamed: 0,label
count,1999.0,2219.0
mean,999.696848,0.590807
std,577.571969,0.67829
min,0.0,0.0
25%,499.5,0.0
50%,1000.0,0.0
75%,1499.5,1.0
max,1999.0,2.0


In [5]:
# Column dtypes and non-null counts; in the recorded output only 'text' and 'label'
# are populated for every row
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2219 entries, 0 to 2218
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1999 non-null   float64
 1   Encrypted_Seller  1999 non-null   object 
 2   Encrypted_Id      1999 non-null   object 
 3   text              2219 non-null   object 
 4   label             2219 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 86.8+ KB


In [6]:
# Plain Python lists of the titles and of their integer class ids, in row order
train_texts, train_labels = (train[column].to_list() for column in ('text', 'label'))
print(len(train_texts), len(train_labels))

2219 2219


## Split Data

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test split, stratified by label so that both
# splits keep the class proportions of the full data.
# The inputs are taken from the `train` frame instead of from `train_texts` /
# `train_labels`: those two names are rebound by this cell, so feeding them back
# in would split the already-reduced training split again whenever the cell is
# re-run on a live kernel.
all_texts = train['text'].to_list()
all_labels = train['label'].to_list()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=.2,
    random_state=RANDOM_SEED,
    stratify=all_labels,
)

In [8]:
# Show the first title/label pair of each split as a quick sanity check
train_example = f'Train Example: \n text: {train_texts[0]}\n label: {train_labels[0]}\n\n'
val_example = f'Val Example: \n text: {test_texts[0]}\n label: {test_labels[0]}\n\n'
print(train_example)
print(val_example)

Train Example: 
 text: Tv Led Samsung 43 Pulgadas Impecable.
 label: 0


Val Example: 
 text: Smart Tv Fhd 43  LG 43lm6350
 label: 1




## Tokenizer

In [9]:
from transformers import AutoTokenizer
  
# Tokenizer that ships with the pretrained Spanish BERT checkpoint
checkpoint_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [10]:
# Tokenize each split on its own, with truncation and padding enabled for both
tokenizer_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

## Torch Dataset

In [11]:
import torch

class NubiDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their class labels.

    Parameters
    ----------
    encodings
        Mapping (anything with ``.items()``) from field name to a sequence
        that can be indexed by example position.
    labels
        Sequence of labels, one per example; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field, followed by the label
        tensor under the key ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so the Trainer can index them
train_dataset = NubiDataset(encodings=train_encodings, labels=train_labels)
test_dataset = NubiDataset(encodings=test_encodings, labels=test_labels)

# Fine-tuning with Trainer

## Download pretrained model

In [12]:
# Clear the CUDA memory cache before the model is loaded
torch.cuda.empty_cache()
#import gc
#del model
#gc.collect()

In [13]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Hyper-parameters for the fine-tuning run. Paths and the seed come from the
# configuration cell so that they stay in sync with the rest of the notebook.
training_args = TrainingArguments(
    output_dir=OUTPUT_PATH,          # output directory (same folder the final model is saved under)
    save_strategy="no",              # never write intermediate checkpoints
    seed=RANDOM_SEED,                # make the seed explicit instead of relying on the library default
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,               # log the training loss every 500 steps (no weights are saved, see save_strategy)
    evaluation_strategy="steps",     # evaluate on a step schedule; eval_steps is unset, so it follows `logging_steps`
)

# Pretrained Spanish BERT with a classification head that has one output per
# distinct label value found in the training frame.
model = AutoModelForSequenceClassification.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
    num_labels=train['label'].nunique(),
)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchi

## Evaluate

In [14]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute the accuracy of a set of predictions.

    Parameters
    ----------
    pred
        Object exposing ``label_ids`` (the true class ids) and
        ``predictions`` (per-class scores, with the classes on the last axis).

    Returns
    -------
    dict
        ``{'accuracy': value}`` where ``value`` is sklearn's accuracy score
        of the arg-max class against ``label_ids``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_ids)}

## Training

In [15]:
# Bundle the model, its hyper-parameters, both datasets and the metric callback.
# The held-out test split doubles as the evaluation set during training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [16]:
# Run the fine-tuning loop configured in training_args
trainer.train()

***** Running training *****
  Num examples = 1775
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1332
 38%|███▊      | 500/1332 [00:25<00:38, 21.74it/s]***** Running Evaluation *****
  Num examples = 444
  Batch size = 64


{'loss': 0.3081, 'learning_rate': 5e-05, 'epoch': 1.13}


                                                  
 38%|███▊      | 502/1332 [00:26<01:16, 10.89it/s]

{'eval_loss': 0.2226010411977768, 'eval_accuracy': 0.963963963963964, 'eval_runtime': 0.4352, 'eval_samples_per_second': 1020.17, 'eval_steps_per_second': 16.084, 'epoch': 1.13}


 75%|███████▌  | 1000/1332 [00:48<00:15, 21.96it/s]***** Running Evaluation *****
  Num examples = 444
  Batch size = 64


{'loss': 0.1501, 'learning_rate': 1.9951923076923078e-05, 'epoch': 2.25}


                                                   
 75%|███████▌  | 1004/1332 [00:49<00:24, 13.35it/s]

{'eval_loss': 0.1416054666042328, 'eval_accuracy': 0.9797297297297297, 'eval_runtime': 0.4198, 'eval_samples_per_second': 1057.679, 'eval_steps_per_second': 16.675, 'epoch': 2.25}


100%|█████████▉| 1331/1332 [01:04<00:00, 21.69it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1332/1332 [01:04<00:00, 20.72it/s]

{'train_runtime': 64.2835, 'train_samples_per_second': 82.836, 'train_steps_per_second': 20.721, 'train_loss': 0.1914982623882122, 'epoch': 3.0}





TrainOutput(global_step=1332, training_loss=0.1914982623882122, metrics={'train_runtime': 64.2835, 'train_samples_per_second': 82.836, 'train_steps_per_second': 20.721, 'train_loss': 0.1914982623882122, 'epoch': 3.0})

In [17]:
# Persist the fine-tuned weights and config. osp.join builds a clean path
# instead of the './models/./NubiTvSmartOther/' that string concatenation gave.
trainer.save_model(osp.join(OUTPUT_PATH, 'NubiTvSmartOther'))

Saving model checkpoint to ./models/./NubiTvSmartOther/
Configuration saved in ./models/./NubiTvSmartOther/config.json
Model weights saved in ./models/./NubiTvSmartOther/pytorch_model.bin


In [18]:
# Store the tokenizer files in the same folder as the model. osp.join builds a
# clean path instead of the './models/./NubiTvSmartOther/' that string
# concatenation gave.
tokenizer.save_pretrained(osp.join(OUTPUT_PATH, 'NubiTvSmartOther'))

tokenizer config file saved in ./models/./NubiTvSmartOther/tokenizer_config.json
Special tokens file saved in ./models/./NubiTvSmartOther/special_tokens_map.json


('./models/./NubiTvSmartOther/tokenizer_config.json',
 './models/./NubiTvSmartOther/special_tokens_map.json',
 './models/./NubiTvSmartOther/vocab.txt',
 './models/./NubiTvSmartOther/added_tokens.json',
 './models/./NubiTvSmartOther/tokenizer.json')

# Evaluate

In [19]:
# Score the fine-tuned model on the Trainer's eval_dataset (the held-out test split)
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 444
  Batch size = 64
100%|██████████| 7/7 [00:00<00:00, 17.20it/s]


{'eval_loss': 0.134835347533226,
 'eval_accuracy': 0.9819819819819819,
 'eval_runtime': 0.4696,
 'eval_samples_per_second': 945.49,
 'eval_steps_per_second': 14.906,
 'epoch': 3.0}

## Inference

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
# Reload the tokenizer from the folder the fine-tuned model was saved to
tokenizer = AutoTokenizer.from_pretrained("./models/NubiTvSmartOther")

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Load the fine-tuned 3-class classifier and move it to the chosen device
model = AutoModelForSequenceClassification.from_pretrained("./models/NubiTvSmartOther", num_labels=3)
model = model.to(device)

# Real test

In [4]:
# Display name of each class; the position in the list is the class id predicted
# by the model, so the order must not change. In the labelled sample shown when
# the training data is loaded, 0 is a plain TV and 1 a Smart TV; that 2 stands
# for 'Otro' is assumed here — TODO confirm against the labelling source.
categories = ['Televisión', 'Smart', 'Otro']

In [5]:
def get_prediction(text):
    """Classify one listing title, or a list of titles.

    Parameters
    ----------
    text : str or list of str
        Title(s) to classify. Inputs are truncated to 512 tokens.

    Returns
    -------
    str or list of str
        For a single string, the name in ``categories`` of the most probable
        class. For a list, one such name per title, in input order.
    """
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    # inference only: without no_grad every call would build an autograd graph
    with torch.no_grad():
        outputs = model(**inputs)
    # get output probabilities by doing softmax over the class axis
    probs = outputs[0].softmax(1)
    # arg-max per row, so batched input gives one class id per title instead of
    # a single index into the flattened (batch, classes) matrix
    class_ids = probs.argmax(dim=1).tolist()
    if isinstance(text, str):
        return categories[class_ids[0]]
    return [categories[class_id] for class_id in class_ids]

In [6]:
# Example #1: three hand-written titles run through the classifier
text = """
Smart Tv Kanji Kj-32mt005 Led Hd 32  220v
"""
text1 = """
Tv Lcd Philips 42
"""
text2 = """
reparar monitor
"""

print(*(get_prediction(sample) for sample in (text, text1, text2)))

Smart Televisión Otro


# Inference on the TV datasets

In [15]:
# Load the listings to be labelled with the TV-type classifier; the inference
# below reads its 'Producto' and 'Title' columns
final_db = pd.read_csv("../final_database_cel_tv.csv")

In [11]:
# Label every 'Televisión' listing with the classifier; all other products get
# the sentinel value 'no'.
# A boolean mask is used instead of label-based lookups, so the result does not
# depend on final_db having a default 0..n-1 index.
is_tv = (final_db['Producto'] == 'Televisión').to_numpy()
tv_titles = final_db.loc[is_tv, 'Title']

# Reuse get_prediction so that tokenization (including truncation to 512 tokens)
# is done in one place. no_grad keeps this long loop from building autograd graphs.
with torch.no_grad():
    tv_types = [get_prediction(title) for title in tqdm(tv_titles, total=len(tv_titles))]

# Build the whole column first and assign it once, rather than writing cell by cell
type_tv = np.full(len(final_db), 'no', dtype=object)
type_tv[is_tv] = tv_types
final_db['type_tv'] = type_tv

  0%|          | 0/99362 [00:00<?, ?it/s]

In [30]:
# NOTE(review): if re-enabled, the line below overwrites the CSV this notebook
# reads as input, and to_csv also writes the index as an extra column unless
# index=False is passed. The 'Unnamed: 0*' columns in this frame look like the
# result of earlier round-trips of that kind — confirm before saving.
#final_db.to_csv('../final_database_cel_tv.csv')
final_db

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Encrypted_Seller,Encrypted_Id,Title,marca,start,end,marcas_corregidas,Producto,seller_id,type_tv
0,0,0,b10532ee783f9545d7bddd8f9da20d2a8d1642a486caf5...,63c875becd6c78649539497bdd134c2c762f610b3616ae...,Smart Tv Noblex Di43x5100x Led Full Hd 43 220v,Noblex,9,15,Noblex,Televisión,43945,Smart
1,1,1,aa4dd7ba40c80cf15a859c19f2500e328797e2251322d4...,490e4b75cfa66087c95b20fb6546e4dfdc952f7537a8e3...,Smart Tv Tcl L42s6500 Led Full Hd 42 220v,Tcl,9,12,TCL,Televisión,42250,Smart
2,2,2,e48526711c7cd386af46750540bf107acac9b5988b515a...,eab7301d49cd3bbe3bcd3c196da0d61713de45b44523c1...,Tv Samsung Smart Tv Hd 32 Para Repuestos,Samsung,3,10,Samsung,Televisión,56776,Smart
3,3,3,9e527f7876123f50719291aed05c8351bab1ae9abacadb...,daf900de69964f2cf241ed5395ac74edd1f2deb6577356...,Tv Led Samsung 46 Smart Pantalla Rota,Samsung,7,14,Samsung,Televisión,39355,Smart
4,4,4,cf3021aa458178431fa2f22daa52ee17f005d6728d9b35...,b244c68806fab7d7d523a3927a4eb6381610c40313e8bb...,Tv 29 Sanyo Vizon,Sanyo,6,11,Sanyo,Televisión,51473,Televisión
...,...,...,...,...,...,...,...,...,...,...,...,...
99357,99357,81161,5583dd03fee10a870b6e272731cdd64cfd4a24a8a70b94...,e6fb500e2a7d7136d6198e4bf9bff54d3247db6555f24e...,Celular Google Pixel 2 Xl,Google,8,14,Google,Celular,21298,no
99358,99358,81162,aa5e066df32a70ef64b43bedba3e96ba9043cdde8530aa...,e54a3b76660eadd49b7dcf7146f330c889892c1b1f8a8f...,iPhone XR 256 Gb. Blanco,iPhone,0,6,Apple,Celular,42262,no
99359,99359,81163,19a7c395b0c5c03b37100a057fe9747c71638a9eb1bd40...,37467aeaf0450f2526a0d209e8c519f698756c5646fb8b...,Celular Moto G8 Plus 64 Gb Cosmic Blue 4 Gb Mo...,Motorola,44,52,Motorola,Celular,6244,no
99360,99360,81164,fd7ea4aeecfcce434cc150059fa844da0374bfed65877a...,70afb1016aae244dbdd68dc613a16935fdb73880e4871d...,Celular Blackberry 9320 Curve Para Movistar,Blackberry,8,18,BlackBerry,Celular,63074,no


In [37]:
#final_db[final_db['marcas_corregidas'] == 'Samsung'].value_counts()
#final_db.marcas_corregidas.str.count('Noblex').sum()

#num = final_db.Producto.str.count('Televisión').sum()
# Number of distinct seller ids among the rows whose corrected brand is 'BlackBerry'
blackberry_rows = final_db['marcas_corregidas'] == 'BlackBerry'
final_db.loc[blackberry_rows, 'seller_id'].nunique()

236