In [1]:
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the review CSV with the generic 'csv' loader; the result is a
# DatasetDict whose only split is 'train'.
dataset = load_dataset('csv', data_files='dataset/IMDB_1.csv')

In [3]:
# Inspect the loaded DatasetDict (splits, column names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['review_es', 'sentiment'],
        num_rows: 250
    })
})

In [4]:
# List the column names of the 'train' split.
dataset['train'].column_names

['review_es', 'sentiment']

In [5]:
# Convert the 'train' split to a pandas DataFrame for quick inspection.
# Dataset.to_pandas() is the library's own conversion and avoids building the
# frame by iterating over the dataset one Python row at a time.
df = dataset['train'].to_pandas()

In [6]:
# Preview the DataFrame built from the 'train' split.
df

Unnamed: 0,review_es,sentiment
0,Uno de los otros críticos ha mencionado que de...,positive
1,Una pequeña pequeña producción.La técnica de f...,positive
2,Pensé que esta era una manera maravillosa de p...,positive
3,"Básicamente, hay una familia donde un niño peq...",negative
4,"El ""amor en el tiempo"" de Petter Mattei es una...",positive
...,...,...
245,He visto esta película al menos 100 veces y to...,positive
246,Este espectáculo no tiene absolutamente ningún...,negative
247,¡Esta película fue tan mala que fue divertido!...,negative
248,"Meryl Streep es un genio.Bueno, al menos como ...",positive


In [7]:
# Check which distinct sentiment values occur in the data.
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [8]:
# Encode the string sentiment column as integer class ids.
encoder = LabelEncoder()

# NOTE(review): LabelEncoder presumably assigns ids in sorted class order
# (negative -> 0, positive -> 1) — confirm via encoder.classes_.
labels = encoder.fit_transform(dataset['train']['sentiment'])

In [9]:
# Attach the encoded class ids to the 'train' split as a new 'label' column.
dataset['train'] = dataset['train'].add_column(name='label', column=labels)

In [10]:
# Drop the original string column now that 'label' carries the encoded target.
dataset['train'] = dataset['train'].remove_columns(column_names='sentiment')

In [11]:
# Rename the review column to 'text', the key the tokenize step reads.
dataset = dataset.rename_column(
    original_column_name='review_es',
    new_column_name='text',
)

In [12]:
# Confirm the columns are now 'text' and 'label'.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})

In [13]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the shuffle, and
# therefore every downstream metric, reproducible across kernel restarts.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [14]:
# Inspect the resulting 'train' / 'test' splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

In [15]:
# Check the feature types of the training split.
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None)}

In [16]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint.
# NOTE(review): the reviews are Spanish, while this checkpoint appears to be
# English-only — consider a multilingual/Spanish checkpoint, and change the
# model checkpoint together with this one so vocabularies stay aligned.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [17]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of ``sample`` with the notebook's tokenizer.

    Args:
        sample: Mapping with a ``"text"`` entry holding the review text(s).

    Returns:
        The tokenizer's output for that text, with truncation enabled.
    """
    texts = sample["text"]
    return tokenizer(texts, truncation=True)

In [18]:
# Apply tokenize() to every split; batched=True hands it batches of rows
# rather than one row at a time.
tokenized_dataset = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 200/200 [00:00<00:00, 1413.82 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 2525.65 examples/s]


In [19]:
# Collator that pads batches with the tokenizer; tokenize() only truncates,
# so padding is left to batch-construction time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
# Inspect the columns added by tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50
    })
})

In [21]:
# Training configuration for the fine-tuning run (fixed values, no search).
# NOTE(review): metric_for_best_model presumably only matters once best-model
# tracking (e.g. load_best_model_at_end) is enabled — confirm before relying on it.
training_args = TrainingArguments(
    output_dir='trainer',
    eval_strategy='epoch',            # run evaluation once per epoch
    num_train_epochs=3,
    per_device_train_batch_size=8,    # training batch size per device
    per_device_eval_batch_size=8,     # evaluation batch size per device
    metric_for_best_model='f1',
)

In [22]:
# Load the pretrained checkpoint with a 2-class sequence-classification head.
# NOTE(review): the reviews are Spanish, while 'bert-base-uncased' appears to
# be English-only — if the checkpoint is changed, change the tokenizer's
# checkpoint to the same one.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Load the F1 metric. It is bound to the name `metrics` (plural); code that
# computes scores must reference this exact name.
metrics = evaluate.load('f1')

In [24]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is taken
            as the argmax over the last axis of ``logits``.

    Returns:
        The result of the F1 metric's ``compute`` call with
        ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The loaded F1 metric lives in the module-level name `metrics` (plural);
    # referencing `metric` raises NameError on the first evaluation.
    return metrics.compute(predictions=predictions, references=labels, average="weighted")

In [25]:
# Assemble the Trainer. `args=training_args` must be passed explicitly;
# otherwise the Trainer falls back to default TrainingArguments and the
# configured output_dir, eval_strategy and batch sizes are silently ignored.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

In [None]:
# Run an evaluation pass with the trainer and display the resulting metrics.
trainer.evaluate()

In [None]:
# Predict on the held-out split through the Trainer. The model object itself
# has no `predict` method; Trainer.predict returns an object exposing the
# `.predictions` (logits) and `.label_ids` fields read in the next step.
predictions = trainer.predict(tokenized_dataset['test'])

In [None]:
# Unpack the prediction output: raw logits and the reference label ids.
logits = predictions.predictions
labels = predictions.label_ids

# Predicted class = index of the largest logit along the last axis.
preds = np.argmax(logits, axis=-1)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# `classification_report` is imported in the notebook's top import cell.
# print() is deliberate: the report is a multi-line string whose bare repr
# would show escaped newlines.
print(classification_report(labels, preds))