<a href="https://colab.research.google.com/github/dalgual/aidatasci/blob/main/tweets_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch
!pip install tensorflow



In [2]:
import torch
# Cell output: whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

False

In [3]:
!pip install datasets transformers huggingface_hub




In [4]:
# !apt-get install git-lfs


In [5]:
from datasets import load_dataset
# Alternative dataset, left disabled:
#imdb = load_dataset("imdb")

# Remote CSV of tweets; the "csv" builder exposes it as a single "train" split,
# which is requested directly so `dataset` is a Dataset rather than a DatasetDict.
data_files = "https://raw.githubusercontent.com/dalgual/aidatasci/master/data/bigdata/tweets.csv"
dataset = load_dataset("csv", data_files=data_files, split="train")

In [6]:
# Show the column names and row count of the loaded CSV.
print(dataset)

Dataset({
    features: ['ItemID', 'Sentiment', 'SentimentSource', 'SentimentText'],
    num_rows: 1932
})


In [7]:
# https://discuss.huggingface.co/t/how-to-split-hugging-face-dataset-to-train-and-test/20885/3
# Hold out 30% of the rows for evaluation. The seed is fixed so the shuffled
# split (and therefore every metric reported below) is the same after a
# kernel restart instead of changing on each run.
# Stratification stays disabled, as in the original: , stratify_by_column='Sentiment'
# NOTE(review): stratify_by_column expects a ClassLabel column - confirm before enabling.
train_ds = dataset.train_test_split(test_size=0.3, shuffle=True, seed=42)
print(train_ds)

DatasetDict({
    train: Dataset({
        features: ['ItemID', 'Sentiment', 'SentimentSource', 'SentimentText'],
        num_rows: 1352
    })
    test: Dataset({
        features: ['ItemID', 'Sentiment', 'SentimentSource', 'SentimentText'],
        num_rows: 580
    })
})


In [8]:
# Rename the label and text columns on the DatasetDict in one chained expression:
# "Sentiment" becomes "labels" and "SentimentText" becomes "text".
train_ds = (
    train_ds
    .rename_column("Sentiment", "labels")
    .rename_column("SentimentText", "text")
)
print(train_ds)

DatasetDict({
    train: Dataset({
        features: ['ItemID', 'labels', 'SentimentSource', 'text'],
        num_rows: 1352
    })
    test: Dataset({
        features: ['ItemID', 'labels', 'SentimentSource', 'text'],
        num_rows: 580
    })
})


In [9]:
# Upper bounds on how many shuffled rows to keep from each split. With the
# current data and a 70/30 split these equal the full split sizes.
MAX_TRAIN_SAMPLES = 1352
MAX_TEST_SAMPLES = 580

shuffled_train = train_ds["train"].shuffle(seed=42)
shuffled_test = train_ds["test"].shuffle(seed=42)

# Cap each bound at the split's actual length so a different dataset size or
# split ratio cannot request indices past the end of the split.
small_train_dataset = shuffled_train.select(range(min(MAX_TRAIN_SAMPLES, len(shuffled_train))))
small_test_dataset = shuffled_test.select(range(min(MAX_TEST_SAMPLES, len(shuffled_test))))


In [10]:
from transformers import AutoTokenizer
# Tokenizer for the same "distilbert-base-uncased" checkpoint that the model is loaded from below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [11]:
def preprocess_function(examples):
   """Tokenize the ``text`` column of a batch with truncation enabled.

   Args:
      examples: A batch mapping that contains a ``"text"`` entry.

   Returns:
      Whatever the module-level ``tokenizer`` returns for those texts.
   """
   texts = examples["text"]
   encoded = tokenizer(texts, truncation=True)
   return encoded

# Apply the tokenizer to both splits; batched=True passes batches of examples
# to preprocess_function instead of one example at a time.
tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/1352 [00:00<?, ? examples/s]

Map:   0%|          | 0/580 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorWithPadding
# Padding collator built from the tokenizer; it is handed to the Trainer as `data_collator` further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [13]:
from transformers import AutoModelForSequenceClassification
# DistilBERT with a 2-label sequence-classification head. Per the warning in the
# cell output, the classifier weights are newly initialized and need training.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
   """Compute accuracy, F1, precision and recall for a binary classifier.

   The metrics are computed directly with NumPy, so nothing has to be loaded
   or downloaded each time the function is called during evaluation.
   Precision, recall and F1 are reported for the positive class (label ``1``).
   Any ratio whose denominator is zero is reported as ``0.0``.

   Args:
      eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one row of
         class scores per example; ``labels`` holds the reference class ids.

   Returns:
      A dict with float values under the keys ``"accuracy"``, ``"f1"``,
      ``"precision"`` and ``"recall"``.
   """
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   labels = np.asarray(labels)

   def _ratio(numerator, denominator):
      # Guard against empty input or a class that never occurs.
      return numerator / denominator if denominator else 0.0

   # Confusion-matrix counts for the positive class.
   true_pos = int(np.sum((predictions == 1) & (labels == 1)))
   false_pos = int(np.sum((predictions == 1) & (labels != 1)))
   false_neg = int(np.sum((predictions != 1) & (labels == 1)))
   correct = int(np.sum(predictions == labels))

   accuracy = _ratio(correct, labels.size)
   prec = _ratio(true_pos, true_pos + false_pos)
   recall = _ratio(true_pos, true_pos + false_neg)
   # F1 written in count form: 2TP / (2TP + FP + FN).
   f1 = _ratio(2 * true_pos, 2 * true_pos + false_pos + false_neg)

   return {"accuracy": accuracy, "f1": f1, "precision": prec, "recall": recall}


In [22]:
from huggingface_hub import notebook_login
# Renders the Hugging Face login widget shown in the cell output.
# NOTE(review): presumably needed for the push_to_hub steps further down - confirm.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
!pip install accelerate -U



In [17]:
!pip install transformers[torch]



In [33]:
from transformers import TrainingArguments, Trainer


# Earlier repository name, left disabled:
#repo_name = "finetuning-sentiment-model-3000-samples-jwoo5"
# Used as the Trainer's output directory; it matches the Hub repository id the
# model is pushed to and later loaded from.
repo_name = "semadalg/finetuning-sentiment-model-3000-samples"

# Fine-tuning configuration: two epochs, batches of 16 per device for both
# training and evaluation, a checkpoint saved at the end of each epoch, and
# pushing to the Hugging Face Hub enabled.
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)




In [34]:
# Wire the model, the tokenized splits, the padding collator and the metric
# function into one Trainer. `compute_metrics` is what adds the accuracy / F1 /
# precision / recall entries to the evaluation output.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [35]:
# Fine-tune using the TrainingArguments configured above (2 epochs); the TrainOutput summary is the cell output.
trainer.train()

Step,Training Loss


TrainOutput(global_step=170, training_loss=0.07249555587768555, metrics={'train_runtime': 304.2272, 'train_samples_per_second': 8.888, 'train_steps_per_second': 0.559, 'total_flos': 7596177391680.0, 'train_loss': 0.07249555587768555, 'epoch': 2.0})

In [36]:
# Evaluate on the Trainer's eval_dataset (tokenized_test); the metrics dict is the cell output.
trainer.evaluate()

{'eval_loss': 0.021632784977555275,
 'eval_accuracy': 0.996551724137931,
 'eval_f1': 0.9964285714285714,
 'eval_precision': 0.9964285714285714,
 'eval_recall': 0.9964285714285714,
 'eval_runtime': 14.461,
 'eval_samples_per_second': 40.108,
 'eval_steps_per_second': 2.559,
 'epoch': 2.0}

In [37]:
# Upload the trained model to the Hugging Face Hub; the cell output is the repository URL.
trainer.push_to_hub()


'https://huggingface.co/semadalg/finetuning-sentiment-model-3000-samples/tree/main/'

In [38]:
from transformers import pipeline

# Load the fine-tuned model by its repository id and score two sample sentences.
# The list of label/score dicts is the cell output.
sample_texts = ["I love this move", "This movie sucks!"]
sentiment_model = pipeline(model="semadalg/finetuning-sentiment-model-3000-samples")
sentiment_model(sample_texts)


[{'label': 'LABEL_1', 'score': 0.989282488822937},
 {'label': 'LABEL_0', 'score': 0.9843447804450989}]

## References
1. Getting Started with Sentiment Analysis using Python, https://huggingface.co/blog/sentiment-analysis-python
1. The model did not return a loss from the inputs, only the following keys: logits, https://discuss.huggingface.co/t/the-model-did-not-return-a-loss-from-the-inputs-only-the-following-keys-logits-for-reference-the-inputs-it-received-are-input-values/25420/3
