<a href="https://colab.research.google.com/github/gin7018/headline-sentiment-analysis/blob/main/headline_sentiment_analysis_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision torchaudio
!pip install datasets

In [None]:
import torch
import transformers
from transformers.modeling_utils import PreTrainedModel


class SentimentAnalysisModel(PreTrainedModel):
    """DistilBERT encoder followed by dropout and a linear 3-way classifier.

    The model returns raw (unnormalised) logits, one row per input sequence
    and one column per class, suitable for ``torch.nn.CrossEntropyLoss``.

    Args:
        config: Optional ``DistilBertConfig``. When omitted (the way the
            training code constructs the model) the configuration *and* the
            encoder weights are taken from ``distilbert-base-uncased``. When
            supplied (the way ``PreTrainedModel.from_pretrained`` constructs
            the model) only the architecture is built and the caller is
            expected to load the weights.
    """

    # Lets from_pretrained() know which config class to deserialize.
    config_class = transformers.DistilBertConfig

    def __init__(self, config=None):
        start_from_checkpoint = config is None
        if start_from_checkpoint:
            config = transformers.DistilBertConfig.from_pretrained("distilbert-base-uncased")
        super(SentimentAnalysisModel, self).__init__(config)

        if start_from_checkpoint:
            self.bert = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')
        else:
            # Weights will be filled in from the checkpoint being loaded, so
            # there is no need to download the base model again.
            self.bert = transformers.DistilBertModel(config)
        self.dropout = torch.nn.Dropout(p=0.3)
        # Head width follows the encoder's hidden size (768 for the base
        # checkpoint); we have three possible classes.
        self.output = torch.nn.Linear(in_features=config.dim, out_features=3)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of tokenized sequences.

        Args:
            input_ids: Token id tensor passed straight to the encoder.
            attention_mask: Mask tensor passed to the encoder alongside
                ``input_ids``.

        Returns:
            Tensor of logits with 3 values per sequence.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        # encoded[0] is the encoder's last hidden state; position 0 of every
        # sequence is used as the sentence representation.
        first_token_state = encoded[0][:, 0]
        return self.output(self.dropout(first_token_state))

In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import BertTokenizerFast
from huggingface_hub import HfApi, login


# Shared tokenizer used by trainer() to turn sentences into model inputs.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


def get_financial_data():
    """Fetch the ``sentences_66agree`` configuration of ``financial_phrasebank``.

    Returns:
        The dataset's ``train`` split converted to a pandas DataFrame.
    """
    phrasebank = load_dataset("financial_phrasebank", "sentences_66agree", split="train")
    frame = phrasebank.to_pandas()
    return frame


def trainer(epochs=1, batch_size=16, learning_rate=1e-5):
    """Fine-tune ``SentimentAnalysisModel`` on the financial data and publish it.

    Steps: tokenize the sentences, split 70/30 into training and validation
    sets, train with Adam + cross-entropy, report validation accuracy after
    every epoch, save the model to ``./model`` and push it to the Hub.

    Args:
        epochs: Number of passes over the training set.
        batch_size: Batch size for both the training and validation loaders.
        learning_rate: Learning rate handed to the Adam optimizer.
    """
    # Use an accelerator when one is available; otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # loading the data into tensors, with 70-30 train test split
    df = get_financial_data()
    # Pad only up to the longest sentence (still capped at 512 by truncation)
    # instead of always padding to 512: padded positions are masked out by the
    # attention mask, so this only removes wasted computation.
    tokenized_sentences = tokenizer(df["sentence"].tolist(),
                                    add_special_tokens=True,
                                    max_length=512,
                                    padding='longest',
                                    return_tensors="pt",
                                    truncation=True)
    sentiments = torch.tensor(df["label"].tolist())
    print("got the data tokenized")

    dataset = TensorDataset(
        tokenized_sentences["input_ids"],
        tokenized_sentences["attention_mask"],
        sentiments)
    train_set_size = int(len(dataset) * 0.7)
    validation_set_size = len(dataset) - train_set_size

    training_set, validation_set = random_split(dataset, [train_set_size, validation_set_size])

    training_dataloader = DataLoader(
        training_set,
        sampler=RandomSampler(training_set),
        batch_size=batch_size
    )
    validation_dataloader = DataLoader(
        validation_set,
        sampler=SequentialSampler(validation_set),
        batch_size=batch_size
    )
    print("put the data into loader")

    # loading our model
    sentiment_model = SentimentAnalysisModel()
    sentiment_model.to(device)
    print("loaded the model")

    optimizer = torch.optim.Adam(
        params=sentiment_model.parameters(),
        lr=learning_rate
    )
    loss_function = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        sentiment_model.train()
        training_progress_bar = tqdm(training_dataloader, desc=f"Epoch {epoch + 1} - Training")
        for batch in training_progress_bar:
            input_ids, attention_mask, target_sentiments = (t.to(device) for t in batch)

            optimizer.zero_grad()  # clear gradients left over from the previous step
            outputs = sentiment_model(input_ids, attention_mask)
            batch_loss = loss_function(outputs, target_sentiments)
            batch_loss.backward()  # gradients of the loss w.r.t. the weights
            optimizer.step()  # gradient descent step towards a lower loss

            training_progress_bar.set_postfix(loss=batch_loss.item())

        # testing how good our model is at classifying the sentences
        sentiment_model.eval()
        total_correct_classification = 0
        total_samples = 0
        validation_progress_bar = tqdm(validation_dataloader, desc=f"Epoch {epoch + 1} - Validating")
        with torch.no_grad():
            for batch in validation_progress_bar:
                input_ids, attention_mask, target_sentiments = (t.to(device) for t in batch)
                outputs = sentiment_model(input_ids, attention_mask)

                sentiment_prediction = torch.argmax(outputs, 1)
                # .item() keeps the running count a plain Python int.
                total_correct_classification += torch.eq(sentiment_prediction, target_sentiments).sum().item()
                total_samples += target_sentiments.size(0)
        accuracy = total_correct_classification / total_samples if total_samples else 0.0
        print(f"epoch: {epoch}, accuracy: {accuracy}")

    # Persist the weights locally first so that a failed authentication or
    # upload does not throw away the training run.
    sentiment_model.save_pretrained(save_directory="./model")

    login()
    sentiment_model.save_pretrained(
        save_directory="./model",
        push_to_hub=True,
        repo_id="ghislainehaha/headline-sentiment-analyzer",
    )



# Run the whole pipeline defined above: load data, train, validate, publish.
trainer()

got the data tokenized
put the data into loader
loaded the model


Epoch 1 - Training: 100%|██████████| 185/185 [2:35:19<00:00, 50.37s/it, loss=0.297]
Epoch 1 - Validating: 100%|██████████| 80/80 [19:19<00:00, 14.50s/it]

epoch: 0, accuracy: 0.852290689945221





VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

ValueError: ignored

In [None]:
from huggingface_hub import HfApi, login

# Authenticate with the Hugging Face Hub before uploading anything.
login()

# Upload config.json from the current working directory to the root of the
# model repository.
# NOTE(review): no visible cell writes config.json to the working directory
# (save_pretrained in the training cell targets ./model) - confirm the path.
api = HfApi()
api.upload_file(
    repo_id="ghislainehaha/headline-sentiment-analyzer",
    repo_type="model",
    path_or_fileobj="config.json",
    path_in_repo="config.json",
    commit_description="adding the model's config",
)
print("committed the config")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

config.json:   0%|          | 0.00/265M [00:00<?, ?B/s]

committed the config


In [None]:
# Upload the serialized model file from the working directory to the root of
# the model repository.
# NOTE(review): no visible cell writes sentiment_analysis_model.pt, so on a
# fresh kernel this file may not exist - confirm where it comes from.
api = HfApi()
api.upload_file(
    repo_id="ghislainehaha/headline-sentiment-analyzer",
    repo_type="model",
    path_or_fileobj="sentiment_analysis_model.pt",
    path_in_repo="sentiment_analysis_model.pt",
    commit_description="new trained model version (from google collab)",
)
print("committed the model")

sentiment_analysis_model.pt:   0%|          | 0.00/266M [00:00<?, ?B/s]

committed the model
