In [3]:
import sys
from pathlib import Path
# Resolve the parent of the current working directory and put it on the
# import path; presumably this is where the `src` package imported below lives.
# NOTE(review): this assumes the kernel is started from a direct sub-folder of
# the project root — confirm, since Path() resolves against the working directory.
project_root = Path().resolve().parent
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src import preprocessing
from src import constants
from src import training
from src import evaluation


## Load and validate the training data

In [2]:
# Load the texts and their labels from the configured data file.
# In the recorded run this call logs "Data validation successful."
texts, labels = preprocessing.load_data(constants.DATA_FILE_PATH)

2024-07-15 17:31:57,477 - JaneAustenLogger - INFO - Data validation successful.


## Preprocess the data

Tokenise the texts and split the data into training, validation and test sets.

In [3]:
# Tokenise the texts and prepare the train / validation / test data loaders.
# The tokenizer, split sizes and output directory all come from src.constants;
# the recorded run logs that the tokenizer is saved to that directory.
tokenizer, train_loader, val_loader, test_loader = (
    preprocessing.preprocess_data(
        texts, labels,
        tokenizer=constants.TOKENIZER,
        train_size=constants.TRAIN_SIZE,
        val_size=constants.VAL_SIZE,
        directory=constants.MODEL_FILE_PATH,
    )
)

2024-07-15 17:31:59,629 - JaneAustenLogger - INFO - Tokenizer saved to ../artefacts/austen_classifier_model_v2
2024-07-15 17:31:59,630 - JaneAustenLogger - INFO - Datasets and data loaders prepared successfully.


## Model Training

Fine-tune a Hugging Face DistilBERT model for sequence classification.

In [4]:
# Train the configured model on the training loader; the recorded log reports
# a validation accuracy after each completed epoch.
# NOTE(review): `directory` is presumably where the trained model is written — confirm in src.training.
model = training.train_and_validate_model(
    constants.MODEL, train_loader, val_loader,
    num_epochs=constants.NUM_EPOCHS,
    directory=constants.MODEL_FILE_PATH,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-07-15 17:32:01,653 - JaneAustenLogger - INFO - Beginning training and validation process.
2024-07-15 17:32:01,654 - JaneAustenLogger - INFO - Total Epochs: 3
2024-07-15 17:32:01,654 - JaneAustenLogger - INFO - Training rows: 63
2024-07-15 17:32:01,655 - JaneAustenLogger - INFO - Validation rows: 63
2024-07-15 17:32:30,223 - JaneAustenLogger - INFO - Epoch 1/3 completed.
2024-07-15 17:32:35,863 - JaneAustenLogger - INFO - Validation Accuracy: 0.6190
2024-07-15 17:33:03,839 - JaneAustenLogger - INFO - Epoch 2/3 completed.
2024-07-15 17:33:09,226 - JaneAustenLogger - INFO - Validation Accuracy: 0.6984
2024-07-15 17:33:37,438 - JaneAusten

## Evaluation

Evaluate the model on the test set and log a classification report (precision, recall and F1 score per class).

In [5]:
# Score the trained model on the test loader; in the recorded run the
# results appear as a logged per-class precision / recall / F1 report.
evaluation.evaluate_model(model, test_loader)

2024-07-15 17:35:20,815 - JaneAustenLogger - INFO - Test Set Evaluation
2024-07-15 17:35:20,821 - JaneAustenLogger - INFO - 
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       693
           1       0.99      0.73      0.84       442

    accuracy                           0.89      1135
   macro avg       0.92      0.86      0.88      1135
weighted avg       0.91      0.89      0.89      1135

