In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored on Drive can be reached from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
# Folder on the mounted Drive that holds the project's Python modules.
path = '/content/drive/MyDrive/ENEL645-garbage-classification-model'

# Register the folder on the import search path exactly once, so re-running
# this cell does not append duplicate entries.
if path not in sys.path:
    sys.path.append(path)

In [3]:
# Print the import search path to confirm the project folder is now on it.
print(sys.path)

['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/usr/local/lib/python3.10/dist-packages/setuptools/_vendor', '/root/.ipython', '/content/drive/MyDrive/ENEL645-garbage-classification-model']


In [4]:
from text_model import *

In [5]:
# Pick the compute device: the first CUDA GPU when PyTorch reports one is
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device("cpu")
print(device)

cpu


In [6]:
# Location of the dataset on disk.
root_path = '/work/TALC/enel645_2024f/garbage_data'

# One sub-folder per split; each starts with '/' so it can be appended to
# root_path directly.
train_folder, val_folder, test_folder = (
    '/CVPR_2024_dataset_Train',
    '/CVPR_2024_dataset_Val',
    '/CVPR_2024_dataset_Test',
)

# Full path of each split folder.
train_path = f"{root_path}{train_folder}"
val_path = f"{root_path}{val_folder}"
test_path = f"{root_path}{test_folder}"

In [None]:
# Load the texts and labels of every split. The project helper is called once
# per folder, in train -> validation -> test order, and each call yields a
# (texts, labels) pair.
(
    (text_train, labels_train),
    (text_val, labels_val),
    (text_test, labels_test),
) = map(read_text_files_with_labels, (train_path, val_path, test_path))

In [None]:
# Length limit handed to every CustomDataset
# (presumably the maximum number of tokens per sample -- TODO confirm in text_model.py).
max_len = 24

# Pretrained tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Wrap each split in a CustomDataset sharing the same tokenizer and max_len.
dataset_train, dataset_val, dataset_test = (
    CustomDataset(texts, labels, tokenizer, max_len)
    for texts, labels in (
        (text_train, labels_train),
        (text_val, labels_val),
        (text_test, labels_test),
    )
)

In [None]:
# Mini-batch size shared by all three loaders.
batch_size = 16

# Only the training loader shuffles its samples.
text_train_loader = DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
)

# Validation and test loaders are built with shuffle=False.
text_val_loader, text_test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (dataset_val, dataset_test)
)

In [None]:
# Run configuration.
num_epoch = 10                          # number of training epochs
text_path = './garbage_text_model.pth'  # checkpoint path handed to the training call

# Best-metric trackers; they are not referenced again in the cells shown here.
best_loss = 1e10  # best loss tracker, starts very large
best_acc = 0      # best accuracy tracker

# Build the classifier with four output classes and move it to the selected
# device (GPU when available, otherwise CPU).
text_model = DistilBERTClassifier(num_classes=4)
text_model.to(device)
print(text_model)

In [None]:
# Training parameters
# The optimizer must be given the parameters of `text_model`, the classifier
# instantiated earlier in this notebook; no variable named `model` exists.
optimizer = optim.Adam(text_model.parameters(), lr=2e-5)  # Adam with learning rate 2e-5
criterion = nn.CrossEntropyLoss()                         # cross-entropy as the loss function

In [None]:
# train process
# Sizes passed to the training helper. They are taken from the datasets so the
# cell works on a fresh kernel.
# NOTE(review): assumes the helper expects the number of samples per split --
# confirm against text_model.py.
train_size = len(dataset_train)
val_size = len(dataset_val)

# Train for `num_epoch` epochs; `text_path` is handed to the helper as the
# checkpoint location. The four return values are unpacked as the training and
# validation loss/accuracy results.
train_loss, train_acc, val_loss, val_acc = text_model.train_multi_epochs_and_save_best_model(
    text_train_loader, text_val_loader, train_size, val_size, criterion, text_path,
    optimizer, num_epoch
)
print()
print('Finished traning')

In [None]:
# Test Evaluation
# NOTE(review): this path differs from the './garbage_text_model.pth' path that
# is passed to the training call -- confirm which file holds the best weights.
text_path = './best_text_model.pth'

# Rebuild the classifier and restore the saved weights. map_location remaps the
# checkpoint tensors onto the active device, so weights saved on a GPU can
# still be loaded when only the CPU is available.
text_model_test = DistilBERTClassifier(num_classes=4)
text_model_test.load_state_dict(torch.load(text_path, map_location=device))
text_model_test.to(device)

# Size of the test split, taken from the dataset so the cell runs on a fresh kernel.
# NOTE(review): assumes the helper expects the number of test samples -- confirm.
test_size = len(dataset_test)

# Predict with the freshly loaded model (not an undefined `model` variable).
labels_test, predict_test = text_model_predict(text_model_test, text_test_loader, test_size)
print(f"Accuracy:  {(predict_test == labels_test).sum()/labels_test.size:.4f}")
cm = confusion_matrix(labels_test, predict_test)  # confusion matrix of true vs. predicted labels
print(cm)