In [32]:
import pandas as pd
import numpy as np

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package wordnet to C:\Users\Gio
[nltk_data]     Gerardino\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Gio
[nltk_data]     Gerardino\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gio
[nltk_data]     Gerardino\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Gio
[nltk_data]     Gerardino\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Prepare dataset

In [80]:
df = pd.read_csv("Train Dataset - Sheet2.csv")
df

Unnamed: 0,comment,clean_comment,Polarity,Category
0,😍,,,
1,the jollibee taste testing is the reason why i...,the jollibee taste testing is the reason why i...,Positive,Praise towards the creator
2,"Yummy balut watching from Davao, philippines",Yummy balut watching from Davao philippines,Positive,Conversation/Anecdote/Opinion/Queries
3,"Mostly, when a product is introduced or seen o...",Mostly when a product is introduced or seen on...,Neutral,Information
4,Hiìiii can you do a challenge that you&#39;ll ...,Hiiii can you do a challenge that you'll only ...,Neutral,Request/Suggestion towards the creator
...,...,...,...,...
9995,Who here came back when they get bored?,Who here came back when they get bored,Neutral,Conversation/Anecdote/Opinion/Queries
9996,Kz sang iconic music in Philippines.. Emotiona...,Kz sang iconic music in Philippines Emotional ...,Positive,Conversation/Anecdote/Opinion/Queries
9997,The piggy though ;(,The piggy though,Negative,Conversation/Anecdote/Opinion/Queries
9998,In the Philippines you could buy a whole Lecho...,In the Philippines you could buy a whole Lecho...,Neutral,Information


## Do not drop N/A Values

In [65]:
# df.dropna(inplace=True)
# df.reset_index(drop=True, inplace=True)
# df

Unnamed: 0,comment,clean_comment,Polarity,Category
0,the jollibee taste testing is the reason why i...,the jollibee taste testing is the reason why i...,Positive,Praise towards the creator
1,"Yummy balut watching from Davao, philippines",Yummy balut watching from Davao philippines,Positive,Conversation/Anecdote/Opinion/Queries
2,"Mostly, when a product is introduced or seen o...",Mostly when a product is introduced or seen on...,Neutral,Information
3,Hiìiii can you do a challenge that you&#39;ll ...,Hiiii can you do a challenge that you'll only ...,Neutral,Request/Suggestion towards the creator
4,i will go to jollibee tomorrow because of this...,i will go to jollibee tomorrow because of this...,Positive,Conversation/Anecdote/Opinion/Queries
...,...,...,...,...
8782,I&#39;m watching this late at night and it&#39...,I'm watching this late at night and it's makin...,Positive,Conversation/Anecdote/Opinion/Queries
8783,Who here came back when they get bored?,Who here came back when they get bored,Neutral,Conversation/Anecdote/Opinion/Queries
8784,Kz sang iconic music in Philippines.. Emotiona...,Kz sang iconic music in Philippines Emotional ...,Positive,Conversation/Anecdote/Opinion/Queries
8785,The piggy though ;(,The piggy though,Negative,Conversation/Anecdote/Opinion/Queries


### Pre-process dataset
- remove special characters and emojis 
- lowercase all texts 

In [81]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    stop_words = set(stopwords.words("english"))
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    text = " ".join(filtered_tokens)
    lemmatizer = WordNetLemmatizer()
    tokens = text.split()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    text = " ".join(lemmatized_tokens)
    return text

def clean_dataframe(df):
    # Drop rows where either "comment" or "clean_comment" is empty
    df.dropna(subset=["comment", "clean_comment"], inplace=True)
    
    # Handle NaN values for other columns
    df["Polarity"].fillna("Unknown", inplace=True)
    df["Category"].fillna("Unknown", inplace=True)
    
    # Apply preprocessing function
    df["clean_comment"] = df["clean_comment"].apply(preprocess_text)
    
    # Reset index if needed
    df.reset_index(drop=True, inplace=True)
    
    return df


In [82]:
df = clean_dataframe(df)
df

Unnamed: 0,comment,clean_comment,Polarity,Category
0,the jollibee taste testing is the reason why i...,jollibee taste testing reason im lol love vlog...,Positive,Praise towards the creator
1,"Yummy balut watching from Davao, philippines",yummy balut watching davao philippine,Positive,Conversation/Anecdote/Opinion/Queries
2,"Mostly, when a product is introduced or seen o...",mostly product introduced seen tv first time l...,Neutral,Information
3,Hiìiii can you do a challenge that you&#39;ll ...,hiiii challenge youll speak filipino fam sibli...,Neutral,Request/Suggestion towards the creator
4,i will go to jollibee tomorrow because of this...,go jollibee tomorrow thisi love spicy chicken ...,Positive,Conversation/Anecdote/Opinion/Queries
...,...,...,...,...
9713,Who here came back when they get bored?,came back get bored,Neutral,Conversation/Anecdote/Opinion/Queries
9714,Kz sang iconic music in Philippines.. Emotiona...,kz sang iconic music philippine emotional amazing,Positive,Conversation/Anecdote/Opinion/Queries
9715,The piggy though ;(,piggy though,Negative,Conversation/Anecdote/Opinion/Queries
9716,In the Philippines you could buy a whole Lecho...,philippine could buy whole lechon dollar weigh...,Neutral,Information


In [83]:
df['label'] =  df['Category'].astype("category").cat.codes

## Categories 
- Label 0 : Conversation/Anecdote/Opinion/Queries
- Label 1 : Criticism towards the creator
- Label 2 : Information
- Label 3 : Praise towards the creator
- Label 4 : Request/Suggestion towards the creator
- Label 5 : Unknown (N/A)

In [84]:
df['Category'].unique()

array(['Praise towards the creator',
       'Conversation/Anecdote/Opinion/Queries', 'Information',
       'Request/Suggestion towards the creator',
       'Criticism towards the creator', 'Unknown'], dtype=object)

In [85]:
df['label'].unique()

array([3, 0, 2, 4, 1, 5], dtype=int8)

In [86]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,comment,clean_comment,Polarity,Category,label
0,the jollibee taste testing is the reason why i...,jollibee taste testing reason im lol love vlog...,Positive,Praise towards the creator,3
1,"Yummy balut watching from Davao, philippines",yummy balut watching davao philippine,Positive,Conversation/Anecdote/Opinion/Queries,0
2,"Mostly, when a product is introduced or seen o...",mostly product introduced seen tv first time l...,Neutral,Information,2
3,Hiìiii can you do a challenge that you&#39;ll ...,hiiii challenge youll speak filipino fam sibli...,Neutral,Request/Suggestion towards the creator,4
4,i will go to jollibee tomorrow because of this...,go jollibee tomorrow thisi love spicy chicken ...,Positive,Conversation/Anecdote/Opinion/Queries,0
...,...,...,...,...,...
9713,Who here came back when they get bored?,came back get bored,Neutral,Conversation/Anecdote/Opinion/Queries,0
9714,Kz sang iconic music in Philippines.. Emotiona...,kz sang iconic music philippine emotional amazing,Positive,Conversation/Anecdote/Opinion/Queries,0
9715,The piggy though ;(,piggy though,Negative,Conversation/Anecdote/Opinion/Queries,0
9716,In the Philippines you could buy a whole Lecho...,philippine could buy whole lechon dollar weigh...,Neutral,Information,2


In [87]:
df['label'] = '__label__' + df['label'].astype(str)


In [88]:
df['label_text'] = df['label'] + ' ' + df['clean_comment']
df['label_text'] = df['label_text'].str.rstrip()
df

Unnamed: 0,comment,clean_comment,Polarity,Category,label,label_text
0,the jollibee taste testing is the reason why i...,jollibee taste testing reason im lol love vlog...,Positive,Praise towards the creator,__label__3,__label__3 jollibee taste testing reason im lo...
1,"Yummy balut watching from Davao, philippines",yummy balut watching davao philippine,Positive,Conversation/Anecdote/Opinion/Queries,__label__0,__label__0 yummy balut watching davao philippine
2,"Mostly, when a product is introduced or seen o...",mostly product introduced seen tv first time l...,Neutral,Information,__label__2,__label__2 mostly product introduced seen tv f...
3,Hiìiii can you do a challenge that you&#39;ll ...,hiiii challenge youll speak filipino fam sibli...,Neutral,Request/Suggestion towards the creator,__label__4,__label__4 hiiii challenge youll speak filipin...
4,i will go to jollibee tomorrow because of this...,go jollibee tomorrow thisi love spicy chicken ...,Positive,Conversation/Anecdote/Opinion/Queries,__label__0,__label__0 go jollibee tomorrow thisi love spi...
...,...,...,...,...,...,...
9713,Who here came back when they get bored?,came back get bored,Neutral,Conversation/Anecdote/Opinion/Queries,__label__0,__label__0 came back get bored
9714,Kz sang iconic music in Philippines.. Emotiona...,kz sang iconic music philippine emotional amazing,Positive,Conversation/Anecdote/Opinion/Queries,__label__0,__label__0 kz sang iconic music philippine emo...
9715,The piggy though ;(,piggy though,Negative,Conversation/Anecdote/Opinion/Queries,__label__0,__label__0 piggy though
9716,In the Philippines you could buy a whole Lecho...,philippine could buy whole lechon dollar weigh...,Neutral,Information,__label__2,__label__2 philippine could buy whole lechon d...


### Prepare train, test data

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
train, valid = train_test_split(train, test_size=0.2, random_state=42, stratify = train['label'])

In [91]:
train.to_csv("content/train.csv", columns=["label","clean_comment"], index=False, header=False)
valid.to_csv("content/dev.csv", columns=["label","clean_comment"], index=False, header=False)
test.to_csv("content/test.csv", columns=["label","clean_comment"], index=False, header=False)

## Prepare dictionary / corpus for training

In [92]:
data_train = [(str(row['clean_comment']), str(row['label'])) for index, row in train.iterrows()]
data_test = [(str(row['clean_comment']), str(row['label'])) for index, row in test.iterrows()]
data_valid = [(str(row['clean_comment']), str(row['label'])) for index, row in valid.iterrows()]


In [93]:
column_name_map = {0: 'text', 1: 'label'}


In [94]:
import os

In [95]:
flair_corpus_dir = 'flair_corpus'
os.makedirs(flair_corpus_dir, exist_ok=True)

In [96]:
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.samplers import ImbalancedClassificationDatasetSampler

In [97]:
flair_csv_file = os.path.join(flair_corpus_dir, 'train.csv')
with open(flair_csv_file, 'w', encoding='utf-8') as f:
    f.write('text,label\n')
    for row in data_train:
        f.write(f'{row[0]},{row[1]}\n')

In [98]:
flair_csv_file = os.path.join(flair_corpus_dir, 'test.csv')
with open(flair_csv_file, 'w', encoding='utf-8') as f:
    f.write('text,label\n')
    for row in data_test:
        f.write(f'{row[0]},{row[1]}\n')

In [99]:
flair_csv_file = os.path.join(flair_corpus_dir, 'valid.csv')
with open(flair_csv_file, 'w', encoding='utf-8') as f:
    f.write('text,label\n')
    for row in data_valid:
        f.write(f'{row[0]},{row[1]}\n')

### Load corpus

In [100]:
label_type = 'label'
# load corpus containing training, test and dev data
corpus = CSVClassificationCorpus(flair_corpus_dir, column_name_map, label_type=label_type)
label_dict = corpus.make_label_dictionary(label_type=label_type)

2024-02-29 00:01:03,144 Reading data from flair_corpus
2024-02-29 00:01:03,145 Train: flair_corpus\train.csv
2024-02-29 00:01:03,145 Dev: None
2024-02-29 00:01:03,146 Test: flair_corpus\test.csv
2024-02-29 00:01:03,177 No dev split found. Using 0% (i.e. 622 samples) of the train split as dev data
2024-02-29 00:01:03,178 Computing label dictionary. Progress:


0it [00:00, ?it/s]
0it [00:00, ?it/s]



531it [00:00, 5305.15it/s]



1062it [00:00, 4274.28it/s]



4575it [00:00, 5825.45it/s]



5209it [00:00, 5982.83it/s]



5598it [00:01, 5570.59it/s]

2024-02-29 00:01:04,191 Dictionary created for label 'label' with 7 values: __label__0 (seen 2515 times), __label__3 (seen 983 times), __label__4 (seen 864 times), __label__5 (seen 543 times), __label__2 (seen 537 times), __label__1 (seen 155 times), label (seen 1 times)





## Model Selection and Training

In [101]:
# Initialize transformer document embeddings (many models are available)
document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)

# Create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type)

# Initialize trainer
trainer = ModelTrainer(classifier, corpus)

In [29]:
from flair.data import Sentence

# Function to calculate accuracy on validation set
def calculate_validation_accuracy(model, validation_data):
    correct = 0
    total = 0
    for text, label in validation_data:
        # Convert text to Flair Sentence object
        sentence = Sentence(text)
        # Predict label for the sentence
        model.predict(sentence)
        
        # Check if sentence has labels
        if sentence.labels:
            predicted_label = sentence.labels[0].value
            if predicted_label == label:
                correct += 1
            total += 1
    # Return accuracy only if predictions were made
    if total > 0:
        return correct / total
    else:
        return 0  # Return 0 accuracy if no predictions were made
    
def calculate_training_accuracy(model, training_data):
    correct = 0
    total = 0
    for text, label in training_data:
        sentence = Sentence(text)
        model.predict(sentence)
        if sentence.labels:
            predicted_label = sentence.labels[0].value
            if predicted_label == label:
                correct += 1
            total += 1
    if total > 0:
        return correct / total
    else:
        return 0


validation_accuracy_list = []
max_epochs = 3

for epoch in range(1, max_epochs + 1):
    trainer.train('content/flair/', 
                  embeddings_storage_mode='gpu',
                  learning_rate=0.001,  
                  mini_batch_size=16,   
                  mini_batch_chunk_size=4,
                  sampler=ImbalancedClassificationDatasetSampler,
                  train_with_dev="True", 
                  max_epochs=1,  # Train for one epoch at a time
                  )
    
    # Calculate validation and training accuracy
    validation_accuracy = calculate_validation_accuracy(trainer.model, data_valid)
    training_accuracy = calculate_training_accuracy(trainer.model, data_train)

    
    validation_accuracy_list.append(validation_accuracy)
    
    print(f"Epoch {epoch}: Training Accuracy - {training_accuracy * 100:.2f}% | Validation Accuracy - {validation_accuracy * 100:.2f}%")


2024-02-28 16:57:59,521 ----------------------------------------------------------------------------------------------------
2024-02-28 16:57:59,523 Model: "TextClassifier(
  (embeddings): TransformerDocumentEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30523, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Drop

100%|██████████| 22/22 [11:03<00:00, 30.14s/it]

2024-02-28 17:30:26,444 
Results:
- F-score (micro) 0.5838
- F-score (macro) 0.6149
- Accuracy 0.5838

By class:
              precision    recall  f1-score   support

  __label__0     0.8476    0.4847    0.6167       815
  __label__2     0.4709    0.6667    0.5519       291
  __label__3     0.6273    0.8313    0.7150       243
  __label__1     0.1179    0.5000    0.1909        46
       label     1.0000    1.0000    1.0000         1

    accuracy                         0.5838      1396
   macro avg     0.6128    0.6965    0.6149      1396
weighted avg     0.7068    0.5838    0.6066      1396

2024-02-28 17:30:26,446 ----------------------------------------------------------------------------------------------------





Epoch 1: Validation Accuracy - 58.92%
2024-02-28 17:31:58,949 ----------------------------------------------------------------------------------------------------
2024-02-28 17:31:58,951 Model: "TextClassifier(
  (embeddings): TransformerDocumentEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30523, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias

100%|██████████| 22/22 [09:19<00:00, 25.45s/it]

2024-02-28 18:00:30,253 
Results:
- F-score (micro) 0.5201
- F-score (macro) 0.5927
- Accuracy 0.5201

By class:
              precision    recall  f1-score   support

  __label__0     0.8793    0.3485    0.4991       815
  __label__2     0.4866    0.6873    0.5698       291
  __label__3     0.6220    0.8601    0.7219       243
  __label__1     0.0985    0.6957    0.1725        46
       label     1.0000    1.0000    1.0000         1

    accuracy                         0.5201      1396
   macro avg     0.6173    0.7183    0.5927      1396
weighted avg     0.7270    0.5201    0.5422      1396

2024-02-28 18:00:30,254 ----------------------------------------------------------------------------------------------------





Epoch 2: Validation Accuracy - 52.20%
2024-02-28 18:01:41,698 ----------------------------------------------------------------------------------------------------
2024-02-28 18:01:41,700 Model: "TextClassifier(
  (embeddings): TransformerDocumentEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30523, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias

100%|██████████| 22/22 [09:18<00:00, 25.38s/it]

2024-02-28 18:30:04,746 
Results:
- F-score (micro) 0.6074
- F-score (macro) 0.6314
- Accuracy 0.6074

By class:
              precision    recall  f1-score   support

  __label__0     0.8640    0.5067    0.6388       815
  __label__2     0.5104    0.6770    0.5820       291
  __label__3     0.6127    0.8724    0.7199       243
  __label__1     0.1351    0.5435    0.2165        46
       label     1.0000    1.0000    1.0000         1

    accuracy                         0.6074      1396
   macro avg     0.6244    0.7199    0.6314      1396
weighted avg     0.7226    0.6074    0.6274      1396

2024-02-28 18:30:04,747 ----------------------------------------------------------------------------------------------------





Epoch 3: Validation Accuracy - 59.91%


## Test model

In [27]:
from flair.data import Sentence
from flair.models import TextClassifier
from sklearn.metrics import classification_report

model = TextClassifier.load('content/flair/final-model.pt')

# Load the test dataset
test_sentences = [text for text, label in data_test]
actual_labels = [label for text, label in data_test]

predicted_labels = []
for text in test_sentences:
    sentence = Sentence(text)
    model.predict(sentence)
    predicted_label = sentence.labels[0].value
    predicted_labels.append(predicted_label)

report = classification_report(actual_labels, predicted_labels)
print(report)


              precision    recall  f1-score   support

  __label__0       0.87      0.70      0.77       815
  __label__1       0.39      0.26      0.31        46
  __label__2       0.56      0.79      0.66       291
  __label__3       0.69      0.85      0.76       243

    accuracy                           0.73      1395
   macro avg       0.63      0.65      0.63      1395
weighted avg       0.76      0.73      0.73      1395

