In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources requested by this notebook (same four, same order).
for nltk_resource in ('wordnet', 'omw-1.4', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

## Prepare dataset

In [None]:
# Read the annotation CSV from the working directory; the bare name on the
# last line renders the loaded frame as the cell output.
annotation_csv = "Test_Annotation.csv"
df = pd.read_csv(annotation_csv)
df

In [None]:
# Discard every row that contains a missing value, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)
df

### Pre-process dataset

In [None]:
# Lazily filled cache so the stop-word list and lemmatizer are built only once,
# instead of once per row when preprocess_text is applied to a whole column.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop_words, lemmatizer), creating them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalize a raw comment into a space-separated string of lemmas.

    Steps: lower-case, drop every character that is not a-z or whitespace,
    split on whitespace, remove English stop words, lemmatize each remaining
    token, and join the result with single spaces.

    Parameters
    ----------
    text : str
        The raw comment.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    # After lower-casing, a single filter that keeps only a-z and whitespace
    # yields the same tokens as filtering digits in a separate second pass.
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    # split() already collapses runs of whitespace and trims both ends.
    kept_tokens = [token for token in letters_only.split() if token not in stop_words]
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

def clean_dataframe(df):
    """Return a copy of ``df`` with an added ``clean_comment`` column.

    ``clean_comment`` holds ``preprocess_text`` applied to each value of the
    ``comment`` column. The returned frame has a fresh 0..n-1 index; the
    input frame is left unmodified.
    """
    cleaned_comments = df["comment"].apply(preprocess_text)
    with_clean_text = df.assign(clean_comment=cleaned_comments)
    return with_clean_text.reset_index(drop=True)

In [None]:
# Rebind df to the cleaned copy returned by clean_dataframe (it carries the
# extra "clean_comment" column), then display it.
df = clean_dataframe(df)
df

In [None]:
# Encode each distinct Category value as an integer code.
category_column = df['Category'].astype("category")
df['label'] = category_column.cat.codes

In [None]:
# Show the distinct values found in the Category column.
df['Category'].unique()

In [None]:
# Show the distinct values found in the label column.
df['label'].unique()

In [None]:
# Same cleanup as before, now with the added columns in place: drop rows with
# any missing value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)
df

In [None]:
# Convert each label to a string and prepend the "__label__" tag.
label_as_text = df['label'].astype(str)
df['label'] = '__label__' + label_as_text


In [None]:
# One string per row: "<label> <cleaned text>", with trailing whitespace removed
# (it would otherwise remain when the cleaned text is empty).
labelled_text = df['label'] + ' ' + df['clean_comment']
df['label_text'] = labelled_text.str.rstrip()
df

### Prepare train, validation, and test data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Two stratified 80/20 splits with the same seed: first hold out the test set,
# then carve the validation set out of the remaining training rows.
split_options = dict(test_size=0.2, random_state=42)
train, test = train_test_split(df, stratify=df['label'], **split_options)
train, valid = train_test_split(train, stratify=train['label'], **split_options)

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing the three split files.
os.makedirs("content", exist_ok=True)

# Each file holds two header-less columns: label, clean_comment.
for split_frame, split_path in (
    (train, "content/train.csv"),
    (valid, "content/dev.csv"),
    (test, "content/test.csv"),
):
    split_frame.to_csv(split_path, columns=["label", "clean_comment"], index=False, header=False)

## Prepare dictionary / corpus for training

In [None]:
def _text_label_pairs(frame):
    """Return (clean_comment, label) tuples, both as str, in row order."""
    return [
        (str(text), str(label))
        for text, label in zip(frame['clean_comment'], frame['label'])
    ]


data_train = _text_label_pairs(train)
data_test = _text_label_pairs(test)
data_valid = _text_label_pairs(valid)


In [None]:
# Maps CSV column positions to their roles: column 0 is the text, column 1 the label.
column_name_map = {0: 'text', 1: 'label'}


In [None]:
import os

In [None]:
# Folder that receives the CSV files for the corpus; created if it does not exist yet.
flair_corpus_dir = 'flair_corpus'
os.makedirs(flair_corpus_dir, exist_ok=True)

In [None]:
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.samplers import ImbalancedClassificationDatasetSampler

In [None]:
flair_csv_file = os.path.join(flair_corpus_dir, 'train.csv')
with open(flair_csv_file, 'w', encoding='utf-8') as train_out:
    # Header row first, then one "text,label" line per training example.
    train_out.write('text,label\n')
    train_out.writelines(f'{text},{label}\n' for text, label in data_train)

In [None]:
flair_csv_file = os.path.join(flair_corpus_dir, 'test.csv')
with open(flair_csv_file, 'w', encoding='utf-8') as test_out:
    # Header row first, then one "text,label" line per test example.
    test_out.write('text,label\n')
    test_out.writelines(f'{text},{label}\n' for text, label in data_test)

In [None]:
flair_csv_file = os.path.join(flair_corpus_dir, 'valid.csv')
with open(flair_csv_file, 'w', encoding='utf-8') as valid_out:
    # Header row first, then one "text,label" line per validation example.
    valid_out.write('text,label\n')
    valid_out.writelines(f'{text},{label}\n' for text, label in data_valid)

### Load corpus

In [None]:
label_type = 'label'
# Load the corpus from the three CSV files in flair_corpus_dir.
# - The splits are named explicitly because the validation file is called
#   'valid.csv'; without dev_file it is not picked up as the dev split
#   (Flair's auto-detection appears to look for 'dev' in the file name).
# - skip_header=True keeps the 'text,label' header line of each file from
#   being read as a training example.
corpus = CSVClassificationCorpus(
    flair_corpus_dir,
    column_name_map,
    label_type=label_type,
    train_file='train.csv',
    dev_file='valid.csv',
    test_file='test.csv',
    skip_header=True,
)
label_dict = corpus.make_label_dictionary(label_type=label_type)

## Model Selection and Training

In [None]:
# Document embeddings backed by the 'bert-base-uncased' transformer, created with
# fine_tune=True (other pretrained model names can be substituted here).
document_embeddings = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=True)

# Text classifier built on those embeddings; it predicts the labels collected in
# label_dict for the 'label' label type.
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type)

# Trainer that couples the classifier with the corpus loaded earlier.
trainer = ModelTrainer(classifier, corpus)

In [None]:
# Train the classifier with 'content/flair/' as the output location; the test
# cell later loads final-model.pt from that same directory.
trainer.train('content/flair/', 
              embeddings_storage_mode='gpu',
              learning_rate = 0.005,
              mini_batch_size=16,
              mini_batch_chunk_size=4,
              # NOTE(review): presumably used to counter class imbalance during
              # sampling — confirm against the Flair sampler documentation.
              sampler=ImbalancedClassificationDatasetSampler,
              # train_with_dev= "True",
              max_epochs=10, 
              )

## Test model

In [None]:
from flair.data import Sentence 
from flair.models import TextClassifier 
# Load the model file saved under the training output directory.
c = TextClassifier.load('content/flair/final-model.pt') 

# Wrap an example input in a Sentence object.
s = Sentence('i hate your vlogs') 

# Run the classifier on the sentence.
c.predict(s) 

# Print the labels now attached to the sentence.
print(s.labels) 
