# 1. Install and load data

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
import os
import shutil
import subprocess
import zipfile

# Define paths
repo_url = "https://github.com/congnghia0609/ntc-scv.git"
repo_path = "./ntc-scv"
data_test_zip = f"{repo_path}/data/data_test.zip"
data_train_zip = f"{repo_path}/data/data_train.zip"
data_dir = "./data/ntc-csv"


def _extract_archive(zip_path, target_dir):
    """Extract ``zip_path`` into ``target_dir``, overwriting existing files.

    Raises:
        RuntimeError: if the archive does not exist or cannot be extracted.
    """
    archive_name = os.path.basename(zip_path)
    if not os.path.exists(zip_path):
        raise RuntimeError(f"Error: {archive_name} not found.")
    print(f"Extracting {archive_name}...")
    try:
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(target_dir)
    except (zipfile.BadZipFile, OSError) as err:
        raise RuntimeError(f"Error: Failed to extract {archive_name}.") from err


# Failures raise RuntimeError rather than calling exit(): inside a notebook
# kernel exit() is not a plain script exit, while an exception simply stops
# the cell and shows what went wrong.
# NOTE(review): only the existence of data_dir is checked, so a run that failed
# halfway through extraction is skipped next time; delete data_dir to retry.
if not os.path.exists(data_dir):
    # Clone the repository if not already cloned
    if not os.path.exists(repo_path):
        print("Cloning repository...")
        try:
            # Argument list (no shell), so paths are never re-parsed by a shell.
            clone_failed = subprocess.run(
                ["git", "clone", repo_url, repo_path]
            ).returncode != 0
        except OSError:
            # e.g. the git executable is not installed
            clone_failed = True
        if clone_failed:
            raise RuntimeError(
                "Error: Failed to clone repository. Please check the URL or your internet connection."
            )
    else:
        print("Repository already cloned, skipping cloning step.")

    # Unzip both archives into the data directory
    _extract_archive(data_test_zip, data_dir)
    _extract_archive(data_train_zip, data_dir)

    # Clean up the cloned repository
    print("Cleaning up repository...")
    try:
        shutil.rmtree(repo_path)
    except OSError as err:
        raise RuntimeError("Error: Failed to remove cloned repository.") from err
    print("Cleanup completed successfully.")
else:
    print("Data directory already exists, skipping download and extraction.")


In [None]:
import os
import pandas as pd

def load_data_from_path(folder_path):
    """Load one text example per file from a folder of label sub-directories.

    Every sub-directory of ``folder_path`` is treated as a class; each file in
    it is read as UTF-8 and its lines are joined with a single space (the
    newline characters returned by ``readlines`` are kept).

    Args:
        folder_path: Directory containing one sub-directory per label.

    Returns:
        pandas.DataFrame with columns ``sentence`` and ``label``. The directory
        name ``'neg'`` is mapped to 0 and ``'pos'`` to 1; any other directory
        name is kept unchanged as the label. Rows are ordered by label
        directory name, then by file name.
    """
    label_ids = {'neg': 0, 'pos': 1}
    examples = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order reproducible between runs and machines.
    for label_name in sorted(os.listdir(folder_path)):
        label_dir = os.path.join(folder_path, label_name)
        if not os.path.isdir(label_dir):
            # Stray files next to the label folders are not classes.
            continue
        label = label_ids.get(label_name, label_name)

        for file_name in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                sentence = ' '.join(f.readlines())
            examples.append({'sentence': sentence, 'label': label})
    # Explicit columns keep the schema stable even when no file was found.
    return pd.DataFrame(examples, columns=['sentence', 'label'])

# Directory of each split. Note that the validation split is read from the
# 'test' folder inside data_train, while the test split comes from data_test.
folder_paths = dict(
    train='./data/ntc-csv/data_train/train',
    valid='./data/ntc-csv/data_train/test',
    test='./data/ntc-csv/data_test/test',
)

# Load the three splits in order: train, valid, test.
train_df, valid_df, test_df = (
    load_data_from_path(folder_paths[split])
    for split in ('train', 'valid', 'test')
)

In [None]:
# Show the loaded training DataFrame as this cell's output.
train_df

# 2. Preprocessing data

In [None]:
!pip install langid

from langid.langid import LanguageIdentifier, model
def identify_vn(df, threshold=0.9):
    """Split ``df`` into rows classified as Vietnamese and all remaining rows.

    Each value of the ``sentence`` column is classified with langid (with
    normalized probabilities). A row counts as Vietnamese only when the
    predicted language is ``'vi'`` and its probability is strictly greater
    than ``threshold``.

    Args:
        df: DataFrame with a ``sentence`` column.
        threshold: Minimum (exclusive) probability for accepting ``'vi'``.

    Returns:
        tuple: ``(vi_df, not_vi_df)``. Both are independent copies, so adding
        columns to them later does not trigger pandas' SettingWithCopyWarning
        and never touches ``df``.
    """
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    not_vi_idx = set()
    for idx, sentence in df['sentence'].items():
        lang, prob = identifier.classify(sentence)
        if lang != 'vi' or prob <= threshold:
            not_vi_idx.add(idx)
    is_other = df.index.isin(not_vi_idx)
    vi_df = df[~is_other].copy()
    not_vi_df = df[is_other].copy()
    return vi_df, not_vi_df

# Split the training frame: rows accepted as Vietnamese vs. the rejected rows.
train_df_vi, train_df_other = identify_vn(train_df)

In [None]:
# Show the training rows that passed the Vietnamese-language filter.
train_df_vi

In [None]:
# Show the training rows rejected by the Vietnamese-language filter.
train_df_other

In [None]:
import re
import string

# Cleaning steps applied by preprocess_text:
# - Remove HTML tags and URLs
# - Remove punctuation and digits
# - Remove special characters, emoticons, ...
# - Normalize whitespace
# - Convert to lowercase

# Patterns are compiled once when the cell runs instead of on every call.
# A URL is "http(s)://" or "www." followed by any run of non-whitespace.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_PATTERN = re.compile(r'<[^<>]+>')
# Maps every ASCII punctuation character and digit to a single space.
_PUNCT_DIGIT_TABLE = str.maketrans(
    dict.fromkeys(string.punctuation + string.digits, ' ')
)
# NOTE(review): the \U000024C2-\U0001F251 range is very wide and covers far
# more than emoji; confirm this is acceptable for the corpus.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F" # emoticons
    u"\U0001F300-\U0001F5FF" # symbols & pictographs
    u"\U0001F680-\U0001F6FF" # transport & map symbols
    u"\U0001F1E0-\U0001F1FF" # flags (iOS)
    u"\U0001F1F2-\U0001F1F4" # Macau flag
    u"\U0001F1E6-\U0001F1FF" # flags
    u"\U0001F600-\U0001F64F"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U0001F1F2"
    u"\U0001F1F4"
    u"\U0001F620"
    u"\u200d"
    u"\u2640-\u2642 "
    "]+", flags=re.UNICODE)


def preprocess_text(text):
    """Clean a raw review string for tokenization.

    The steps run in this order: URLs and HTML tags are replaced by a space,
    ASCII punctuation and digits are replaced by spaces, characters matched by
    the emoji pattern are replaced by a space, runs of whitespace are collapsed
    to single spaces (leading/trailing whitespace removed), and the result is
    lower-cased.

    Args:
        text: The raw sentence (``str``).

    Returns:
        str: The cleaned, lower-cased sentence; may be empty.
    """
    # URLs must go before punctuation stripping, otherwise "://" and "." are
    # already gone and the URL words would stay in the text.
    text = _URL_PATTERN.sub(' ', text)
    text = _HTML_PATTERN.sub(' ', text)
    text = text.translate(_PUNCT_DIGIT_TABLE)
    text = _EMOJI_PATTERN.sub(' ', text)
    text = " ".join(text.split())
    return text.lower()

# Add the cleaned text as a 'preprocess_sentence' column on every split.
# `assign` builds a new frame instead of writing into the existing one:
# train_df_vi is the result of boolean filtering, and setting a column on it
# in place is what triggers pandas' SettingWithCopyWarning. It also keeps this
# cell idempotent when re-run.
train_df_vi = train_df_vi.assign(
    preprocess_sentence=train_df_vi['sentence'].map(preprocess_text)
)
valid_df = valid_df.assign(
    preprocess_sentence=valid_df['sentence'].map(preprocess_text)
)
test_df = test_df.assign(
    preprocess_sentence=test_df['sentence'].map(preprocess_text)
)


# 3. Vectorizer

In [None]:
import torchtext
from torchtext.data.utils import get_tokenizer

# torchtext's 'basic_english' tokenizer; it is used both to build the
# vocabulary and to encode sentences in the code below.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(sentences, tokenizer):
    """Lazily yield ``tokenizer(sentence)`` for each item of ``sentences``, in order."""
    yield from map(tokenizer, sentences)

# build vocab
from torchtext.vocab import build_vocab_from_iterator

# The vocabulary is built from the Vietnamese-filtered training sentences only.
vocab_size = 10000
special_tokens = ['<pad>', '<unk>']
train_token_stream = yield_tokens(train_df_vi['preprocess_sentence'], tokenizer)
vocabulary = build_vocab_from_iterator(
    train_token_stream,
    max_tokens=vocab_size,
    specials=special_tokens,
)
# Tokens that are not in the vocabulary are looked up as '<unk>'.
unk_index = vocabulary['<unk>']
vocabulary.set_default_index(unk_index)

# convert iter into torchtext dataset
from torchtext.data.functional import to_map_style_dataset
def prepare_dataset(df):
    """Yield ``(encoded_sentence, label)`` for every row of ``df``.

    ``encoded_sentence`` is the result of passing the tokenized
    ``preprocess_sentence`` value through the module-level ``vocabulary``;
    ``label`` is the row's ``label`` value.
    """
    for text, target in zip(df['preprocess_sentence'], df['label']):
        yield vocabulary(tokenizer(text)), target

# Wrap each lazy generator with to_map_style_dataset so the DataLoader can use it.
train_dataset = to_map_style_dataset(prepare_dataset(train_df_vi))

valid_dataset = to_map_style_dataset(prepare_dataset(valid_df))

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Collate ``(encoded_sentence, label)`` pairs into padded batch tensors.

    Args:
        batch: Iterable of ``(encoded_sentence, label)`` pairs.

    Returns:
        tuple: ``(encoded_sentences, labels)``. The sentences are int64 tensors
        padded by ``pad_sequence`` (called without ``batch_first``) using the
        vocabulary's ``<pad>`` index; ``labels`` is an int64 tensor.
    """
    pairs = list(batch)
    token_tensors = [
        torch.tensor(tokens, dtype=torch.int64) for tokens, _ in pairs
    ]
    label_tensor = torch.tensor(
        [target for _, target in pairs], dtype=torch.int64
    )
    padded_sentences = pad_sequence(
        token_tensors, padding_value=vocabulary["<pad>"]
    )
    return padded_sentences, label_tensor

batch_size = 128

# The training rows are appended one label directory at a time by
# load_data_from_path, so they are grouped by class. Without shuffling almost
# every batch would contain a single class, hence shuffle=True here.
train_dataloader = DataLoader(
    train_dataset,
    batch_size,
    shuffle=True,
    collate_fn=collate_batch
)

# Evaluation does not depend on batch order, so the validation data stays unshuffled.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size,
    shuffle=False,
    collate_fn=collate_batch
)

# 4. Model

In [None]:
import torch.nn as nn

class TextCNN(nn.Module):
    """Convolutional text classifier with one Conv1d branch per kernel size.

    Token ids are embedded, passed through each convolution with ReLU,
    max-pooled over the whole sequence, concatenated and fed to a linear
    layer that produces ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        kernel_sizes: Iterable of convolution kernel widths.
        num_filters: Output channels of every convolution.
        num_classes: Number of output logits.
    """

    def __init__(
        self,
        vocab_size, embedding_dim, kernel_sizes, num_filters, num_classes,
    ):
        super(TextCNN, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.kernel_sizes = kernel_sizes
        self.num_filters = num_filters
        self.num_classes = num_classes
        # Index 0 is the padding token; its embedding stays a zero vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters,
                kernel_size=k,
                stride=1,
            ) for k in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of shape ``(sequence_len, batch_size)``, i.e.
                sequence-first, as produced by ``pad_sequence`` without
                ``batch_first``.

        Returns:
            Tensor of shape ``(batch_size, num_classes)``.

        Raises:
            ValueError: if ``x`` is not 2-dimensional.
        """
        if x.dim() != 2:
            raise ValueError(
                f"expected a 2-D (sequence_len, batch_size) tensor, got {tuple(x.shape)}"
            )
        x = x.T  # -> (batch_size, sequence_len)

        # Conv1d fails when the sequence is shorter than its kernel, so very
        # short batches are right-padded with the padding index up to the
        # largest kernel size.
        min_len = max(self.kernel_sizes)
        if x.size(1) < min_len:
            x = F.pad(
                x, (0, min_len - x.size(1)), value=self.embedding.padding_idx
            )

        # (batch, seq, emb) -> (batch, emb, seq): Conv1d convolves over the last dim.
        x = self.embedding(x).transpose(1, 2)
        x = [F.relu(conv(x)) for conv in self.conv]
        # Global max pooling over the sequence dimension of every branch.
        x = [F.max_pool1d(c, c.size(-1)).squeeze(dim=-1) for c in x]
        x = torch.cat(x, dim=1)
        x = self.fc(x)
        return x


# 5. Training

In [None]:
import time

def train(
    model,
    optimizer,
    criterion,
    train_dataloader,
    device,
    epoch=0,
    log_interval=50
):
    """Train ``model`` for one pass over ``train_dataloader``.

    Args:
        model: Module to train; called as ``model(inputs)``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(predictions, labels)``.
        train_dataloader: Sized iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to.
        epoch: Epoch number, used only in the progress line.
        log_interval: Print a progress line every ``log_interval`` batches
            (batch 0 is never logged); a value <= 0 disables logging.

    Returns:
        tuple: ``(epoch_acc, epoch_loss)`` — accuracy over all samples seen in
        the epoch and the mean of the per-batch losses.
    """
    model.train()
    # Epoch totals and logging-window totals are tracked separately so that
    # resetting the window after a progress line cannot affect the returned
    # epoch accuracy.
    epoch_correct, epoch_count = 0, 0
    window_correct, window_count = 0, 0
    losses = []
    start_time = time.time()

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        predictions = model(inputs)

        loss = criterion(predictions, labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

        batch_correct = (predictions.argmax(1) == labels).sum().item()
        batch_count = labels.size(0)
        epoch_correct += batch_correct
        epoch_count += batch_count
        window_correct += batch_correct
        window_count += batch_count

        # Modulo (not bitwise AND): log on every log_interval-th batch.
        if log_interval > 0 and idx > 0 and idx % log_interval == 0:
            elapsed = time.time() - start_time
            print(
                f"| epoch {epoch:3d} | {idx:5d}/{len(train_dataloader):5d} batches {elapsed:8.3f} sec"
                f"| accuracy {window_correct / window_count:8.3f}"
            )
            window_correct, window_count = 0, 0
            start_time = time.time()

    epoch_acc = epoch_correct / epoch_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc, epoch_loss

def evaluate(model, criterion, valid_dataloader, device):
    """Evaluate ``model`` on ``valid_dataloader`` without computing gradients.

    Args:
        model: Module to evaluate; switched to eval mode.
        criterion: Loss called as ``criterion(predictions, labels)``.
        valid_dataloader: Iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(epoch_acc, epoch_loss)`` — accuracy over all samples and the
        mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for inputs, labels in valid_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            logits = model(inputs)
            batch_losses.append(criterion(logits, labels).item())

            n_correct += (logits.argmax(1) == labels).sum().item()
            n_seen += labels.size(0)

    accuracy = n_correct / n_seen
    mean_loss = sum(batch_losses) / len(batch_losses)
    return accuracy, mean_loss

In [None]:
# Model and optimization settings.
num_classes = 2
vocab_size = len(vocabulary)
embedding_dim = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TextCNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    kernel_sizes=[3, 4, 5],
    num_filters=100,
    num_classes=num_classes
)
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

num_epochs = 10
save_model = './model'
os.makedirs(save_model, exist_ok=True)
checkpoint_path = save_model + '/text_cnn_model.pt'

train_accs, train_losses = [], []
eval_accs, eval_losses = [], []
# Start at +inf so the first epoch always produces a checkpoint.
best_loss_eval = float('inf')

for epoch in range(1, num_epochs + 1):
    epoch_start_time = time.time()

    train_acc, train_loss = train(
        model,
        optimizer,
        criterion,
        train_dataloader,
        device,
        epoch
    )
    train_accs.append(train_acc)
    train_losses.append(train_loss)

    eval_acc, eval_loss = evaluate(
        model,
        criterion,
        valid_dataloader,
        device
    )
    eval_accs.append(eval_acc)
    eval_losses.append(eval_loss)

    # Keep only the weights with the lowest validation loss seen so far; the
    # best value must be updated, otherwise every epoch overwrites the file.
    if eval_loss < best_loss_eval:
        best_loss_eval = eval_loss
        torch.save(model.state_dict(), checkpoint_path)

    print('-' * 60)
    print(
        f"| End of epoch {epoch:3d} | Time:{time.time() - epoch_start_time}s | Train Accuracy {train_acc:8.3f} | Train Loss {train_loss:8.3f} "
        f"| Valid Accuracy {eval_acc:8.3f} | Valid Loss {eval_loss:8.3f}"
    )
    print('-' * 60)

# Restore the best checkpoint; map_location lets it load on the current device.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()