In [1]:
import pandas as pd
import numpy as np
import json
import codecs
import os
import re
import unicodedata
import wandb


import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import torch.nn as nn


from pyvi import ViTokenizer, ViPosTagger
from datasets import load_dataset

import dgl
from dgl.dataloading import GraphDataLoader
from dgl.data import DGLDataset
from dgl.nn.pytorch import GraphConv, GATConv, GatedGraphConv, DotGatConv
from dgl.nn import AvgPooling, MaxPooling

from underthesea import text_normalize
from nltk.corpus import words as eng_word

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

from pyvi import ViTokenizer, ViPosTagger
import pickle

import warnings
from tqdm import tqdm
import operator
import scipy.sparse as sp

import matplotlib.pyplot as plt
import networkx as nx

from sklearn.model_selection import StratifiedKFold
import sklearn.metrics


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/duongphuonggiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the dataset splits and convert the ones used below to pandas DataFrames.
uit = load_dataset("uitnlp/vietnamese_students_feedback")
uit_train = uit["train"]
uit_test = uit["test"]
train = uit_train.to_pandas()
# to_pandas must be *called*; without the parentheses `test` would be a bound
# method and any later `test["sentence"]` lookup would fail.
test = uit_test.to_pandas()
# Cross-validation folds are built from `train` further down, so no separate
# validation frame is created here.

In [3]:
# --- Slang ("teencode") dictionary -------------------------------------------
# Each non-empty line is "<key> <replacement words...>"; the first token is the
# key and the remaining tokens, joined by single spaces, are the replacement.
teencode = dict()

with codecs.open("teencode.txt", "r", "utf-8") as f:
    tc_temp = f.readlines()

for line in tc_temp:
    # str.split() with no argument splits on tabs/spaces and discards the
    # trailing "\r"/"\n", so no manual character replacement is needed.
    line_w = line.split()
    if not line_w:
        # Blank line: nothing to register (indexing line_w[0] would raise).
        continue
    teencode[line_w[0]] = " ".join(line_w[1:])

# --- Vietnamese stop words ----------------------------------------------------
# Multi-word entries are stored with "_" between the words, which is the form
# the stop-word lookup in TextProcessor uses for two-word matches.
with open("vn_stopword.txt", "r", encoding="utf-8") as f:
    stopword_lines = f.readlines()

stopword = set()
for raw_line in tqdm(stopword_lines):
    # strip() instead of [:-1]: a final line without a newline keeps its last
    # character, and "\r\n" endings do not leave a stray "\r".
    entry = raw_line.strip()
    if entry:
        stopword.add(entry.replace(" ", "_"))

# --- Common English words -----------------------------------------------------
with open("common_eng.txt", "r", encoding="utf-8") as f:
    eng = [raw_line.strip() for raw_line in tqdm(f.readlines())]
eng = [word for word in eng if word]
eng_common = set(eng)


100%|██████████| 1959/1959 [00:00<00:00, 1670049.09it/s]
100%|██████████| 9635/9635 [00:00<00:00, 2796105.93it/s]


In [4]:
class TextProcessor:
    """Rule-based cleaner that rewrites one sentence held in ``self.text``.

    Typical use: assign the raw sentence to ``text`` and call ``runall()``.
    Every step reads ``self.text`` and writes the transformed string back.

    Args:
        eng_common: container of English words; tokens found in it are
            lemmatized instead of having repeated characters collapsed.
        teencode: mapping from a slang token (or ``"w1_w2"`` for a two-word
            slang) to its replacement text.
        stopword: container of stop words; two-word entries use ``"w1_w2"``.
    """

    def __init__(self, eng_common, teencode, stopword):
        self.eng_common = eng_common
        self.teencode = teencode
        self.stopword = stopword
        self.lemmatizer = WordNetLemmatizer()
        self.text = ""

    def non_keo_dai(self):
        """Collapse runs of a repeated character inside each token.

        Tokens in ``eng_common`` are lemmatized and otherwise left alone.
        Tokens containing ``"http"`` are skipped so ``find_link`` can still
        match them. Numeric characters are never collapsed.
        """
        tokens = self.text.split()
        for i, token in enumerate(tokens):
            if token in self.eng_common:
                tokens[i] = self.lemmatizer.lemmatize(token)
                continue
            # Test the token, not the whole sentence: otherwise a single URL
            # anywhere in the text disables collapsing for every other word.
            if "http" in token:
                continue
            collapsed = ""
            for c in token:
                if not collapsed or c != collapsed[-1] or c.isnumeric():
                    collapsed += c
            tokens[i] = collapsed
        self.text = " ".join(tokens)

    def find_link(self):
        """Remove every substring of ``self.text`` matched by the URL regex."""
        regex = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
        for match in re.findall(regex, self.text):
            self.text = self.text.replace(match, "")

    def keep_text_only(self):
        """Replace every character that is not a letter, ``.`` or space by a space."""
        vietnamese_letters = "áàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ"
        kept = []
        for c in self.text:
            if c.isalpha() or c in (".", " ") or c in vietnamese_letters:
                kept.append(c)
            else:
                kept.append(" ")
        self.text = "".join(kept)

    def remove_tag_emoji_nl(self):
        """Blank out tag/emoji placeholders and ``/``; turn newlines into ``.``."""
        replacements = (
            ("<TAG>", " "),
            ("<EMOJI>", " "),
            ("<Emoji>", " "),
            ("\n", "."),
            ("/", " "),
        )
        for old, new in replacements:
            self.text = self.text.replace(old, new)

    def checkspam(self):
        """Drop the ``.`` after segments of at most two non-space characters.

        The text is split on ``.``; segments whose stripped length is <= 2 are
        appended without a period, all others get their period back.
        """
        rebuilt = ""
        for segment in self.text.split("."):
            if len(segment.strip()) <= 2:
                rebuilt += segment
            else:
                rebuilt += segment + "."
        self.text = rebuilt

    def rmv_teencode(self):
        """Replace slang tokens using ``self.teencode``.

        Adjacent token pairs are looked up first as ``"w1_w2"``, then single
        tokens are looked up on their own.
        """
        words = self.text.split()
        # Two-word slang: the replacement is spread over the two slots.
        for i in range(1, len(words)):
            key = words[i - 1] + "_" + words[i]
            if key not in self.teencode:
                continue
            parts = self.teencode[key].split(" ")
            # The first replacement word always overwrites the first slang
            # token; previously this was skipped for multi-word replacements,
            # leaving the original slang token in the output.
            words[i - 1] = parts[0]
            words[i] = " ".join(parts[1:]) if len(parts) > 1 else " "

        # Single-word slang.
        for i in range(len(words)):
            if words[i] in self.teencode:
                words[i] = self.teencode[words[i]]
        self.text = " ".join(words)

    def rm_vn_stop_word(self):
        """Blank out stop words: two-word entries (``"w1_w2"``) first, then single words."""
        words = self.text.split()
        for i in range(1, len(words)):
            if words[i - 1] + "_" + words[i] in self.stopword:
                words[i - 1] = ""
                words[i] = ""

        for i in range(len(words)):
            if words[i] in self.stopword:
                words[i] = ""
        self.text = " ".join(words)

    def refine_sentence(self):
        """Run a final slang/stop-word/character pass and return the result.

        Returns:
            The stripped text, or ``""`` when the text has at most one
            character or consists only of digits, spaces and periods.
        """
        self.rmv_teencode()
        self.rm_vn_stop_word()
        self.text = " ".join(self.text.split())
        self.rmv_teencode()
        self.keep_text_only()
        self.text = " ".join(self.text.split())
        if len(self.text) <= 1:
            return ""

        has_word_char = any(t not in "0123456789 ." for t in self.text)
        return self.text.strip() if has_word_char else ""

    def runall(self):
        """Apply every cleaning step in order and return the cleaned sentence."""
        self.non_keo_dai()
        self.find_link()
        self.keep_text_only()
        # NOTE(review): keep_text_only has already turned "<", ">", "/" and
        # "\n" into spaces, so remove_tag_emoji_nl has nothing left to match
        # here. The order is kept as-is; confirm whether it was meant to run first.
        self.remove_tag_emoji_nl()
        self.checkspam()
        self.rmv_teencode()
        self.rm_vn_stop_word()
        return self.refine_sentence()


In [5]:
def clean(data):
    """Clean and word-segment every sentence in ``data``.

    Each sentence goes through ``TextProcessor.runall()`` and the result is
    passed to ``ViTokenizer.tokenize``.

    Args:
        data: iterable of raw sentences with a length (e.g. a pandas Series
            or a list). It is only read, never modified.

    Returns:
        list[str]: one cleaned, tokenized sentence per input item, in order.
    """
    processor = TextProcessor(eng_common, teencode, stopword)
    cleaned_sentences = []

    # Iterate over the values instead of indexing with data[i]: label-based
    # lookup on a Series fails as soon as the index is not 0..n-1.
    for sentence in tqdm(data, total=len(data)):
        processor.text = sentence
        cleaned_sentence = processor.runall()
        cleaned_sentences.append(ViTokenizer.tokenize(cleaned_sentence))

    return cleaned_sentences

# NOTE: this overwrites the raw text, so re-running the cell cleans the
# already-cleaned sentences again; reload `train` first if a rerun is needed.
train["sentence"] = clean(train["sentence"])

100%|██████████| 11426/11426 [00:02<00:00, 5149.19it/s]


In [6]:
# Display the cleaned, tokenized sentences produced by clean().
train["sentence"]

0                                       slide giáo_trình .
1                     nhiệt_tình giảng_dạy gũi sinh_viên .
2                                     đi học full chuyên .
3                áp_dụng công_nghệ thông_thiết giảng_dạy .
4                               thầy giảng tập ví_dụ lớp .
                               ...                        
11421              môn game học hai hài vô chuyên_nghiệp .
11422                                                     
11423                                           giao tập .
11424                           giáo_viên dạy nhiệt_tình .
11425    gói gọn doubledot tận_tình trình_độ nhu_cầu_mô...
Name: sentence, Length: 11426, dtype: object

## Pretrained word embeddings (word2vec, 100 dimensions)


In [7]:
def load_embeddings(path):
    """Return the object stored in the pickle file at ``path``.

    Caution: unpickling can execute arbitrary code, so only use this on
    files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def check_coverage (vocab, embeddings_index):
    """Report how much of ``vocab`` has an entry in ``embeddings_index``.

    Prints the share of distinct words and the share of total occurrences
    that are covered (both 0.00% when ``vocab`` is empty).

    Args:
        vocab: mapping ``word -> occurrence count``.
        embeddings_index: mapping looked up with ``embeddings_index[word]``;
            a ``KeyError`` marks the word as out-of-vocabulary.

    Returns:
        list[tuple[str, int]]: the uncovered ``(word, count)`` pairs, most
        frequent first.
    """
    covered_words = 0
    covered_occurrences = 0
    oov = {}
    oov_occurrences = 0

    for word in tqdm(vocab):
        try:
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "no embedding"; any other error is a
            # real problem and is allowed to surface.
            oov[word] = vocab[word]
            oov_occurrences += vocab[word]
        else:
            covered_words += 1
            covered_occurrences += vocab[word]

    # Guard both ratios so an empty vocabulary does not divide by zero.
    vocab_ratio = covered_words / len(vocab) if len(vocab) else 0.0
    total_occurrences = covered_occurrences + oov_occurrences
    text_ratio = covered_occurrences / total_occurrences if total_occurrences else 0.0

    print("Found embeddings for {:.2%} of vocab".format(vocab_ratio))
    print ("Found embeddings for {:.2%} of all text".format(text_ratio))
    # Ascending sort then reverse keeps the original ordering among ties.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x

def build_vocab(sentences, verbose=True):
    """Count how often each item occurs across ``sentences``.

    Args:
        sentences: iterable of iterables (e.g. lists of tokens).
        verbose: when False the progress bar is disabled.

    Returns:
        dict mapping each item to its number of occurrences.
    """
    counts = {}
    show_progress = verbose
    for tokens in tqdm(sentences, disable=(not show_progress)):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

# Text file with one "<word> <v1> ... <v100>" entry per line. The variable
# keeps its historical name even though the file holds word2vec vectors.
glove_path="word2vec_vi_words_100dims.txt"
# Number of trailing float columns on each line of the file.
EMBEDDING_FILE_DIM = 100

word_embeddings={}

# Explicit UTF-8: the vocabulary is Vietnamese, so the platform default
# encoding must not be relied on. Iterating the handle streams the file
# instead of materialising every line with readlines().
with open(glove_path, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        data=line.split()
        # Skip lines without a word in front of the vector (e.g. a
        # "<count> <dim>" header or a blank line); they would otherwise be
        # stored under the empty-string key with a malformed vector.
        if len(data) <= EMBEDDING_FILE_DIM:
            continue
        # Everything before the last EMBEDDING_FILE_DIM fields is the word,
        # which may itself contain spaces.
        word_embeddings[" ".join(data[:-EMBEDDING_FILE_DIM])]=list(map(float, data[-EMBEDDING_FILE_DIM:]))

100%|██████████| 1587508/1587508 [03:01<00:00, 8727.50it/s] 


In [8]:
# Check embedding coverage of the training vocabulary: count tokens per
# sentence, then list the most frequent tokens with no pretrained vector.
vocab = build_vocab([sentence.split() for sentence in train["sentence"]])
oov = check_coverage(vocab, word_embeddings)
# Show the ten most frequent uncovered tokens.
# NOTE: `vocab` and `oov` are reassigned with different contents in the
# graph-building section below.
oov[:10]

100%|██████████| 11426/11426 [00:00<00:00, 647955.94it/s]
100%|██████████| 2735/2735 [00:00<00:00, 113911.14it/s]

Found embeddings for 85.41% of vocab
Found embeddings for 97.80% of all text





[('wzjwz', 236),
 ('doubledot', 87),
 ('hòa', 71),
 ('học_sinh_viên', 53),
 ('trang_thiết', 41),
 ('minh_họa', 38),
 ('khóa', 33),
 ('truyền_cảm_hứng', 30),
 ('tận_tụy', 30),
 ('thực_hành_lý_thuyết', 28)]

In [9]:
# train_sentence=train["sentence"].apply(lambda x:" ".join(x.split("_")))

In [10]:
# #check coverage
# vocab = build_vocab([sentence.split() for sentence in train_sentence])
# oov = check_coverage(vocab, word_embeddings)
# oov[:10]

## Build graph

In [61]:
#Args for building graphs and training

class args:
    """Hyper-parameters read as class attributes by the graph-building and training cells."""
    epochs=30           # number of iterations of the epoch loop in train_fold
    lr=1e-3             # learning rate passed to torch.optim.Adam in train_fold
    batch_size=64       # batch size of both GraphDataLoader instances
    embedding_dim=100   # length of the random vectors generated for the `oov` table
    n_folds=5           # n_splits of the StratifiedKFold split
    window_size=3       # sliding-window width used by build_graph
    num_heads=8         # not read in the visible cells; presumably for GATClassifier — TODO confirm
    hidden_dim=50       # not read in the visible cells; presumably the model hidden size — TODO confirm

In [12]:
word_embeddings_dim=args.embedding_dim

# NOTE(review): the training sentences are concatenated with themselves, so
# indices [len(train):] repeat the training documents. Kept as-is because
# other cells may index into the second half; confirm whether the test
# sentences were meant to be appended instead.
shuffle_doc_words_list=list(train["sentence"].values)+list(train["sentence"].values)

# Collect every distinct token of the corpus.
word_set=set()
for doc_words in shuffle_doc_words_list:
    word_set.update(doc_words.split())

# Sort the vocabulary: iterating a set of strings gives a different order in
# every Python process, which would make word ids change between runs.
vocab=sorted(word_set)
vocab_size=len(vocab)

# token -> global integer id (its position in `vocab`)
word_id_map={word: i for i, word in enumerate(vocab)}

# Fallback vector for every token, used by build_graph when the token has no
# pretrained embedding. A dedicated seeded generator makes these vectors
# reproducible without touching NumPy's global random state.
# NOTE: this rebinds `oov`, which earlier held the coverage report.
oov_rng=np.random.default_rng(42)
oov={v: oov_rng.uniform(-0.1 , 0.1, word_embeddings_dim) for v in vocab}

window_size=args.window_size


In [13]:
def build_graph (start, end, truncate =False, weighted_graph=True, max_len=None):
    """Build one word co-occurrence graph per document.

    Reads the module-level ``shuffle_doc_words_list``, ``window_size``,
    ``word_id_map``, ``vocab``, ``word_embeddings`` and ``oov``.

    For each document, the distinct words are the nodes (numbered in order of
    first appearance) and two different words are connected in both
    directions whenever they fall inside the same sliding window.

    Args:
        start: index of the first document (inclusive).
        end: index one past the last document.
        truncate: when True, only the first ``max_len`` words are used.
        weighted_graph: when True edge weights are co-occurrence counts,
            otherwise every edge has weight 1.
        max_len: truncation length; when None the module-level
            ``MAX_TRUNG_LEN`` is used (it must then be defined).

    Returns:
        (x_adj, x_feature): two lists with exactly one entry per document in
        ``range(start, end)``. ``x_adj[k]`` is a ``(nodes, nodes)`` CSR
        adjacency matrix and ``x_feature[k]`` the list of node feature
        vectors in node order. A document without words yields a ``(0, 0)``
        matrix and an empty feature list.
    """
    x_adj=[]
    x_feature=[]

    for i in tqdm(range(start, end)):
        doc_words=shuffle_doc_words_list[i].split()
        if truncate:
            limit=MAX_TRUNG_LEN if max_len is None else max_len
            doc_words=doc_words[:limit]
        doc_len=len(doc_words)

        # Distinct words in first-appearance order; dict.fromkeys keeps the
        # node numbering deterministic, unlike list(set(...)).
        doc_vocab=list(dict.fromkeys(doc_words))
        doc_nodes=len(doc_vocab)
        doc_word_id_map={word: j for j, word in enumerate(doc_vocab)}

        # Sliding windows; a short document is a single window.
        if doc_len<=window_size:
            windows=[doc_words]
        else:
            windows=[doc_words[j:j+window_size] for j in range(doc_len-window_size+1)]

        # (global id, global id) -> number of co-occurrences
        word_pair_count={}
        for window in windows:
            for p in range(1,len(window)):
                for q in range(p):
                    word_p_id=word_id_map[window[p]]
                    word_q_id=word_id_map[window[q]]
                    if word_p_id == word_q_id:
                        continue
                    # Count the pair once per direction. The second key used
                    # to repeat (q, p), so the reverse edge was never created.
                    for word_pair_key in ((word_q_id,word_p_id),(word_p_id,word_q_id)):
                        word_pair_count[word_pair_key]=word_pair_count.get(word_pair_key,0)+1

        row=[]
        col=[]
        weight=[]
        for (src_id, dst_id), count in word_pair_count.items():
            # Translate global word ids to this document's node ids.
            row.append(doc_word_id_map[vocab[src_id]])
            col.append(doc_word_id_map[vocab[dst_id]])
            weight.append(count if weighted_graph else 1.)
        adj=sp.csr_matrix((weight,(row,col)),shape=(doc_nodes, doc_nodes))

        # Node features in node order: pretrained vector if available,
        # otherwise the random fallback vector.
        features=[word_embeddings[w] if w in word_embeddings else oov[w] for w in doc_vocab]

        # Append once per document (not once per node) so that position k in
        # the outputs always corresponds to document start + k.
        x_adj.append(adj)
        x_feature.append(features)
    return x_adj,x_feature
        

In [14]:
# Build adjacency matrices and node features for documents 0 .. len(train)-1.
print("building graph for training")

x_adj,x_feature=build_graph(start=0, end=len(train),weighted_graph=True)

building graph for training


100%|██████████| 11426/11426 [00:02<00:00, 5627.32it/s]


In [15]:
# def draw_graph(x_adj, x_feature):
#     for adj, features in zip(x_adj, x_feature):
#         # Chuyển đổi ma trận sparse thành ma trận dense
#         adj_dense = adj.toarray()
        
#         # Tạo đồ thị từ ma trận kề
#         G = nx.Graph(adj_dense)  # Tạo đồ thị từ ma trận kề
        
#         pos = nx.spring_layout(G)  # Phân bố lại các nút

#         plt.figure(figsize=(8, 6))
#         nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=1500, edge_color='black', linewidths=1,
#                 font_size=10)

#         # Thêm thông tin đặc trưng cho mỗi nút
#         for i, node in enumerate(G.nodes()):
#             feature = features[i]
#             plt.text(pos[node][0], pos[node][1], str(feature), fontsize=8, ha='center', va='center')

#         plt.title("Word Graph", fontsize=15)
#         plt.show()

# # Sử dụng hàm draw_graph để vẽ đồ thị từ x_adj và x_feature
# draw_graph(x_adj, x_feature)


In [16]:
def check_train_test_word_overlap(train, test_data=None):
    """Print the share of distinct test words that also occur in the training text.

    Args:
        train: object whose ``["sentence"]`` item is an iterable of
            whitespace-separable strings.
        test_data: same structure for the test split. When None, the
            module-level ``test`` is used.

    Prints ``overlap: <ratio>`` rounded to three decimals; the ratio is 0.0
    when the test split contains no words.
    """
    if test_data is None:
        test_data = test

    # Sets give O(1) membership tests and de-duplicate on the fly.
    train_unique_words = set()
    for text in train["sentence"]:
        train_unique_words.update(text.split())

    test_unique_words = set()
    for text in test_data["sentence"]:
        test_unique_words.update(text.split())

    overlap = test_unique_words & train_unique_words
    ratio = len(overlap)/len(test_unique_words) if test_unique_words else 0.0
    print("overlap:", np.round(ratio,3))

#maybe plot chart

In [17]:
# Display the training frame after cleaning.
train

Unnamed: 0,sentence,sentiment,topic
0,slide giáo_trình .,2,1
1,nhiệt_tình giảng_dạy gũi sinh_viên .,2,0
2,đi học full chuyên .,0,1
3,áp_dụng công_nghệ thông_thiết giảng_dạy .,0,0
4,thầy giảng tập ví_dụ lớp .,2,0
...,...,...,...
11421,môn game học hai hài vô chuyên_nghiệp .,0,1
11422,,2,0
11423,giao tập .,0,0
11424,giáo_viên dạy nhiệt_tình .,2,0


In [18]:
# Assign every training row to one of args.n_folds folds, stratified on the
# topic label so each fold keeps the overall class proportions.
skf = StratifiedKFold(n_splits = args.n_folds,shuffle=True, random_state = 42)
train['fold'] = -1

# skf.split yields *positional* row indices, so write through .iloc; .loc
# would interpret them as index labels and only agrees with positions when
# the frame has a default 0..n-1 index.
fold_col = train.columns.get_loc('fold')
for idx, (_, val_idx) in enumerate(skf.split(train, train['topic'])):
    train.iloc[val_idx, fold_col] = idx

# Sanity check: print the topic distribution of each fold.
for fold in range(args.n_folds):
    print(train[train['fold']==fold]['topic'].value_counts(normalize = True))


topic
0    0.714348
1    0.192913
3    0.048994
2    0.043745
Name: proportion, dtype: float64
topic
0    0.715098
1    0.192560
3    0.049015
2    0.043326
Name: proportion, dtype: float64
topic
0    0.714661
1    0.192560
3    0.049453
2    0.043326
Name: proportion, dtype: float64
topic
0    0.714661
1    0.192560
3    0.049453
2    0.043326
Name: proportion, dtype: float64
topic
0    0.714661
1    0.192560
3    0.049015
2    0.043764
Name: proportion, dtype: float64


In [49]:
#create dataset and model

class GraphDataset(DGLDataset):
    """Dataset that turns (adjacency matrix, node features) pairs into DGL graphs.

    The base-class ``__init__`` is not called; the instance only stores the
    three sequences it is given.

    Args:
        x_adj: sequence of SciPy sparse adjacency matrices, one per graph.
        x_feature: sequence of per-graph node feature lists, aligned with
            ``x_adj``.
        topic: optional sequence of integer labels aligned with ``x_adj``.
    """
    def __init__(self,x_adj, x_feature, topic=None):
        self.adj=x_adj
        self.node=x_feature
        self.topic=topic
        # Width of a node feature vector, taken from the first graph that has
        # nodes; needed to build a correctly shaped tensor for empty graphs.
        self._feat_dim=next((len(feats[0]) for feats in x_feature if len(feats) > 0), 0)

    def __len__(self):
        return len(self.adj)

    def __getitem__(self, idx):
        """Return graph ``idx`` (and its label tensor when labels were given)."""
        adj_sci=self.adj[idx]
        G=dgl.from_scipy(adj_sci)

        node_feats=self.node[idx]
        if len(node_feats) > 0:
            G.ndata["feat"]=torch.stack([torch.tensor(x,dtype=torch.float) for x in node_feats])
        else:
            # torch.stack raises on an empty list; a document with no words
            # gets a (0, feat_dim) tensor instead.
            # NOTE(review): assumes dgl.from_scipy accepts a (0, 0) matrix — TODO confirm.
            G.ndata["feat"]=torch.zeros((0, self._feat_dim), dtype=torch.float)

        if self.topic is not None:
            label=self.topic[idx]
            return G, torch.tensor(label,dtype=torch.long)

        return G
    
class GCN(nn.Module):
    """Two GraphConv layers, average pooling over nodes, then a linear classifier.

    Args:
        in_dim: size of the input node features.
        hidden_dim: output size of both graph convolutions.
        n_classes: number of output logits.
    """
    def __init__(self, in_dim, hidden_dim, n_classes):
        super().__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, hidden_dim)
        self.avgpooling = AvgPooling()
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward (self, g, h):
        """Return the classifier output for graph ``g`` with node features ``h``."""
        # Both convolutions are followed by a ReLU.
        for conv in (self.conv1, self.conv2):
            h = F.relu(conv(g, h))
        pooled = self.avgpooling(g, h)
        return self.classify(pooled)

class GATClassifier(nn.Module):
    """Two GATConv layers, dropout, average pooling, then a linear classifier.

    Args:
        in_dim: size of the input node features.
        hidden_dim: per-head output size of the attention layers.
        num_heads: number of heads of the first attention layer (the second
            layer uses a single head).
        n_classes: number of output logits.
    """
    def __init__(self, in_dim, hidden_dim, num_heads, n_classes):
        super().__init__()
        self.hid_dim = hidden_dim
        self.gat1 = GATConv(in_dim, hidden_dim, num_heads)
        # The second layer consumes the flattened output of all heads.
        self.gat2 = GATConv(hidden_dim*num_heads, hidden_dim, 1)
        self.avgpooling = AvgPooling()
        self.drop = nn.Dropout(p = 0.3)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        """Return the classifier output for graph ``g`` with node features ``h``."""
        # First dimension of h, kept fixed while the remaining dimensions of
        # each attention output are flattened into one.
        n_rows = h.shape[0]
        h = F.relu(self.gat1(g, h)).reshape(n_rows, -1)
        h = F.relu(self.gat2(g, h)).reshape(n_rows, -1)
        pooled = self.avgpooling(g, self.drop(h))
        return self.classify(pooled)

In [73]:
def train_fold (args, adj_list, node_list,model, fold=0):
    """Train ``model`` on all folds except ``fold`` and validate on ``fold``.

    Reads the module-level ``train`` frame (columns ``fold`` and ``topic``).
    The model weights are saved to ``fold-{fold}.pt`` every time the
    validation MRR improves.

    Args:
        args: object exposing ``batch_size``, ``lr`` and ``epochs``.
        adj_list: per-document adjacency matrices, indexed by the values of
            ``train.index``.
        node_list: per-document node feature lists, aligned with ``adj_list``.
        model: the network to optimise (updated in place).
        fold: number of the fold held out for validation.

    Returns:
        dict with keys ``loss``, ``f1``, ``auc`` and ``mrr``; each value is a
        list holding one ``(train, validation)`` tuple per epoch.
    """
    train_idx=list(train[train["fold"]!=fold].index)
    val_idx=list(train[train["fold"]==fold].index)

    print("train:", len(train_idx))
    print("val:",len(val_idx))

    n_classes=train["topic"].nunique()

    train_adj, val_adj=[adj_list[i] for i in train_idx],[adj_list[i] for i in val_idx]
    train_node, val_node=[node_list[i] for i in train_idx],[node_list[i] for i in val_idx]
    # Labels in the same row order as the index lists above.
    train_label=train.loc[train_idx, "topic"].values
    val_label=train.loc[val_idx, "topic"].values

    train_dataset=GraphDataset(train_adj,train_node, train_label)
    val_dataset=GraphDataset(val_adj,val_node, val_label)

    train_loader=GraphDataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    # Evaluation does not need shuffling; a fixed order keeps the per-batch
    # averaged metrics identical from one epoch to the next for the same weights.
    val_loader=GraphDataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    criterion=CrossEntropyLoss()
    optimizer=torch.optim.Adam(model.parameters(),lr=args.lr)
    scheduler=None

    best_val_mrr=0

    loss=[]
    f1=[]
    auc=[]
    mrr=[]

    for i in tqdm(range(args.epochs)):
        print(f"Epoch{i+1} / {args.epochs}")

        train_loss, train_f1, train_auc, train_mrr=one_epoch(train_loader, model, criterion,optimizer, scheduler, n_classes)
        # validate takes (loader, model, criterion, n_classes); passing the
        # optimizer and scheduler as well raised a TypeError.
        val_loss, val_f1, val_auc, val_mrr=validate(val_loader, model, criterion, n_classes)

        print("Train Loss:", train_loss, "Train F1:", train_f1, "Train AUC:", train_auc, "Train MRR:", train_mrr)
        print("Validation Loss:", val_loss, "Validation F1:", val_f1, "Validation AUC:", val_auc, "Validation MRR:",val_mrr)

        loss.append((train_loss, val_loss))
        f1.append((train_f1, val_f1))
        auc.append((train_auc, val_auc))
        mrr.append((train_mrr, val_mrr))

        # Checkpoint on best validation MRR.
        if val_mrr > best_val_mrr:
            torch.save(model.state_dict(), f'fold-{fold}.pt')
            best_val_mrr = val_mrr

    return {'loss': loss, 'f1': f1, 'auc': auc, 'mrr': mrr}


In [79]:
def one_epoch(train_loader, model, criterion, optimizer, scheduler, n_classes):
    """Run one optimisation pass over ``train_loader``.

    Args:
        train_loader: iterable with a length, yielding ``(graph, label)`` batches.
        model: network called as ``model(graph, node_features)``.
        criterion: loss called as ``criterion(logits, labels)``.
        optimizer: optimiser stepped once per batch.
        scheduler: optional scheduler, stepped once per batch when truthy.
        n_classes: number of classes, used for the one-hot labels and the AUC.

    Returns:
        (loss, f1, auc, mrr): loss and micro-F1 are means of the per-batch
        values; AUC (one-vs-one) and the label-ranking average precision are
        computed over all batches together.
    """
    n_batches = len(train_loader)
    running_loss = 0
    running_f1 = 0
    seen_labels = []
    seen_probs = []

    model.train()

    for G, label in tqdm(train_loader, total=n_batches):
        # Self-loops are added to every batch before the forward pass.
        G = dgl.add_self_loop(G)
        node_feats = G.ndata["feat"].float()
        logit = model(G, node_feats)

        loss = criterion(logit, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        batch_labels = label.detach().cpu().numpy()
        batch_probs = logit.softmax(-1).detach().cpu().numpy()

        running_loss += loss.item()/n_batches
        running_f1 += sklearn.metrics.f1_score(batch_labels, batch_probs.argmax(-1), average="micro")/n_batches

        seen_labels.append(batch_labels)
        seen_probs.append(batch_probs)

    labels = np.concatenate(seen_labels)
    probs = np.concatenate(seen_probs)

    # One-hot encoding of the true labels for the ranking metric.
    one_hot_labels = np.zeros((len(labels), n_classes))
    one_hot_labels[np.arange(len(labels)), labels] = 1.0

    class_ids = np.arange(n_classes)
    train_auc = sklearn.metrics.roc_auc_score(labels, probs, multi_class = 'ovo', labels = class_ids)
    train_mrr = sklearn.metrics.label_ranking_average_precision_score(one_hot_labels, probs)

    return running_loss, running_f1, train_auc, train_mrr
    

def validate(val_loader, model, criterion, n_classes):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Args:
        val_loader: iterable with a length, yielding ``(graph, label)`` batches.
        model: network called as ``model(graph, node_features)``.
        criterion: loss called as ``criterion(logits, labels)``.
        n_classes: number of classes, used for the one-hot labels and the AUC.

    Returns:
        (loss, f1, auc, mrr): loss and micro-F1 are means of the per-batch
        values; AUC (one-vs-one) and the label-ranking average precision are
        computed over all batches together.
    """
    val_loss = 0
    val_f1 = 0

    labels = []
    logits = []

    total = len(val_loader)
    model.eval()

    with torch.no_grad():
        for i, (G, label) in tqdm(enumerate(val_loader), total = total):

            # Add self-loops exactly as one_epoch does: the model must see the
            # same graph structure at evaluation time as during training, and
            # nodes without incoming edges are otherwise passed to the
            # graph convolution layers.
            G = dgl.add_self_loop(G)
            h = G.ndata['feat'].float()
            logit = model(G, h)
            loss = criterion(logit, label)

            label_numpy = label.detach().cpu().numpy()
            logit_numpy = logit.softmax(-1).detach().cpu().numpy()

            val_loss += loss.item()/total
            val_f1 += sklearn.metrics.f1_score(label_numpy, logit_numpy.argmax(-1), average = 'micro')/total

            labels.append(label_numpy)
            logits.append(logit_numpy)

    labels = np.concatenate(labels)
    logits = np.concatenate(logits)

    # One-hot encoding of the true labels for the ranking metric.
    one_hot_labels= np.zeros((len(labels), n_classes))
    one_hot_labels[np.arange(len(labels)), labels] = 1.0

    val_auc = sklearn.metrics.roc_auc_score(labels, logits, multi_class = 'ovo', labels = np.array([int(i) for i in range(n_classes)]))
    val_mrr = sklearn.metrics.label_ranking_average_precision_score(one_hot_labels, logits)

    return val_loss, val_f1, val_auc, val_mrr