## 1. Tạo dữ liệu train

In [1]:
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [2]:
# Load the annotated GMB corpus (tab-separated, Latin-1 encoded).
df1 = pd.read_csv("Annotated/GMB_dataset.txt", sep="\t", encoding="latin1")
# Drop the index column that was written out together with the data.
df1 = df1.drop(columns='Unnamed: 0')

# Forward-fill missing values BEFORE the integer cast: astype(int) raises on
# NaN, so filling afterwards could never repair a null sentence id.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE(review): the fill applies to every column, not only "Sentence #" —
# confirm that is intended.
df1 = df1.ffill()

# Store the sentence ids in the "Sentence #" column as integers.
df1['Sentence #'] = df1['Sentence #'].astype(int)

# Display first 10 rows
df1.head(10)

  df1 = df1.fillna(method = 'ffill')


Unnamed: 0,Sentence #,Word,POS,Tag
0,1,Thousands,NNS,O
1,1,of,IN,O
2,1,demonstrators,NNS,O
3,1,have,VBP,O
4,1,marched,VBN,O
5,1,through,IN,O
6,1,London,NNP,B-geo
7,1,to,TO,O
8,1,protest,VB,O
9,1,the,DT,O


In [3]:
def generate_sentences_tuples(df):
    """Split a token-level frame into one list of tuples per sentence.

    Rows are grouped on the "Sentence #" column and each group is passed to
    ``agg_sentence_group``; the per-group results are returned as a list, in
    the order in which ``groupby`` yields the groups.
    """
    per_sentence = []
    for _sentence_id, rows in df.groupby("Sentence #"):
        per_sentence.append(agg_sentence_group(rows))
    return per_sentence

def agg_sentence_group(group):
    """Turn the rows of one sentence into a list of (word, POS, tag) tuples.

    ``group`` must provide the columns 'Word', 'POS' and 'Tag'; the i-th
    tuple is built from the i-th value of each of those columns.
    """
    columns = [group[name].values.tolist() for name in ('Word', 'POS', 'Tag')]
    return list(zip(*columns))

In [4]:
# Group the training frame into per-sentence lists of (word, POS, tag) tuples,
# then show the first sentence as a quick sanity check.
sentencesTuples1 = generate_sentences_tuples(df1)
sentencesTuples1[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [5]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items carry the word at index 0 and the POS
    tag at index 1. The result holds features of the token itself, the same
    kind of features for the previous and next token when they exist, and the
    flags 'BOS' / 'EOS' when the token is first / last in the sentence.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features describing the current token on its own.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context, or the beginning-of-sentence marker when there is none.
    if i > 0:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]
    else:
        feats['BOS'] = True

    # Right context, or the end-of-sentence marker when there is none.
    last_index = len(sent) - 1
    if i < last_index:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]
    else:
        feats['EOS'] = True

    return feats

def sent2features(sent):
    """Return the ``word2features`` dict of every position in ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third field) of every (token, postag, label) item of ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

In [6]:
# Per-sentence feature dicts (X_train) and the matching per-sentence label
# sequences (y_train), both derived from the training sentences.
X_train = [sent2features(s) for s in sentencesTuples1]
y_train = [sent2labels(s) for s in sentencesTuples1]

## 2. Tạo dữ liệu test

In [7]:
import pandas as pd
import nltk

# Multi-sentence passage used as the input to be tagged.
text = "MANCHESTER, England (AP) — Manchester United goalkeeper Andre Onana has defended teammate Alejandro Garnacho after the winger used gorilla emojis in a social media post about him. The 19-year-old Argentina international posted a photo of players congratulating Onana for stopping a penalty late in United’s 1-0 win over Copenhagen in the Champions League on Tuesday. Garnacho used two gorilla emojis above the photo in a post on X, formerly known as Twitter. It was then deleted. Onana, who is Black, wrote on his Instagram Story that he understood what Garnacho meant. “People cannot choose what I should be offended by,” the Cameroon international wrote on Thursday. “I know exactly what (Garnacho) meant: power & strength. This matter should go no further.” \n        \n     The Football Association could still take action, however. Onana also posted a photo of himself and Garnacho hugging, and he added two fist-bumping emojis. "

# Start from an empty DataFrame whose first column is "Sentence #".
# NOTE(review): df2 is reassigned from pos_tagging(...) in a later cell before
# it is ever read, so this initialisation looks redundant — confirm.
df2 = pd.DataFrame(columns=['Sentence #', 'Word', 'POS', 'Tag'])

def sent_tokenizer(text):
    """Split ``text`` into sentences via ``nltk.sent_tokenize`` and return the result."""
    return nltk.sent_tokenize(text)

# Text clean-up helper (whitespace normalisation only; no POS tagging here).
def preprocessing(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed, and newlines and tabs are
    treated like any other whitespace.

    Args:
        text: the raw string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input yields ''.
    """
    # str.split() without a separator splits on any whitespace, newlines
    # included, so the joined result cannot contain "\n" and no separate
    # newline replacement is needed.
    return ' '.join(text.split())

def pos_tagging(text):
    """Sentence-split, tokenise and POS-tag ``text`` into a token-level frame.

    Args:
        text: the string to process.

    Returns:
        A DataFrame with the columns 'Sentence #', 'Word', 'POS' and 'Tag',
        one row per token. 'Sentence #' is the 1-based position of the
        sentence the token belongs to; 'Tag' is filled with NaN as a
        placeholder. Text that yields no sentences gives an empty frame with
        the same four columns.
    """
    columns = ['Sentence #', 'Word', 'POS', 'Tag']

    # Collect plain tuples and build the frame once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration and starts from an empty frame.
    rows = []
    for sentence_id, sentence in enumerate(nltk.sent_tokenize(text), start=1):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        rows.extend(
            (sentence_id, word, pos, float('nan')) for word, pos in tagged_tokens
        )

    return pd.DataFrame(rows, columns=columns)

In [8]:
# Preview how the whitespace-normalised text is split into sentences.
sent_tokenizer(preprocessing(text))

['MANCHESTER, England (AP) — Manchester United goalkeeper Andre Onana has defended teammate Alejandro Garnacho after the winger used gorilla emojis in a social media post about him.',
 'The 19-year-old Argentina international posted a photo of players congratulating Onana for stopping a penalty late in United’s 1-0 win over Copenhagen in the Champions League on Tuesday.',
 'Garnacho used two gorilla emojis above the photo in a post on X, formerly known as Twitter.',
 'It was then deleted.',
 'Onana, who is Black, wrote on his Instagram Story that he understood what Garnacho meant.',
 '“People cannot choose what I should be offended by,” the Cameroon international wrote on Thursday.',
 '“I know exactly what (Garnacho) meant: power & strength.',
 'This matter should go no further.” The Football Association could still take action, however.',
 'Onana also posted a photo of himself and Garnacho hugging, and he added two fist-bumping emojis.']

In [9]:
# Keep the whitespace-normalised text for the tagging step and display it.
text_preprocessing = preprocessing(text)
text_preprocessing

'MANCHESTER, England (AP) — Manchester United goalkeeper Andre Onana has defended teammate Alejandro Garnacho after the winger used gorilla emojis in a social media post about him. The 19-year-old Argentina international posted a photo of players congratulating Onana for stopping a penalty late in United’s 1-0 win over Copenhagen in the Champions League on Tuesday. Garnacho used two gorilla emojis above the photo in a post on X, formerly known as Twitter. It was then deleted. Onana, who is Black, wrote on his Instagram Story that he understood what Garnacho meant. “People cannot choose what I should be offended by,” the Cameroon international wrote on Thursday. “I know exactly what (Garnacho) meant: power & strength. This matter should go no further.” The Football Association could still take action, however. Onana also posted a photo of himself and Garnacho hugging, and he added two fist-bumping emojis.'

In [10]:
# Build the DataFrame holding the sentence-split, POS-tagged text, then display it.
df2 = pos_tagging(text_preprocessing)
df2

Unnamed: 0,Sentence #,Word,POS,Tag
0,1,MANCHESTER,NNP,
1,1,",",",",
2,1,England,NNP,
3,1,(,(,
4,1,AP,NNP,
...,...,...,...,...
168,9,added,VBD,
169,9,two,CD,
170,9,fist-bumping,JJ,
171,9,emojis,NN,


In [11]:
def generate_sentences(df):
    """Group ``df`` on "Sentence #" and aggregate each group into a list of tuples.

    Every group is passed to ``agg_sentence_group``; the results are returned
    as a list in the order ``groupby`` yields the groups.
    """
    # NOTE(review): the body matches generate_sentences_tuples defined in the
    # training section — consider keeping only one of the two.
    collected = []
    for _key, sentence_rows in df.groupby("Sentence #"):
        collected.append(agg_sentence_group(sentence_rows))
    return collected

def agg_sentence_group(group):
    """Zip the 'Word', 'POS' and 'Tag' columns of ``group`` into a list of tuples."""
    # NOTE(review): a function with this name and the same body is already
    # defined in the training section; this definition shadows it.
    word_values, pos_values, tag_values = (
        group[column].values.tolist() for column in ('Word', 'POS', 'Tag')
    )
    return list(zip(word_values, pos_values, tag_values))

# Group the tagged test frame into per-sentence tuple lists and show the first
# sentence; the third field of each tuple comes from the 'Tag' column of df2.
sentences2 = generate_sentences(df2)
sentences2[0]

[('MANCHESTER', 'NNP', nan),
 (',', ',', nan),
 ('England', 'NNP', nan),
 ('(', '(', nan),
 ('AP', 'NNP', nan),
 (')', ')', nan),
 ('—', 'VBP', nan),
 ('Manchester', 'NNP', nan),
 ('United', 'NNP', nan),
 ('goalkeeper', 'NN', nan),
 ('Andre', 'NNP', nan),
 ('Onana', 'NNP', nan),
 ('has', 'VBZ', nan),
 ('defended', 'VBN', nan),
 ('teammate', 'NN', nan),
 ('Alejandro', 'NNP', nan),
 ('Garnacho', 'NNP', nan),
 ('after', 'IN', nan),
 ('the', 'DT', nan),
 ('winger', 'NN', nan),
 ('used', 'VBN', nan),
 ('gorilla', 'NN', nan),
 ('emojis', 'NN', nan),
 ('in', 'IN', nan),
 ('a', 'DT', nan),
 ('social', 'JJ', nan),
 ('media', 'NNS', nan),
 ('post', 'NN', nan),
 ('about', 'IN', nan),
 ('him', 'PRP', nan),
 ('.', '.', nan)]

In [12]:
# Per-sentence feature dicts for the text to be tagged.
X_test = [sent2features(s) for s in sentences2]
# Label extraction for the test sentences is left disabled:
# y_test = [sent2labels(s) for s in sentences2]

In [13]:
# Configure the CRF tagger: L-BFGS training algorithm, c1 / c2 coefficients of
# 0.1 each, and at most 100 iterations.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation strengths and
# all_possible_transitions presumably also generates transitions unseen in the
# training data — confirm against the sklearn-crfsuite CRF documentation.
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = True,
         keep_tempfiles=None)

In [14]:
# Train the CRF on the training sentences, then tag the test sentences.
try:
    crf.fit(X_train, y_train)
except AttributeError as fit_error:
    # Best-effort is kept on purpose: an AttributeError raised by fit() does
    # not stop the notebook, but it is now reported instead of being silently
    # discarded, so a genuine training failure is visible to the reader.
    # NOTE(review): presumably this guards a known sklearn-crfsuite /
    # scikit-learn attribute incompatibility — confirm and narrow if possible.
    warnings.warn(
        f"crf.fit raised AttributeError; continuing to predict anyway: {fit_error!r}",
        RuntimeWarning,
    )
y_pred = crf.predict(X_test)

In [15]:
# Flatten the per-sentence predictions into a single list and store it as the
# 'Tag' column of df2 (one predicted label per row).
# assumes y_pred keeps the sentence/token order of df2's rows — TODO confirm
df2['Tag'] = [label for sent in y_pred for label in sent]

In [16]:
# Show the rows at positions 41 to 59 of the tagged frame.
df2.iloc[41:60]

Unnamed: 0,Sentence #,Word,POS,Tag
41,2,Onana,NNP,B-gpe
42,2,for,IN,O
43,2,stopping,VBG,O
44,2,a,DT,O
45,2,penalty,NN,O
46,2,late,RB,O
47,2,in,IN,O
48,2,United,NNP,B-geo
49,2,’,NNP,I-geo
50,2,s,VBD,O
