**Dylan Govender - 221040222 - COMP703 - CRF POS-Tagger**

**Initialising the Dataset**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!cp '/content/gdrive/My Drive/Colab Notebooks/isiswati_pos_tagging_corpus.txt' 'isiswati_pos_tagging_corpus.txt'

In [3]:
import regex

def uppercase(text):
    def replace(match):
        return '['+match.group(1).upper()+']'
    return regex.sub(r'\[([^]]+)\]', replace, text)

def tokenize_morph_analysis(morph):
    morph = morph.split(']')
    morph = morph[:-1]
    morph_analysis = {}
    previous_word = ""
    for data in morph:
        word, tag = data.split('[')
        if word == '':
            word = previous_word
        else:
            if word[:1] == '-':
                word = word[1:]
        morph_analysis[word] = tag
        previous_word = word
    tokenized_morph = []
    for key in morph_analysis.keys():
        tokenized_morph.append((key, morph_analysis.get(key)))
    return tokenized_morph

filename = '/content/isiswati_pos_tagging_corpus.txt'
corpus = []
tokenized_text = []
tokenized_sent = []
count = 0
problem = []
with open(filename, "r") as file:
    for line in file:
        line_data = line.split()
        if len(line_data) == 5:
            line_data[1] = uppercase(line_data[1])
            corpus.append(line_data)
            tokenized_morph = [(line_data[0], line_data[4])] if line_data[4]=="PUNC" else tokenize_morph_analysis(line_data[1])
            tokenized_sent.append((line_data[0], tokenized_morph, line_data[2], line_data[3], line_data[4]))
            if line_data[0] == '.':
                tokenized_text.append(tokenized_sent)
                tokenized_sent = []

print(tokenized_text[:1])
print("Number of sentences:", str(len(tokenized_text)))

[[('Ngetulu', [('nga', 'ADVPRE'), ('tulu', 'ADV')], 'tulu', 'ADV', 'ADV'), ('kwaloko', [('kwa', 'POSSCONC15'), ('loko', 'POS2')], 'loko', 'POSS15', 'POSS'), (',', [(',', 'PUNC')], ',', 'PUNC', 'PUNC'), ('kuba', [('ku', 'BPRE15'), ('b', 'VROOT'), ('a', 'VERBTERM')], 'ba', 'V', 'V'), ('khona', [('khona', 'CONJ')], 'khona', 'CONJ', 'CONJ'), ('kuniketela', [('ku', 'BPRE15'), ('niket', 'VROOT'), ('el', 'APPLEXT'), ('a', 'VERBTERM')], 'niketa', 'V', 'V'), ('kwekwakhiwa', [('kwe', 'POSSCONC15'), ('ku', 'BPRE15'), ('akh', 'VROOT'), ('iw', 'PASSEXT'), ('a', 'VERBTERM')], 'akha', 'POSS15', 'POSS'), ('kwemaKomidi', [('kwe', 'POSSCONC15'), ('ma', 'BPRE6'), ('komidi', 'NSTEM')], 'komidi', 'POSS15', 'POSS'), ('emaWadi', [('e', 'NPREPRE6'), ('ma', 'BPRE6'), ('wadi', 'NSTEM')], 'wadi', 'N06', 'N'), (',', [(',', 'PUNC')], ',', 'PUNC', 'PUNC'), ('njengemitimba', [('njenga', 'ADVPRE'), ('i', 'NPREPRE4'), ('mi', 'BPRE4'), ('timba', 'NSTEM')], 'timba', 'ADV', 'ADV'), ('yamasipala', [('ya', 'POSSCONC4'), ('

**Checking if the data has been fully captured by the above preprocess.**

In [4]:
import pandas as pd
df = pd.DataFrame(corpus)
df.columns = ['token', 'morph_analysis', 'lemma', 'xpos', 'upos']
df.head()

Unnamed: 0,token,morph_analysis,lemma,xpos,upos
0,Ngetulu,nga[ADVPRE]-tulu[ADV],tulu,ADV,ADV
1,kwaloko,kwa[POSSCONC15]-loko[DEM15][POS2],loko,POSS15,POSS
2,",",",[PUNC]",",",PUNC,PUNC
3,kuba,ku[BPRE15]-b[VROOT]-a[VERBTERM],ba,V,V
4,khona,khona[CONJ],khona,CONJ,CONJ


In [5]:
df.describe()

Unnamed: 0,token,morph_analysis,lemma,xpos,upos
count,42977,42977,42977,42977,42977
unique,14060,15379,2930,99,16
top,.,.[PUNC],.,V,N
freq,2682,2682,2682,7094,9331


**CRF Data-Preprocessing and Training the CRF for POS-Tagging**

In [14]:
import string
import regex

def process_morph_analysis(morph, token_data):
    token, morph_analysis, lemma, xpos, upos = token_data
    root_tags = ["VROOT", "NSTEM", "PRONSTEM7", "PRONSTEM9", "PRONSTEM14"]
    max_word = ''
    max_tag = ''
    max_len = 0
    index = 0
    for i, (word, tag) in enumerate(morph):
        if (word == lemma) or (tag in root_tags) or (tag == upos) or (tag == xpos):
            max_word = lemma
            max_tag = tag
            index = i
            break
        else:
            if len(word) > max_len:
                max_len = len(word)
                max_word = word
                max_tag = tag
                index = i

    return [max_word, max_tag], morph[:index], morph[index+1:]

def extract_features(sentence, token_data, index):
    token, morph_analysis, lemma, xpos, upos = token_data
    root, prefixes, suffixes = process_morph_analysis(morph_analysis, token_data)

    features = {
        "token": token,
        "token_length": len(token),
        "lemma": lemma,
        "previous_token": "",
        "next_token": "",
        "lower_cased_token" : token.lower(),
        "is_start": index==0,
        "is_end": index==len(sentence)-1,
        "is_capitalized": token[0].isupper(),
        "is_upper_cased": token==token.upper(),
        "is_lower_cased": token==token.lower(),
        "is_punctuation": (token in string.punctuation),
        "is_numeric": token.isdigit(),
        "has_hyphen": '-' in token,
        "has_capital_inside": not token[1:].lower()==token[1:],
        "root": root[0],
        "prefix_1": "",
        "prefix_2": "",
        "prefix_3": "",
        "suffix_1": "",
        "suffix_2": "",
        "suffix_3": ""
        # "root_tag": root[1],
        # "prefix_1_tag": "",
        # "prefix_2_tag": "",
        # "prefix_3_tag": "",
        # "suffix_1_tag": "",
        # "suffix_2_tag": "",
        # "suffix_3_tag": "",
    }

    if index>0:
        prev_token, prev_morph, prev_lemma, prev_xpos, prev_upos = sentence[index-1]
        features["previous_token"] = prev_token

    if index<len(sentence)-1:
        next_token, next_morph, next_lemma, next_xpos, next_upos = sentence[index+1]
        features["next_token"] = next_token

    for i, (prefix, tag) in enumerate(prefixes):
        features["prefix_"+str(i+1)] = prefix if i<3 else ""
        #features["prefix_"+str(i+1)+"_tag"] = tag if i<3 else ""

    for i, (suffix, tag) in enumerate(suffixes[::-1]):
        features["suffix_"+str(i+1)] = suffix if i<3 else ""
        #features["suffix_"+str(i+1)+"_tag"] = tag if i<3 else ""

    return features

In [15]:
X, y = [], []
X_train = []
y_train = []
for sentence in tokenized_text:
    X_sentence = []
    y_sentence = []
    for i, token in enumerate(sentence):
        X_sentence.append(extract_features(sentence, token, i))
        y_sentence.append(token[-1]) #-1 for upos / -2 for xpos
    X.append(X_sentence)
    y.append(y_sentence)

split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]

print(X_train[:1])

[[{'token': 'Ngetulu', 'token_length': 7, 'lemma': 'tulu', 'previous_token': '', 'next_token': 'kwaloko', 'lower_cased_token': 'ngetulu', 'is_start': True, 'is_end': False, 'is_capitalized': True, 'is_upper_cased': False, 'is_lower_cased': False, 'is_punctuation': False, 'is_numeric': False, 'has_hyphen': False, 'has_capital_inside': False, 'root': 'tulu', 'prefix_1': 'nga', 'prefix_2': '', 'prefix_3': '', 'suffix_1': '', 'suffix_2': '', 'suffix_3': ''}, {'token': 'kwaloko', 'token_length': 7, 'lemma': 'loko', 'previous_token': 'Ngetulu', 'next_token': ',', 'lower_cased_token': 'kwaloko', 'is_start': False, 'is_end': False, 'is_capitalized': False, 'is_upper_cased': False, 'is_lower_cased': True, 'is_punctuation': False, 'is_numeric': False, 'has_hyphen': False, 'has_capital_inside': False, 'root': 'loko', 'prefix_1': 'kwa', 'prefix_2': '', 'prefix_3': '', 'suffix_1': '', 'suffix_2': '', 'suffix_3': ''}, {'token': ',', 'token_length': 1, 'lemma': ',', 'previous_token': 'kwaloko', 'next

In [7]:
!pip install -U sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn_crfsuite-0.3.6


In [17]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

In [18]:
from sklearn_crfsuite import metrics

y_pred = crf.predict(X_test)
print("Accuracy:", metrics.flat_accuracy_score(y_test, y_pred))
print("F1-Score:", metrics.flat_f1_score(y_test, y_pred, average='macro'))

Accuracy: 0.9436485195797517
F1-Score: 0.8371555065104967
