In [1]:
import csv
from collections import Counter
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF

In [2]:
# Reading train file
train_file=open("hi-ud-train.conllu",encoding="utf-8")
csv_reader = csv.reader(train_file, delimiter=",")
data_train=[]
for line in csv_reader:
    data_train.append((line[0],line[1],line[2]))

In [3]:
# Prepare train sentences and tags, accumulating words and tags in same sentence
sent_train=[]
tag_train=[]
sents=[]
tags=[]
for i in range(1,len(data_train)):
    if data_train[i]!=('', '', ''):
        sents.append(data_train[i][1])
        tags.append(data_train[i][2])
    else:
      sent_train.append(sents)
      tag_train.append(tags)
      sents=[]
      tags=[]
sent_train.append(sents)
tag_train.append(tags)

In [4]:
# Reading test file
test_file=open("hi-ud-test .conllu",encoding="utf-8")
csv_reader = csv.reader(test_file, delimiter="\t")
data_test=[]
for line in csv_reader:
    data_test.append((line[0],line[1],line[2]))

In [5]:
# Preparing test sentences and tags, accumulating words and tags in same sentence
sent_test=[]
tag_test=[]
sents=[]
tags=[]
for i in range(1,len(data_test)):
    if data_test[i]!=('', '', ''):
      sents.append(data_test[i][1])
      tags.append(data_test[i][2])
    else:
      sent_test.append(sents)
      tag_test.append(tags)
      sents=[]
      tags=[]
sent_test.append(sents)
tag_test.append(tags)

## Features chosen:
**word**           - Word  
**is_first**       - True if word is the beginning of the sentence  
**is_last** - True if word is the end of the sentence  
**prefix-1**       - Word[0]  
**prefix-2**       - Word[:2]  
**prefix-3**       - Word[:3]  
**suffix-1**       - Word[-1]  
**suffix-2**       - Word[-2:]  
**suffix-3**       - Word[-3:]  
**prev_word**       - previous word  
**next_word**       - next word  
**is_numeric**       - True if all characters of word are digits  
**is_length_one**       - True if word has single character  
**na_at_last**       - True if word has 'na' at end  
**ya_at_last**       - True if word has 'ya' at end  
**is_capitalized**       - True if word starts with a capital letter  
**is_last_a**       - True if word has 'a' at end   
**length**       - Length of the word  

## Justification:
Prefixes,suffixes are used because certain prefixes and suffixes are common for certain parts of speeches. Similarly for previous and next words, first and last words and numeric words. 'na', 'ya' and 'a' are two common suffixes in Hindi, hence they have been used.

In [6]:
# Returns features of a word given sentence and index of word
def features(sentence,index):
    return {
        'word': sentence[index],
        'is_first': int(index == 0),
        'is_last': int(index == len(sentence) - 1),
        'prefix-1': sentence[index][:1],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1:],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == (len(sentence) - 1) else sentence[index + 1],
        'is_numeric': int(sentence[index].isdigit()),
        'is_length_one': int(len(sentence[index])==1),
        'na_at_last': int(sentence[index][-2:].lower()=='na'),
        'ya_at_last': int(sentence[index][-2:].lower()=='ya'),
        'is_capitalized': int(sentence[index][:1].upper() == sentence[index][:1]),
        'is_last_a': int(sentence[index][-1:].lower()=='a'),
        'length': len(sentence[index]),
    }

In [7]:
# Transforms the words in sentences to their features and returns X,y where X are features, y are labels
def transform_to_dataset(tagged_sentences,pos):
    X, y = [], []
    for i in range(len(tagged_sentences)):
        X.append([features(tagged_sentences[i],index) for index in range(len(tagged_sentences[i]))])
        y.append(pos[i])
    return X, y

In [8]:
# Fit the crf on train set
X_train, y_train = transform_to_dataset(sent_train,tag_train)
X_test, y_test = transform_to_dataset(sent_test,tag_test)

crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=800,
    all_possible_transitions=True
)
# Fit the crf on training data
crf = crf.fit(X_train, y_train)

In [9]:
# Predict on train set
y_pred_train = crf.predict(X_train)

#Print the scores on train data
print('\nFor training set\n')
print(metrics.flat_classification_report(y_train, y_pred_train, digits=4))
print('precision :',metrics.flat_precision_score(y_train, y_pred_train, average='weighted'))
print('recall :',metrics.flat_recall_score(y_train, y_pred_train, average='weighted'))
print('f1-score :',metrics.flat_f1_score(y_train, y_pred_train, average='weighted'))
print('accuracy :',metrics.flat_accuracy_score(y_train, y_pred_train))


For training set

              precision    recall  f1-score   support

         ADJ     1.0000    1.0000    1.0000       570
         ADP     0.9978    0.9986    0.9982      1387
         ADV     0.9820    0.9820    0.9820       111
         AUX     0.9825    1.0000    0.9912       730
       CCONJ     0.9934    1.0000    0.9967       150
       COMMA     1.0000    1.0000    1.0000       114
         DET     0.9957    0.9913    0.9935       231
        NOUN     0.9981    0.9994    0.9987      1597
         NUM     1.0000    1.0000    1.0000       152
        PART     1.0000    1.0000    1.0000       163
        PRON     0.9954    0.9954    0.9954       431
       PROPN     1.0000    0.9958    0.9979       708
       PUNCT     1.0000    1.0000    1.0000       564
       SCONJ     0.9839    1.0000    0.9919        61
        VERB     1.0000    0.9781    0.9889       640
           X     1.0000    1.0000    1.0000         2

    accuracy                         0.9966      7611
   macr

In [10]:
# Predict on test data
y_pred = crf.predict(X_test)

#Print the scores on test data
print('\nFor test set\n')
print(metrics.flat_classification_report(y_test, y_pred, digits=4))
print('precision :',metrics.flat_precision_score(y_test, y_pred, average='weighted'))
print('recall :',metrics.flat_recall_score(y_test, y_pred, average='weighted'))
print('f1-score :',metrics.flat_f1_score(y_test, y_pred, average='weighted'))
print('accuracy :',metrics.flat_accuracy_score(y_test, y_pred))


For test set

              precision    recall  f1-score   support

         ADJ     0.6602    0.7234    0.6904        94
         ADP     0.9677    0.9709    0.9693       309
         ADV     0.6667    0.4762    0.5556        21
         AUX     0.9779    0.9568    0.9673       139
       CCONJ     1.0000    1.0000    1.0000        25
         DET     0.8250    0.9167    0.8684        36
        NOUN     0.7978    0.8754    0.8348       329
         NUM     0.9200    0.9200    0.9200        25
        PART     0.9706    1.0000    0.9851        33
        PRON     0.9016    0.8462    0.8730        65
       PROPN     0.6726    0.5241    0.5891       145
       PUNCT     1.0000    1.0000    1.0000       135
       SCONJ     0.7500    1.0000    0.8571         3
        VERB     0.8958    0.8687    0.8821        99

    accuracy                         0.8697      1458
   macro avg     0.8576    0.8627    0.8566      1458
weighted avg     0.8678    0.8697    0.8669      1458

precision 

In [11]:
#Prints the 10 most common and least common transition Features on train set

print("\nTop 10 most common transitions on train set:\n")
print(Counter(crf.transition_features_).most_common(10))

print("\nTop 10 least common transitions on train set:\n")
print(Counter(crf.transition_features_).most_common()[-10:][::-1])


Top 10 most common transitions on train set:

[(('VERB', 'AUX'), 4.284792), (('AUX', 'AUX'), 2.634958), (('NUM', 'NOUN'), 2.409708), (('PART', 'NUM'), 2.216555), (('PRON', 'ADP'), 2.20665), (('DET', 'NOUN'), 2.100979), (('NOUN', 'ADP'), 1.985218), (('PROPN', 'PROPN'), 1.856158), (('AUX', 'SCONJ'), 1.808891), (('ADJ', 'NOUN'), 1.791011)]

Top 10 least common transitions on train set:

[(('ADJ', 'ADP'), -2.640411), (('ADJ', 'PRON'), -2.40467), (('PROPN', 'ADJ'), -2.018176), (('DET', 'ADP'), -1.847357), (('PROPN', 'AUX'), -1.814063), (('PROPN', 'DET'), -1.792265), (('VERB', 'ADJ'), -1.672327), (('AUX', 'ADJ'), -1.597697), (('NOUN', 'ADJ'), -1.476917), (('VERB', 'PROPN'), -1.33317)]


In [12]:
#Prints the 10 most common and least common transition Features on test set
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=800,
    all_possible_transitions=True
)

# Fit the crf on test data just for printing transitions on test set
crf = crf.fit(X_test, y_test)

print("\nTop 10 most common transitions on test set:\n")
print(Counter(crf.transition_features_).most_common(10))

print("\nTop 10 least common transitions on test set:\n")
print(Counter(crf.transition_features_).most_common()[-10:][::-1])


Top 10 most common transitions on test set:

[(('VERB', 'AUX'), 3.351348), (('AUX', 'AUX'), 2.883161), (('NUM', 'NUM'), 2.029152), (('PROPN', 'ADP'), 1.968291), (('AUX', 'PUNCT'), 1.831876), (('NUM', 'NOUN'), 1.809205), (('PART', 'NUM'), 1.673777), (('PROPN', 'PROPN'), 1.620077), (('PRON', 'ADP'), 1.611337), (('DET', 'NOUN'), 1.493193)]

Top 10 least common transitions on test set:

[(('PROPN', 'ADJ'), -1.980153), (('ADJ', 'ADP'), -1.736564), (('PROPN', 'AUX'), -1.682555), (('VERB', 'PROPN'), -1.49978), (('NOUN', 'NOUN'), -1.363538), (('ADJ', 'ADJ'), -1.36261), (('VERB', 'ADJ'), -1.296107), (('NOUN', 'DET'), -1.098471), (('AUX', 'VERB'), -1.067114), (('DET', 'PROPN'), -1.055852)]
