In [1]:
# importing required libraries
import pycrfsuite
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
import random
import numpy as np

In [2]:
#loading the data sentence wise
def load_data(files):
    data, sent = [], []
    for file in files:
        with open(file, 'r',encoding="utf8") as rf:
            for line in rf:
                if line.strip() != '':
                    sent.append(line.strip().split('\t'))
                else:
                    if len(sent) > 0:
                        data.append(sent)
                        sent = []
    return data

sents = load_data(['data/FB_HI_EN_CR.txt', 'data/TWT_HI_EN_CR.txt', 'data/WA_HI_EN_CR.txt',
                   'data/FB_BN_EN_CR.txt','data/TWT_BN_EN_CR.txt', 'data/WA_BN_EN_CR.txt',
                  'data/FB_TE_EN_CR.txt', 'data/TWT_TE_EN_CR.txt', 'data/WA_TE_EN_CR.txt'])

print('Toatal number of sentences:',len(sents))

Toatal number of sentences: 5233


In [3]:
#shuffling data and splitting into train and test data
random.seed(12)
random.shuffle(sents)

#80/20 split - train/valid
train_sents = sents[:int(0.8*len(sents))]  
valid_sents = sents[int(0.8*len(sents)):]
print("Train sentences: %d" % (len(train_sents)))
print("Validation sentences: %d" % (len(valid_sents)),'\n')

# Displaying a sample of train and validation sentences
print('train_sents_sample:','\n',train_sents[1])
print('\n','valid_sents_sample:','\n',valid_sents[2])

Train sentences: 4186
Validation sentences: 1047 

train_sents_sample: 
 [['Grad', 'en', 'G_N'], ['school', 'en', 'G_N'], ['dude', 'en', 'G_N'], ['!', 'univ', 'G_X']]

 valid_sents_sample: 
 [['kya', 'hi', 'G_PRP'], ['hai', 'hi', 'G_V'], ['beeee', 'hi', 'G_PRP'], ['....????', 'univ', 'G_X']]


# Features Selected
1. Current word
2. Language of the current word
3. Whether current word is alphanumeric or not ('True' / 'False')
3. Character n-grams of the current word (n = 1 to 5 are selected)
4. Begin and End of sentence markers (BOS and EOS)
5. Context ( Previous word & Next Word )

In [4]:
def word2features(sent, k):
    #current word
    word = sent[k][0]
    features = ['token=%s' % (word)]
    
    #Language of the current word
    language = sent[k][1]
    features.append(language)
    
    #Whether current word is alphanumeric or not ('True' / 'False')
    features.append(str(word.isalnum()))
    
    # extracting n-grams, for n=1 to 5
    for i in range(1,6):
        # if the value of n is greater than the word length, we exit the loop
        if i > len(word):
            break
        character_features = [word[j:j+i] for j in range(len(word)-i+1)]
        features.extend([
            # is count of individual n-grams important? is the order important?
            "char-%d-gram=%s" % (i, ' '.join(list(set(character_features))))
        ])
    if k == 0:
        # first word in the sentence
        features.append('BOS')
    else:
        # previous word
        features.extend([
            "-1:word=%s" % (sent[k-1][0])
        ])
    if k == (len(sent)-1):
        # last word in the sentence         
        features.append('EOS')
    else:
        # next word
        features.extend([
            "+1:word=%s" % (sent[k+1][0])
        ])
 
    return features
        
def sent2features(sent):
    # generating features for all the words/tokens in a sentence `sent`    
    return [word2features(sent, i) for i in range(len(sent))]

def sent2pos(sent):
    # obtaining parts of speech for all the words/tokens in a sentence `sent` 
    return [pos_tag for token, language_label, pos_tag in sent]

def sent2tokens(sent):
    return [token for token, language_label, pos_tag in sent]

In [5]:
# Obtaining traning and testing features from the given data
X_train = [sent2features(sent) for sent in train_sents]
y_train = [sent2pos(sent) for sent in train_sents]

X_test = [sent2features(sent) for sent in valid_sents]
y_test = [sent2pos(sent) for sent in valid_sents]

# Displaying a sample of training data
print(X_train[1])
print('\n',y_train[0])

[['token=Grad', 'en', 'True', 'char-1-gram=G a r d', 'char-2-gram=ra Gr ad', 'char-3-gram=rad Gra', 'char-4-gram=Grad', 'BOS', '+1:word=school'], ['token=school', 'en', 'True', 'char-1-gram=s h c o l', 'char-2-gram=ol ch ho oo sc', 'char-3-gram=cho ool hoo sch', 'char-4-gram=scho hool choo', 'char-5-gram=schoo chool', '-1:word=Grad', '+1:word=dude'], ['token=dude', 'en', 'True', 'char-1-gram=d u e', 'char-2-gram=de ud du', 'char-3-gram=ude dud', 'char-4-gram=dude', '-1:word=school', '+1:word=!'], ['token=!', 'univ', 'False', 'char-1-gram=!', '-1:word=dude', 'EOS']]

 ['#', 'G_N', 'G_N', 'G_PRT', 'G_J', 'G_J', 'PSP', 'G_N', 'G_N', 'G_X', 'G_N', 'G_SYM', 'G_N', 'G_N', 'G_J', 'G_N', 'G_N', 'G_N', 'G_X', 'G_R', 'G_PRP', 'G_N', 'PSP', 'G_N', 'G_N', 'G_PRP', 'G_PRT', 'G_R', 'PSP', 'G_N', 'G_X', 'G_V', 'PSP', 'G_PRP', 'G_X', 'G_N', 'G_X', 'G_N', 'G_N', 'G_X', 'G_N']


# Training Model 

In [6]:
# Modeling Conditional Random Fields using pycrfsuite

trainer = pycrfsuite.Trainer(verbose=False)  

#appending (features,labels) to the model
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

#Setting model parameters
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 0.001,  # coefficient for L2 penalty
    'max_iterations': 200, 
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
    })

In [7]:
%%time
trainer.train('POS_model.crfsuite') #Training the model and saving it as POS_model.crfsuite

Wall time: 52.4 s


# Model Evaluation

In [8]:
tagger = pycrfsuite.Tagger()
tagger.open('POS_model.crfsuite')

<contextlib.closing at 0x284615a7cc0>

In [9]:
example_sent = valid_sents[21]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2pos(example_sent)))

@maheshaddict meeru malli prabhas fans tho kalisi snake dance cheseru akkada mee cinema range intha worst ani asalu expect cheyyale

Predicted: @ G_PRP G_N G_N G_N PSP G_N G_N G_N G_N G_V G_PRP G_N G_N G_X G_J G_N G_N G_V G_N
Correct:   @ G_X G_X G_N G_N PSP G_X G_N G_N G_X G_X G_PRP G_N G_N G_X G_J G_X G_X G_V G_X


In [10]:
y_pred = [tagger.tag(xseq) for xseq in X_test]

def analysis_report(y_true, y_pred):
    # to convert the POS taggings into one-hot encoded format
    lb = LabelBinarizer()
    y_true_coded = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_coded = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = list(lb.classes_)
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(y_true_coded,y_pred_coded,
                                 labels= [class_indices[cls] for cls in tagset],
                                 target_names = tagset)

print(analysis_report(y_test, y_pred))

              precision    recall  f1-score   support

           #       0.74      0.47      0.57       220
           $       0.76      0.54      0.63       190
           @       0.77      0.95      0.85       501
          CC       0.81      0.67      0.73       291
          DT       0.86      0.82      0.84       490
           E       0.94      0.77      0.85       156
         G_J       0.79      0.49      0.61       880
         G_N       0.70      0.89      0.78      5666
       G_PRP       0.79      0.72      0.75      1154
       G_PRT       0.66      0.47      0.55       452
         G_R       0.79      0.61      0.69       513
       G_SYM       0.69      0.44      0.53       149
         G_V       0.80      0.65      0.72      2344
         G_X       0.80      0.79      0.80      2944
         PSP       0.80      0.67      0.73      1048
           U       0.59      0.77      0.67       123
        null       0.00      0.00      0.00        73
           ~       1.00    

In [11]:
# Accuracy Score 

lb = LabelBinarizer()
y_true_coded = lb.fit_transform(list(chain.from_iterable(y_test)))
y_pred_coded = lb.transform(list(chain.from_iterable(y_pred)))

import numpy as np
y_true_labels = list(np.argmax(y_true_coded, axis=1))
y_pred_labels = list(np.argmax(y_pred_coded, axis=1))

print("Accuracy Score:", accuracy_score(y_true_labels,y_pred_labels))

Accuracy Score: 0.7522653345724907
