# Import Necessary Libraries and CSV Files

In [None]:
import re
import pandas as pd

In [None]:
# Load the three lexicon tables (organizations, locations, persons), in that order.
ORGS, LOCS, PERSONS = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "/content/ORGANIZATIONS.csv",
        "/content/LOCATIONS.csv",
        "/content/PERSONS.csv",
    )
)

Flatten the lexicon arrays and reduce them to word level, since the CRF works on a word-by-word basis.

In [None]:
# LEXNET maps every whole lexicon entry (lower-cased) to 1.
# LEXDEEP maps every single space-separated word of every entry (lower-cased) to 1.
LEXNET = {}
LEXDEEP = {}

for lexIter in [list(ORGS["0"]), list(LOCS["0"]), list(PERSONS["0"])]:
  for org in lexIter:
    # pandas yields NaN (a float) for empty cells; those cannot be lower-cased, so skip them.
    if not isinstance(org, str):
      continue
    # Strip only trailing newlines. The previous code removed the last character
    # whenever a newline occurred anywhere in the entry, which could cut a real letter.
    org = org.lower().rstrip("\n")
    LEXNET[org] = 1
    for i in org.split(" "):
      LEXDEEP[i] = 1

Load the .ma file and serialize it into a dictionary. The dictionary keys are whole words and the values are their inflectional representations; using a dictionary keeps later lookups fast and cuts run time.

In [None]:
# Read every line of the .ma file; the context manager closes the handle
# as soon as the lines are in memory (the handle used to be left open).
with open("/content/drive/MyDrive/NE.ma.txt", "r", encoding="utf-8") as filePOS:
  lines = filePOS.readlines()

In [None]:
# posDict maps the second space-separated field of each line to the third one.
posDict = {}

for line in lines:
  # Drop the line terminator first so it can never end up inside a stored value.
  data = line.rstrip("\n").split(" ")
  # Blank or truncated lines have fewer than three fields; skip them instead of
  # failing with an IndexError on data[2].
  if len(data) < 3:
    continue
  posDict[data[1]] = data[2]

Read NE.txt and normalize it with re.sub to reduce some noise

In [None]:
# Read the whole annotated corpus as one string; the context manager closes the
# handle afterwards (the handle used to be left open).
with open("/content/drive/MyDrive/NE.txt", "r", encoding="utf-8") as fileNer:
  lines = fileNer.read()

In [None]:
# Whitespace/tag normalisation, applied in this exact order:
#   1. collapse runs of spaces/tabs into a single space
#   2. remove the space in front of a period
#   3. make sure ">" is followed by a space
#   4. make sure "<" is preceded by a space
for pattern, replacement in (
    (r'[ \t]+', " "),
    (r" \.", "."),
    (r">(?=\S)", "> "),
    (r"(?<=\S)<", " <"),
):
  lines = re.sub(pattern, replacement, lines)

# From here on `lines` is a list with one entry per input line.
lines = lines.split("\n")

Converting a sentence to its tuple representation. I used 6 different states. Once the code sees a <b_enamex tag, the system starts fetching labels. The parsing relies on regular expressions and string comparisons.

In [None]:
def sent2Tuple(sent):
  """Turn one annotated sentence into a list of (token, tag) pairs.

  The sentence is split on whitespace. Tokens between a ``<b_enamex`` token
  and the matching ``<e_enamex>`` token are tagged ``B-<TYPE>`` (first token)
  or ``I-<TYPE>`` (following tokens), where ``<TYPE>`` is read from the
  ``TYPE="..."`` token that directly follows ``<b_enamex``. Every other token
  is tagged ``"O"``. The markup tokens themselves are not emitted.

  Raises IndexError when the token after ``<b_enamex`` carries no ``TYPE="``.
  """
  tagged = []
  entity_type = ""
  state = ""  # "", "GETLABEL", "LABELEDFIRST" or "LABELED"

  for token in sent.split():
    # The closing tag always ends the current entity, whatever the state is.
    if token == "<e_enamex>":
      state = ""
      continue

    if state == "LABELEDFIRST":
      tagged.append((token, "B-" + entity_type))
      state = "LABELED"
      continue

    if state == "LABELED":
      tagged.append((token, "I-" + entity_type))
      continue

    if state == "GETLABEL":
      # This token is the TYPE="..." attribute; keep only the quoted value.
      entity_type = re.findall(r'(?<=TYPE=")[^">]*', token)[0]
      state = "LABELEDFIRST"
      continue

    # Outside of any entity.
    if token == "<b_enamex":
      state = "GETLABEL"
    else:
      tagged.append((token, "O"))

  return tagged

# One list of (token, tag) pairs per input line, in the original line order.
SAMPLE_TEXTS = list(map(sent2Tuple, lines))

In [None]:
# Peek at the first parsed sentence as (token, tag) pairs.
SAMPLE_TEXTS[0]

[('Müzik', 'O'),
 ('Şenliği', 'O'),
 ("'ne", 'O'),
 ('hazırlanın', 'O'),
 ('POZİTİF', 'B-ORGANIZATION'),
 ('ve', 'I-ORGANIZATION'),
 ('Açık', 'I-ORGANIZATION'),
 ('Radyo', 'I-ORGANIZATION'),
 ('işbirliğiyle', 'O'),
 ('düzenlenecek', 'O'),
 ('olan', 'O'),
 ('İstanbul', 'B-LOCATION'),
 ('Müzik', 'O'),
 ('Şenliği', 'O'),
 ('2', 'O'),
 (',', 'O'),
 ('müzikseverlere', 'O'),
 ('Aralık', 'O'),
 ('ayında', 'O'),
 ('merhaba', 'O'),
 ('demeye', 'O'),
 ('hazırlanıyor', 'O')]

Splitting the data into five folds by distributing the sentences round-robin over five lists. I used a mod-5 counter for this

In [None]:
# Round-robin split: sentence k goes to fold k % 5, which is exactly what
# taking every fifth sentence starting at offset 0..4 produces.
foldDatas = [SAMPLE_TEXTS[offset::5] for offset in range(5)]
# Counter value after the distribution: the number of sentences handed out.
modCount = len(SAMPLE_TEXTS)

# Necessary libraries for CRF model

In [None]:
!pip install sklearn-crfsuite



In [None]:
from itertools import chain

import nltk
import numpy as np
import scipy.stats
import sklearn
import sklearn_crfsuite
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

Getting the POS tag, POS root and inflectional representation of a word by looking it up in the POS tag dictionary

In [None]:
def getPOSTag(word):
  """Return the part of the word's posDict entry after its last "+".

  Returns "" when the word is not in posDict. An entry without any "+" is
  returned unchanged.
  """
  if word not in posDict:
    return ""
  analysis = posDict[word]
  # rfind gives -1 when there is no "+", so the slice then starts at 0.
  return analysis[analysis.rfind("+") + 1:]

def getPOSRoot(word):
  """Return the part of the word's posDict entry before its first "+".

  Returns "" when the word is not in posDict. An entry that contains no "+"
  is returned whole; the previous slice `[:find("+")]` evaluated to `[:-1]`
  in that case and silently dropped the entry's last character.
  """
  inflectional = posDict.get(word, "")
  # partition yields the full string as the head when the separator is absent.
  return inflectional.partition("+")[0]

def getInflectional(word):
  """Return the part of the word's posDict entry after its first "+".

  Returns "" when the word is not in posDict, and the whole entry when the
  entry contains no "+".
  """
  analysis = posDict.get(word, "")
  head, plus, tail = analysis.partition("+")
  return tail if plus else head

Getting the case encoding of a word. If the word is a proper noun the function gives 2; otherwise it gives 0 if the word is all lower case, 1 if it is all upper case, and 3 if it is a combination of upper and lower case characters (or has no cased characters at all)

In [None]:
def caseEncode(word):
  """Encode the casing of a word as a small integer.

  Returns
    2 if the word's posDict entry contains "Prop" (checked first),
    0 if the word is all lower case,
    1 if the word is all upper case,
    3 otherwise (mixed case, or no cased characters at all).
  """
  if "Prop" in posDict.get(word, ""):
    return 2
  if word.islower():
    return 0
  return 1 if word.isupper() else 3

Check if word is in lexicon or not. I used dictionary for this

In [None]:
def checkLexicon(word):
  """Return True when `word` is a key of LEXDEEP stored with the value 1, else False."""
  return LEXDEEP.get(word, "") == 1

My main feature implementation. This is the best feature set I have explored

In [None]:
def _tokenFeatures(token, prefix=""):
    """Build the per-token feature dict, with every key prefixed by `prefix`."""
    return {
        prefix + 'word.lower()': token.lower(),
        prefix + "Root": getPOSRoot(token),
        prefix + "POS": getPOSTag(token),
        prefix + "INFLEC": getInflectional(token),
        # Slice instead of index so an empty token yields False instead of raising.
        prefix + "startsUpper": token[:1].isupper(),
        prefix + "case": caseEncode(token),
        prefix + "lex": checkLexicon(token.lower()),
    }


def word2features(sent, i):
    """Return the CRF feature dict for the i-th token of `sent`.

    `sent` is a sequence of (token, tag) pairs; only the tokens are read.
    The result holds a constant bias, the features of the token itself, the
    same features of the previous token under the "-1:" prefix (or 'BOS' at
    the start of the sentence) and of the next token under the "+1:" prefix
    (or 'EOS' at the end of the sentence).

    The next token's lexicon flag is stored under "+1:lex". It used to be
    written to "-1:lex", overwriting the previous token's flag.
    """
    features = {'bias': 1.0}
    features.update(_tokenFeatures(sent[i][0]))

    if i > 0:
        features.update(_tokenFeatures(sent[i - 1][0], "-1:"))
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        features.update(_tokenFeatures(sent[i + 1][0], "+1:"))
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return one feature dict per position of `sent`, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the tag of every (token, tag) pair in `sent`, in order."""
    tags = []
    for _, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the token of every (token, tag) pair in `sent`, in order."""
    tokens = []
    for token, _ in sent:
        tokens.append(token)
    return tokens

Converting the fold data to feature arrays

In [None]:
# foldFeatures[k] holds the feature sequences of fold k (one list of feature
# dicts per sentence). `tqdm` from tqdm.notebook replaces the deprecated
# `tqdm_notebook` and shows one progress bar per fold.
foldFeatures = [
    [sent2features(s) for s in tqdm(fold)]
    for fold in foldDatas
]

# foldLabels[k] holds the matching tag sequences of fold k.
foldLabels = [
    [sent2labels(s) for s in fold]
    for fold in foldDatas
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=2001.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




Get fold by foldNo. I used mod 5 counter for combining train data

In [None]:
def getFold(foldNo):
  """Return (x_train, x_test, y_train, y_test) for the given test fold.

  Fold `foldNo` is the test part. The training part is the concatenation of
  the other four folds, taken in the order foldNo+1 ... foldNo+4 (mod 5).
  """
  trainIdx = [(foldNo + shift) % 5 for shift in (1, 2, 3, 4)]

  x_train = [seq for k in trainIdx for seq in foldFeatures[k]]
  y_train = [seq for k in trainIdx for seq in foldLabels[k]]

  return x_train, foldFeatures[foldNo], y_train, foldLabels[foldNo]

In [None]:
def getTestFold(foldNo):
  """Return only the test part (x_test, y_test) of the split for `foldNo`."""
  split = getFold(foldNo)
  return split[1], split[3]

Getting crf model with c1 = 0.1 and c2 = 0.1 and with lbfgs algorithm. I converted crf model to function for kfold

In [None]:
def train_ctf(x_train, y_train):
  """Fit a CRF on the given feature/tag sequences and return the fitted model.

  Uses the lbfgs algorithm with c1 = 0.1, c2 = 0.1, at most 100 iterations
  and all_possible_transitions switched on.
  """
  settings = dict(
      algorithm="lbfgs",
      c1=0.1,
      c2=0.1,
      max_iterations=100,
      all_possible_transitions=True,
  )
  model = sklearn_crfsuite.CRF(**settings)
  model.fit(x_train, y_train)
  return model

Get labels CRF has seen by removing "O" tag

In [None]:
def getCRFLabels(crf):
  """Return the model's classes as a list with the first 'O' entry taken out.

  Raises ValueError when 'O' is not among `crf.classes_`.
  """
  seen = list(crf.classes_)
  cut = seen.index('O')
  return seen[:cut] + seen[cut + 1:]

Get flat f1 score of crf model by weighted evaluation

In [None]:
def getCRFTestScore(crf, x_test, y_test, labels=None):
  """Return the weighted flat F1 score of `crf` on the given test data.

  Parameters
    crf     fitted model providing `predict` and `classes_`.
    x_test  feature sequences to predict on.
    y_test  gold tag sequences.
    labels  tags to score on. When omitted, every class of `crf` except 'O'
            is used. The function used to read a module-level `labels`
            variable instead, which made the result depend on whichever cell
            last assigned it.
  """
  if labels is None:
    labels = [tag for tag in crf.classes_ if tag != 'O']
  y_pred = crf.predict(x_test)
  return metrics.flat_f1_score(y_test, y_pred, average= "weighted", labels= labels)

Generate classification report of a given crf model and test data

In [None]:
def printClassificationReport(labels, ctf, x_test, y_test):
  """Print the predictions of `ctf` on `x_test` and its classification report.

  Parameters
    labels  tags to include in the report; they are sorted by entity type
            first and by B/I prefix second.
    ctf     fitted model to evaluate. The body used to call the module-level
            `crf` instead of this parameter, so the report always described
            the most recently trained model, whatever was passed in.
    x_test  feature sequences to predict on.
    y_test  gold tag sequences.
  """
  sorted_labels = sorted(
      labels,
      key= lambda name: (name[1:], name[0])
  )

  y_pred = ctf.predict(x_test)
  print(y_pred)

  print(metrics.flat_classification_report(
      y_test, y_pred, labels= sorted_labels, digits= 3
  ))

In [None]:
# Rebuild the raw (token, tag) split for fold 2: that fold is the test part,
# the remaining four folds (in order foldNo+1 ... foldNo+4, mod 5) are the train part.
foldNo = 2
x_train = [
    sample
    for shift in range(1, 5)
    for sample in foldDatas[(foldNo + shift) % 5]
]
x_test = foldDatas[foldNo]

In [None]:
# Number of sentences in the held-out fold.
len(x_test)

2000

In [None]:
# Number of sentences in the four training folds combined.
len(x_train)

8001

In [None]:
# Shape of the intersection numpy computes between fold 1 and fold 2.
# NOTE(review): the folds are ragged lists of sentences, so numpy has to build
# object arrays here (see the warning in the output) — confirm that this really
# counts duplicated sentences.
np.intersect1d(foldDatas[1], foldDatas[2]).shape

  return array(a, dtype, copy=False, order=order, subok=True)


(54,)

Iteratively try out each fold and decide which one is better by looking at f1 scores. For many models, fold 2 was the best fold

In [None]:
# Train and evaluate one model per fold, keeping every score and model.
foldScores = []
foldModels = []
for i in tqdm(range(5)):
  print(i)
  x_train, x_test, y_train, y_test = getFold(i)

  crf = train_ctf(x_train, y_train)
  # `labels` stays a module-level name on purpose: later cells read it.
  labels = getCRFLabels(crf)

  foldScores.append(getCRFTestScore(crf, x_test, y_test))
  foldModels.append(crf)

  printClassificationReport(labels, crf, x_test, y_test)

# Pick the fold with the highest score. This used to be `len(foldScores) - 1`,
# which always selected the last fold regardless of the scores.
bestFoldIdx = max(range(len(foldScores)), key=foldScores.__getitem__)
print("Best fold: ", bestFoldIdx)
print("Best fold score: ", foldScores[bestFoldIdx])

x_train, x_test, y_train, y_test = getFold(bestFoldIdx)
printClassificationReport(getCRFLabels(foldModels[bestFoldIdx]), foldModels[bestFoldIdx], x_test, y_test)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

0
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-LOCATION', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-PERSON', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'I-

In [None]:
# Show the fitted models, one per fold.
foldModels



[CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
     averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
     calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
     calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
     gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
     max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
     pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False),
 CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
     averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
     calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
     calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
     gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
     max_linesearch=None, min_freq=None, model_filename=None, 

In [None]:
# Train a fresh model on the fold-2 split and report on its held-out part.
x_train, x_test, y_train, y_test = getFold(2)
crf = train_ctf(x_train, y_train)
labels = getCRFLabels(crf)
printClassificationReport(labels, crf, x_test, y_test)

# Report on the same fold-2 test data for the model stored at bestFoldIdx.
# NOTE(review): unless bestFoldIdx is 2, that model was trained on fold 2, so
# this second report would be measured on data it has already seen — confirm intent.
x_train, x_test, y_train, y_test = getFold(2)
printClassificationReport(
    getCRFLabels(foldModels[bestFoldIdx]),
    foldModels[bestFoldIdx],
    x_test,
    y_test,
)

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O'], ['B-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'O', 'B-LOCATION', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-LOCATION', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O'], ['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

The results when the "O" label is also taken into account

In [None]:
# Entity labels of the selected model plus "O", ordered by entity type first
# and by B/I prefix second ("O" has an empty type, so it sorts to the front).
sorted_labels = getCRFLabels(foldModels[bestFoldIdx]) + ["O"]
sorted_labels.sort(key=lambda name: (name[1:], name[0]))

y_pred = foldModels[bestFoldIdx].predict(x_test)

print(metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
))

IndexError: ignored

Optimization of the c1 and c2 weights with scipy and RandomizedSearchCV. This step did not yield a significant improvement for many models

In [None]:
%%time

x_train, x_test, y_train, y_test = getFold(bestFoldIdx)

# Base estimator; c1 and c2 are left unset because the search samples them.
crf = sklearn_crfsuite.CRF(
    algorithm= "lbfgs",
    max_iterations = 100,
    all_possible_transitions = True
)

# c1 and c2 are drawn from exponential distributions.
params_space = {
    "c1": scipy.stats.expon(scale= 0.5),
    "c2": scipy.stats.expon(scale= 0.05)
}

# Same metric as in the fold evaluation: weighted flat F1 on the labels in `labels`.
f1_scorer = make_scorer(metrics.flat_f1_score, average= "weighted", labels = labels)

# A fixed random_state makes the sampled (c1, c2) candidates, and therefore the
# reported best parameters, reproducible across runs.
rsCV = RandomizedSearchCV(crf, params_space, cv= 3,
                          verbose= 1, n_jobs= -1, n_iter= 50, scoring= f1_scorer,
                          random_state= 42)
rsCV.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 66.2min finished


CPU times: user 54min 51s, sys: 34.1 s, total: 55min 25s
Wall time: 1h 6min 40s


In [None]:
# Summary of the randomized search: best sampled parameters, their mean
# cross-validation score, and the size of the refitted best model in millions.
print("best params: ", rsCV.best_params_)
print("best CV score: ", rsCV.best_score_)
print('model size: {:0.2f}M'.format(rsCV.best_estimator_.size_ / 1000000))

best params:  {'c1': 0.002430647831950319, 'c2': 0.013642736189441155}
best CV score:  0.8786166206085921
model size: 2.37M


In [None]:
# Evaluate the best estimator found by the search on the current x_test / y_test.
# `sorted_labels` comes from an earlier cell and includes the "O" tag.
crf = rsCV.best_estimator_
y_pred = crf.predict(x_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels= sorted_labels, digits= 3
))

                precision    recall  f1-score   support

             O      0.993     0.996     0.994     30520
    B-LOCATION      0.938     0.916     0.927       811
    I-LOCATION      0.827     0.653     0.729        95
B-ORGANIZATION      0.929     0.906     0.917       594
I-ORGANIZATION      0.856     0.882     0.869       390
      B-PERSON      0.941     0.904     0.922      1109
      I-PERSON      0.919     0.915     0.917       471

      accuracy                          0.986     33990
     macro avg      0.915     0.882     0.897     33990
  weighted avg      0.986     0.986     0.986     33990

