In [1]:
import os
import numpy as np
import pandas as pd

from pyvi import ViTokenizer, ViPosTagger

import sklearn_crfsuite
from sklearn_crfsuite import metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

import scipy.stats

from feature.doccano2fully import *
from feature.crf_plus_features import *

In [2]:
# Folder holding the raw dataset files. File names are appended to this string
# directly, so the trailing slash is required.
dataset_folder_path = 'data/input/'

# Fraction of each file's samples that is held out for testing.
test_size = 0.3

# Labels the models should learn; every other label is replaced by 'None'.
target_labels = [
    'type',
    'brand',
    'name',
    'origin',
    'form',
    'color',
]

#### Data Preparation
***

In [3]:
def clean_target_label(labels, target_labels):
    """Keep the labels listed in ``target_labels`` and mask all others.

    Parameters
    ----------
    labels : iterable
        Labels of one sequence.
    target_labels : container
        Labels that must be kept unchanged.

    Returns
    -------
    list
        A new list in the input order where every label that is not in
        ``target_labels`` has been replaced by the string ``'None'``.
    """
    cleaned = []
    for label in labels:
        if label in target_labels:
            cleaned.append(label)
        else:
            cleaned.append('None')
    return cleaned

In [4]:
def data_extracting(data_lines):
    """Turn raw dataset lines into parallel text / label structures.

    Every line is converted with ``doccano2fully`` and unpacked with
    ``extract_fully_data``. Labels that are not in the module-level
    ``target_labels`` are replaced by ``'None'``. Lines whose ``tagged`` flag
    is falsy are skipped.

    Parameters
    ----------
    data_lines : iterable
        Raw lines read from one dataset file.

    Returns
    -------
    tuple
        ``(seq_in, seq_out, words_labels)``: the texts that were kept, their
        cleaned label lists, and, per text, a list of ``(word, label)`` pairs.
    """
    texts, label_seqs, paired = [], [], []

    for raw_line in data_lines:
        text, labels, words, tagged = extract_fully_data(doccano2fully(raw_line))
        labels = clean_target_label(labels, target_labels)
        if not tagged:
            continue
        texts.append(text)
        label_seqs.append(labels)
        paired.append(list(zip(words, labels)))

    return texts, label_seqs, paired

In [5]:
def word_postagging(seq_in, words_labels):
    """Attach a part-of-speech tag to every labelled word.

    Each text in ``seq_in`` is tokenized with ``ViTokenizer`` and tagged with
    ``ViPosTagger``. A token is split on ``'_'`` and every resulting piece
    receives the tag of the whole token. The pieces are then paired
    positionally with the ``(word, label)`` pairs of the same text; pairing
    uses ``zip``, so it stops at the shorter of the two sequences.

    Parameters
    ----------
    seq_in : sequence of str
        Input texts.
    words_labels : sequence
        For each text, a list of ``(word, label)`` pairs.

    Returns
    -------
    list
        One list per text of ``(word, postag, label)`` triples, where the word
        comes from ``words_labels`` and the tag from the tagger output.
    """
    tagged_sentences = []
    for idx, text in enumerate(seq_in):
        tagged = ViPosTagger.postagging(ViTokenizer.tokenize(text))
        tokens, tags = tagged[0], tagged[1]

        pieces = []
        for position, token in enumerate(tokens):
            # str.split returns [token] when the token holds no '_', so a
            # single code path covers tokens with and without underscores.
            pieces.extend((part, tags[position]) for part in token.split('_'))

        tagged_sentences.append([
            (word, pos, label)
            for (word, label), (_, pos) in zip(words_labels[idx], pieces)
        ])
    return tagged_sentences

In [6]:
def sent2features(sent):
    """Build the feature list of one sentence.

    Calls ``word2features(sent, i)`` for every position ``i`` of ``sent`` and
    returns the results in positional order.
    """
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the label of every ``(token, postag, label)`` triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

In [7]:
def file2dataset(input_file_name):
    """Load one dataset file and split it into train and test sets.

    The file at ``dataset_folder_path + input_file_name`` is read as UTF-8,
    converted to ``(word, postag, label)`` sentences via ``data_extracting``
    and ``word_postagging``, and turned into features and labels with
    ``sent2features`` / ``sent2labels``.

    Parameters
    ----------
    input_file_name : str
        Name of a file inside ``dataset_folder_path``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` for this file. The split uses
        the module-level ``test_size`` and a fixed ``random_state`` of 42, so
        it is the same on every run.
    """
    input_file_path = dataset_folder_path + input_file_name

    # The context manager closes the handle even when reading raises.
    with open(input_file_path, "r", encoding="utf8") as input_file:
        data_lines = input_file.readlines()

    seq_in, seq_out, words_labels = data_extracting(data_lines)
    words_postags_labels = word_postagging(seq_in, words_labels)

    # to features
    X = [sent2features(sent) for sent in words_postags_labels]
    y = [sent2labels(sent) for sent in words_postags_labels]

    # split to training set and testing set
    X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    return X_train_small, X_test_small, y_train_small, y_test_small

In [None]:
# List the input files. Sub-directories (for example a notebook checkpoint
# folder) are skipped because file2dataset can only open regular files. The
# names are sorted because os.scandir yields entries in arbitrary order, which
# would otherwise make the order of the assembled dataset vary between runs.
with os.scandir(dataset_folder_path) as input_files:
    input_file_names = sorted(
        input_file.name for input_file in input_files if input_file.is_file()
    )

# Dataset preparation: every file is split on its own and the per-file parts
# are concatenated into one overall training set and one overall test set.
X_train = []
X_test = []
y_train = []
y_test = []

for input_file_name in input_file_names:
    X_train_small, X_test_small, y_train_small, y_test_small = file2dataset(input_file_name)
    X_train.extend(X_train_small)
    X_test.extend(X_test_small)
    y_train.extend(y_train_small)
    y_test.extend(y_test_small)

print("Train set: ", len(X_train))
print("Test set: ", len(X_test))


Train set:  5481
Test set:  2355


#### Training and Testing
***

In [9]:
def CRF_training(X_train, y_train):
    """Fit a ``sklearn_crfsuite.CRF`` on the given data and return it.

    Parameters
    ----------
    X_train : list
        Feature sequences, one per sentence.
    y_train : list
        Label sequences aligned with ``X_train``.

    Returns
    -------
    sklearn_crfsuite.CRF
        The fitted model.
    """
    # Fixed hyper-parameters shared by every model trained in this notebook.
    hyperparameters = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.01,
        max_iterations=8000,
        epsilon=1e-7,
        delta=1e-7,
        min_freq=3,
        all_possible_transitions=True,
    )

    model = sklearn_crfsuite.CRF(**hyperparameters)
    model.fit(X_train, y_train)
    return model

In [10]:
def classification_report(y_pred, y_test, labels):
    """Return the flat classification report restricted to ``labels``.

    Note the parameter order: predictions come first and reference labels
    second. They are forwarded to ``metrics.flat_classification_report`` as
    ``(y_test, y_pred)``, and the report is formatted with 3 digits.
    """
    report = metrics.flat_classification_report(
        y_test,
        y_pred,
        labels=labels,
        digits=3,
    )
    return report

In [11]:
def metrics_report(y_pred, y_test, labels):
    """Compute weighted precision, recall and F1 over ``labels``.

    Parameters
    ----------
    y_pred : list
        Predicted label sequences.
    y_test : list
        Reference label sequences.
    labels : list
        Labels included in the weighted averages.

    Returns
    -------
    list
        ``[precision, recall, f1_score]`` in that order.
    """
    scorers = (
        metrics.flat_precision_score,
        metrics.flat_recall_score,
        metrics.flat_f1_score,
    )
    return [
        scorer(y_test, y_pred, average='weighted', labels=labels)
        for scorer in scorers
    ]

In [12]:
def CRF_testing(model, X_test, y_test):
    """Evaluate a fitted model on the test data, ignoring the 'None' label.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict`` and ``classes_``.
    X_test : list
        Feature sequences to predict on.
    y_test : list
        Reference label sequences aligned with ``X_test``.

    Returns
    -------
    list
        The ``[precision, recall, f1_score]`` list produced by
        ``metrics_report`` over every model class except ``'None'``.
    """
    y_pred = model.predict(X_test)
    # Filter rather than list.remove: remove raises ValueError when the model
    # was trained on data that never contained the 'None' label.
    labels = [label for label in model.classes_ if label != 'None']
    return metrics_report(y_pred, y_test, labels)

In [13]:
def CRF_per_label(X_train, X_test, y_train, y_test, target_labels):
    """Train and evaluate one single-label CRF per target label.

    For each label in ``target_labels`` the train and test label sequences are
    rewritten so that only that label is kept and every other label becomes
    ``'None'``. A model is then trained on the training data and scored on the
    test data. The trained models themselves are not returned.

    Returns
    -------
    dict
        Maps each target label to the result of ``CRF_testing`` for its model.
    """
    reports = {}

    for target_label in target_labels:
        keep = [target_label]
        train_targets = [clean_target_label(seq, keep) for seq in y_train]
        test_targets = [clean_target_label(seq, keep) for seq in y_test]

        model = CRF_training(X_train, train_targets)
        reports[target_label] = CRF_testing(model, X_test, test_targets)

    return reports

In [14]:
# Train and evaluate a separate CRF for each target label on the same dataset.
result_report_per_label = CRF_per_label(
    X_train, X_test, y_train, y_test, target_labels
)

# One row per label; the columns follow the order returned by metrics_report.
pd.DataFrame.from_dict(
    result_report_per_label,
    orient='index',
    columns=['Precision', 'Recall', 'F1-Score'],
)

Unnamed: 0,Precision,Recall,F1-Score
type,0.864087,0.853754,0.85889
brand,0.694238,0.637916,0.664887
name,0.77997,0.700201,0.737936
origin,0.938485,0.922566,0.930457
form,0.786219,0.785525,0.785872
color,0.814241,0.756835,0.784489


In [15]:
# Train a single CRF on all target labels at once, then print its per-label
# classification report on the test set.
crf_all_label = CRF_training(X_train, y_train)
y_pred = crf_all_label.predict(X_test)
report_all_label = classification_report(y_pred, y_test, target_labels)
print(report_all_label)




              precision    recall  f1-score   support

        type      0.858     0.853     0.855      5101
       brand      0.691     0.656     0.673      1171
        name      0.771     0.711     0.740      2992
      origin      0.922     0.901     0.912      1356
        form      0.773     0.771     0.772      2266
       color      0.804     0.732     0.767       695

   micro avg      0.815     0.790     0.802     13581
   macro avg      0.803     0.771     0.786     13581
weighted avg      0.814     0.790     0.801     13581

