### افزودن کتابخانه های مورد نیاز

In [1]:
import nltk, re, collections
from nltk import word_tokenize

from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from joblib import dump, load
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

### در اینجا فایل آموزش و آزمون را میخوانیم و تبدیل به جملات می کنیم

In [2]:
train_docs = [line.rstrip('\n') for line in open('Data/POStrutf.txt', encoding="utf8")]

start = 0
end = 0
sentences_train = []
sentence_train = []
for i, doc in enumerate(train_docs):
    word = re.split(r'\t+', doc)
    sentence_train.append(tuple([word[0], word[1]]))
    if word[0] == "#":
        start = i
    elif word[0] ==".":
        if start > end:
            nothing_to_do = 0  
        else:
            start = end
        end = i
        sentences_train.append(sentence_train[start+1:end+1])
##################       
test_docs = [line.rstrip('\n') for line in open('Data/POSteutf.txt', encoding="utf8")]

start = 0
end = 0
sentences_test = []
sentence_test = []
true_labels = []
for i, doc in enumerate(test_docs):
    word = re.split(r'\t+', doc)
    true_labels.append(word[1])
    sentence_test.append(tuple([word[0], word[1]]))
    if word[0] == ".":
        sentences_test.append(sentence_test[start:i+1])
        start = i + 1   

### تابع استخراج ویژگی را به صورت زیر تعریف میکنیم

In [3]:
def features(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
    }

### دو تابع که از آن استفاده ابزاری خواهیم کرد 
###  تابع اول برای جدا کردن تگ ها از کلمات یک جمله است
### تابع دوم برای ایجاد دیتاستی از فیچر، تگ استفاده میشود

In [4]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y

### برای آموزش مدلمان، دیتاست مورد نظر خودمان را ایجاد می کنیم

In [5]:
X, y = transform_to_dataset(sentences_train) 

### در این قسمت مدل خود را ایجاد می کنیم، این کار قبلا در سیستم های قوی تر انجام شده است و به صورت فایل ذخیره شده است

In [6]:
# Load from disk
clf = load('NNModel.joblib')

if clf is None:
    print("clf is none")
    train_model()
else:
    print("Model is loaded")

# We’re now ready to train the classifier. We use Neural Network classifier
def train_model(): 
    clf = Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        ('classifier', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100), random_state=1))
    ])

    clf.fit(X[:53000], y[:53000])   # Use only the first 53K samples
    X_test, y_test = transform_to_dataset(sentences_test)

    print("Accuracy:", clf.score(X_test, y_test))
    # We reach to this accuracy
    # Accuracy: 0.8753189657822732
    
    # Save model to disk
    dump(clf, 'NNModel.joblib')


Model is loaded




In [7]:
X_test, y_test = transform_to_dataset(sentences_test)
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.8753747059999077


#### همانطور که در بالا مشاهده می شود دقت مدل بدست آمده برابر با 87 درصد می باشد

### در این تابع با استفاده از مدلی که ایجاد کردیم، جملات آزمون را تگ گذاری می کنیم

In [8]:
def pos_tag(sentence):
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)

pred_labels_counter = collections.defaultdict(lambda:0)
pred_labels = []
for sentence in sentences_test:
    x = pos_tag(untag(sentence))
    for word, tag in x:
        pred_labels.append(tag)
        pred_labels_counter[tag] += 1
        
all_true_labels = list(set(true_labels))

### در این قسمت ماتریس سرگشتگی را ایجاد می کنیم

In [9]:
cm = confusion_matrix(true_labels, pred_labels, labels=all_true_labels)
np.set_printoptions(suppress=True)
p = np.zeros((len(all_true_labels), len(all_true_labels)))

rounding_parameter = 4

for i in range(len(cm)):
    for j in range(len(cm)):
        p[i][j] = round((cm[i][j]/sum(cm[i])), rounding_parameter)
        
df = pd.DataFrame(p, columns=all_true_labels, index=all_true_labels)
df

Unnamed: 0,AR,MQUA,OH,PS,OHH,MORP,P,N,DELM,DET,...,ADJ,QUA,MS,CON,ADV,PP,NP,SPEC,PRO,IF
AR,0.0,0.0,0.0,0.0,0.0,0.0,0.0044,0.8157,0.0,0.0,...,0.0166,0.0052,0.0,0.0847,0.0061,0.0,0.0,0.0,0.0367,0.0
MQUA,0.0,0.0667,0.0,0.0,0.0,0.0,0.0,0.1333,0.0,0.0,...,0.2,0.2667,0.0,0.0,0.3333,0.0,0.0,0.0,0.0,0.0
OH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OHH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MORP,0.0,0.0,0.0,0.0,0.0,0.0682,0.0,0.5,0.0,0.0,...,0.4091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P,0.0,0.0,0.0,0.0,0.0,0.0,0.9744,0.0206,0.0,0.0,...,0.0026,0.0,0.0,0.0,0.0023,0.0,0.0,0.0,0.0001,0.0
N,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.9456,0.0,0.0,...,0.0368,0.001,0.0,0.0013,0.003,0.0,0.0,0.001,0.0002,0.0
DELM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0763,0.9137,0.0,...,0.0048,0.0,0.0,0.0003,0.002,0.0,0.0,0.0,0.0,0.0
DET,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0117,0.0,0.8998,...,0.0039,0.0,0.0,0.001,0.0078,0.0,0.0,0.001,0.0749,0.0


### در اینجا  عناصر غیر قطری را به صورت نزولی مرتب کنیم و سپس 10 عنصری که در آنها بیشترین خطا را داریم استخراج می کنیم

In [10]:
non_diagonal_values = []
non_diagonal_labels = []
for i in range(len(cm)):
    for j in range(len(cm)):
        if j == i:
            continue
        else:   
            non_diagonal_labels.append(tuple([str(all_true_labels[i])+","+str(all_true_labels[j]),p[i][j]]))
            non_diagonal_values.append(p[i][j])
            
sorted_list_of_non_diagonal = np.flipud(np.argsort(non_diagonal_values))

for i in range(len(sorted_list_of_non_diagonal[:10])):
    print(non_diagonal_labels[sorted_list_of_non_diagonal[i]])

('NP,N', 1.0)
('OHH,N', 1.0)
('OH,N', 1.0)
('MS,N', 0.8788)
('AR,N', 0.8157)
('PS,N', 0.8)
('MORP,N', 0.5)
('SPEC,N', 0.4184)
('MORP,ADJ', 0.4091)
('ADJ,N', 0.3429)


#### همانطور که در بالا میشود بیشترین خطاها به دلیل در نظر گرفتن مقدار 
#### N
#### به جای لیبل هایی مانند
#### OHH, NP, OH, MS
#### می باشد

### در اینجا یک فایل ورودی را می گیرد و در خروجی دیگر لیبل ها را تولید می کند

In [11]:
test_docs = [line.rstrip('\n') for line in open('Data/in.txt', encoding="utf8")]

start = 0
end = 0
sentences_test = []
sentence_test = []
true_labels = []
for i, doc in enumerate(test_docs):
    words = re.split(r' ', doc)
    for j, word in enumerate(words):
        sentence_test.append(tuple([word, 'N']))
        if word == ".":
            sentences_test.append(sentence_test[start:j+1])
            
            
X_test, y_test = transform_to_dataset(sentences_test)

outFile = open('Data/out.txt', 'w', encoding="utf8")

for sentence in sentences_test:
    x = pos_tag(untag(sentence))
    for word, tag in x:
        outFile.write(word+" "+tag)
        outFile.write("\n")
outFile.close()