In [29]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import re
import codecs
import subprocess

from matplotlib import pyplot as plt
import pystruct as pystr

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPooling1D

import progressbar


%matplotlib inline

По мотивам:
http://deeplearning.net/tutorial/rnnslu.html
https://chsasank.github.io/spoken-language-understanding.html

# Load data

In [30]:
# Read the slot-filling dataset; process_data below iterates over the parsed records.
with open('slotfilling-data.json', mode='r', encoding='UTF8') as json_file:
    data = json.loads(json_file.read())

In [31]:
def process_data(data):
    """Split raw records into chat texts and per-chat slot dictionaries.

    Each record must provide 'chat' and 'entities'; every entity must provide
    'title', 'start_pos', 'end_pos' and 'text'.

    Returns a pair of NumPy arrays (chats, slots). slots[i] maps an entity title
    to {'start_pos', 'end_pos', 'text'}; when a title repeats inside one record
    the last entity wins.
    """
    kept_fields = ('start_pos', 'end_pos', 'text')
    chats = []
    slots = []
    for record in data:
        chats.append(record['chat'])
        slots.append({
            entity['title']: {field: entity[field] for field in kept_fields}
            for entity in record['entities']
        })

    return np.array(chats), np.array(slots)

In [32]:
# Split the records into chat texts (`data`) and slot dictionaries (`ans`).
# NOTE(review): this rebinds `data` from the raw JSON records to the array of chats, so the
# cell cannot be re-run without first re-running the loading cell.
data, ans = process_data(data)

In [33]:
# Shuffle chats and answers with one shared permutation so the pairs stay aligned.
# A dedicated seeded generator makes the shuffle (and therefore the positional train/test
# split used later) reproducible without touching NumPy's global random state.
SHUFFLE_SEED = 42
perm = np.random.RandomState(SHUFFLE_SEED).permutation(len(data))
data, ans = data[perm], ans[perm]

In [34]:
# Every slot title that occurs in at least one answer dictionary.
possible_slots = {slot_title for y_item in ans for slot_title in y_item}
possible_slots

{'ВАЛЮТА',
 'ВРЕМЯ_ДАТА_СНЯТИЯ',
 'ЗА_ГРАНИЦЕЙ',
 'МЕСТО_СНЯТИЯ',
 'НАЗВАНИЕ_БАНКА',
 'НОМЕР_ТЕЛЕФОНА',
 'РАЗМЕР_КОМИССИИ',
 'СУММА_СНЯТИЯ',
 'ТАРИФ_КАРТЫ',
 'ТИП_КАРТЫ'}

In [35]:
(data[0], ans[0])

('1: Добрый вечер! У меня, похоже, после трех неправильных вводов ПИНа заблокировалась карта. Что мне делать?\n2: Тогда коллегам передам, они свяжутся с Вами.\n',
 {})

# Data preprocessing

In [36]:
# Remove special symbols and lower-case the text.
def deleteExtraSymbols(line):
    """Normalise a piece of text for tokenisation.

    Every character that is not a Cyrillic letter, a digit, '€', '$' or a space
    is replaced by a space; the result is lower-cased, stripped, and runs of
    spaces are collapsed to one.

    Returns None for an empty or None input, otherwise the cleaned string
    (possibly '' when the input held no allowed characters).
    """
    if not line:
        return None
    # Ё/ё sit outside the contiguous А-я code-point range, so they are listed explicitly;
    # without them a word such as "ещё" would be cut into "ещ".
    only_allowed = re.sub(r'[^А-Яа-яЁё0-9€$ ]', ' ', line)
    return re.sub(' +', ' ', only_allowed.lower().strip())

In [37]:
# Clean every dialog with deleteExtraSymbols, keeping the order of `data`.
clean_data = [deleteExtraSymbols(dialog) for dialog in data]

In [38]:
# Lemmatisation: create the pymorphy2 analyzer whose parse() results are used below
# to replace each word with its normal form.
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [39]:
# Convert the answers into per-token label sequences of the form
# ['0', '0', ..., 'SLOT_NAME', ..., '0', '0'], aligned with clean_data[i].split(' ').
# A slot word is matched by text, so it labels the FIRST token equal to it.
y = []
for i in range(len(clean_data)):
    tokens = clean_data[i].split(' ')  # split once per dialog, not once per slot word
    y_vec = ['0'] * len(tokens)
    for slot in possible_slots:
        if slot not in ans[i]:
            continue  # this dialog carries no annotation for the slot
        slot_text = deleteExtraSymbols(ans[i][slot]['text'])
        if slot_text is None:
            continue  # annotation text was empty
        for slot_word in slot_text.split(' '):
            if slot_word not in tokens:
                # Keeps the previous outcome: the first word that cannot be found in the
                # cleaned dialog ends labelling for this slot.
                break
            y_vec[tokens.index(slot_word)] = slot
    y.append(y_vec)
counter = len(y)  # number of processed dialogs (kept because the name was defined here before)

In [40]:
# Replace every word of every cleaned dialog with its pymorphy2 normal form.
# Words repeat across dialogs, so each distinct word is analysed only once and its
# lemma is then served from `lemma_cache`; the pieces are joined once per dialog
# instead of growing a string character by character.
lemma_cache = {}
lem_data = []
for dialog in clean_data:
    lemmas = []
    for word in dialog.split(' '):
        if word not in lemma_cache:
            lemma_cache[word] = morph.parse(word)[0].normal_form
        lemmas.append(lemma_cache[word])
    lem_data.append(' '.join(lemmas))

KeyboardInterrupt: 

# Vectorize data

In [None]:
#wordList =  [[x.lower() for x in re.findall(r"[\w']+", y)] for y in lem_data]
# Vocabulary: distinct lemmas in order of first appearance (list position = word id).
# Membership is tested against a set, so building the list no longer rescans it per token.
idx2w = []
seen_words = set()
for dialog in lem_data:
    for word in dialog.split(' '):
        if word not in seen_words:
            seen_words.add(word)
            idx2w.append(word)

In [None]:
# Encode each lemmatised dialog as a list of vocabulary ids.
# idx2w holds each word once, so a word -> id dict gives the same ids as idx2w.index(word)
# without scanning the whole vocabulary for every token.
w2idx = {word: idx for idx, word in enumerate(idx2w)}
X = [[w2idx[word] for word in dialog.split(' ')] for dialog in lem_data]

In [None]:
# Encode the label sequences as integer class ids.
# The table is seeded with the outside tag '0' followed by every known slot in sorted order,
# so it always has len(possible_slots) + 1 entries. The training cell sizes the softmax with
# that count and decodes predictions via idx2la[class_id]; a table built only from labels
# that happen to occur could be shorter and make that lookup fail. Sorting also makes the
# ids independent of set and shuffle order.
# NOTE(review): `y` is rebound from label strings to ids, so this cell must not be re-run
# without re-running the labelling cell first.
idx2la = ['0'] + sorted(possible_slots)
la2idx = {label: idx for idx, label in enumerate(idx2la)}
y_num = []
for line in y:
    y_line = []
    for slot in line:
        if slot not in la2idx:
            # Defensive: a label outside the seeded table still gets a fresh id.
            la2idx[slot] = len(idx2la)
            idx2la.append(slot)
        y_line.append(la2idx[slot])
    y_num.append(y_line)
y = y_num

In [None]:
#from gensim.models import Word2Vec

#model = Word2Vec.load('word2vec/w2v_model_tfidf_size300_window5_mc2.w2v')

#import re
# Список списков извлеченных из текстов слов
#wordList =  [[x.lower() for x in re.findall(r"[\w']+", y)] for y in lem_data]

# Объединение всех слов из выборки в один уникальный список
#unique_words = list(set([item for sublist in wordList for item in sublist]))

# Формирование списка векторов данных из word2vec
#X = []
#for dialog in data:
#    line = []
#    for word in dialog:
#        try:
#            line.append(model[word])
#        except:
#            line.append([0] * 300) # 300 - длина вектора в v2w
#    X.append(line)

# Обратное преобразование
#word=model.most_similar(positive=[model['тиньков']],topn=1)
#print(word[0][0])

# Solution

In [None]:
def conlleval(p, g, w, filename):
    '''
    Write predictions in the column format read by conlleval.pl and score them.

    INPUT:
    p :: predictions, one sequence of label strings per sentence
    g :: groundtruth, one sequence of label strings per sentence
    w :: corresponding words, one sequence of strings per sentence
    filename :: name of the file where the "word groundtruth prediction"
    lines are written. it will be the input of conlleval.pl script
    for computing the performance in terms of precision
    recall and f1 score

    OUTPUT:
    the result of get_perf(filename)
    '''
    lines = []
    for sent_labels, sent_preds, sent_words in zip(g, p, w):
        lines.append('BOS O O\n')
        # Distinct loop names: the inner loop used to rebind the parameter `w`.
        for label, pred, word in zip(sent_labels, sent_preds, sent_words):
            lines.append(word + ' ' + label + ' ' + pred + '\n')
        lines.append('EOS O O\n\n')

    # Explicit UTF-8: get_perf decodes the script output as UTF-8, and the words and
    # labels are Cyrillic, so the platform default encoding must not be used here.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(lines)

    return get_perf(filename)

In [None]:
def get_perf(filename):
    ''' run conlleval.pl perl script to obtain
    precision/recall and F1 score

    filename :: file in conlleval column format; its bytes are piped to the script.

    Returns {'p': precision, 'r': recall, 'f1': f1score} parsed from the script's
    summary line (the one containing 'accuracy').
    Raises RuntimeError when the output contains no such line.
    '''
    # Local imports: the notebook's import cell does not provide these names.
    import os
    import stat
    import urllib.request

    # PREFIX is an optional notebook-level location prefix for the script; default to the
    # working directory when the notebook does not define it.
    _conlleval = globals().get('PREFIX', '') + 'conlleval.pl'
    if not os.path.isfile(_conlleval):
        # Download to the same path that is checked and executed.
        urllib.request.urlretrieve(
            'https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl',
            _conlleval)
        os.chmod(_conlleval, stat.S_IRWXU) # give the execute permissions

    with open(filename, 'rb') as f:
        payload = f.read()
    proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(payload)

    out = None
    for line in stdout.decode("utf-8").split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break
    if out is None:
        raise RuntimeError('conlleval.pl produced no accuracy line for {}'.format(filename))

    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']
    # [:-2] strips the trailing '%;' from the percentage fields.
    precision = float(out[3][:-2])
    recall    = float(out[5][:-2])
    f1score   = float(out[7])

    return {'p':precision, 'r':recall, 'f1':f1score}


In [None]:
### Model
n_classes = len(possible_slots) + 1  # every slot plus the outside tag
n_vocab = len(idx2w)
N_TRAIN = 6000  # dialogs used for training; the remainder is the validation set

test = []

# Define model: embedding -> 1D convolution -> GRU -> per-token softmax
model = Sequential()
model.add(Embedding(n_vocab,100))
model.add(Convolution1D(64,5,border_mode='same', activation='relu'))
model.add(Dropout(0.25))
model.add(GRU(100,return_sequences=True))
model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
model.compile('rmsprop', 'categorical_crossentropy')

### Ground truths etc for conlleval
X_train, y_train, X_test, y_test = X[:N_TRAIN], y[:N_TRAIN], X[N_TRAIN:], y[N_TRAIN:]

words_val = [[idx2w[idx] for idx in sent] for sent in X_test]
groundtruth_val = [[idx2la[idx] for idx in labels] for labels in y_test]
words_train = [[idx2w[idx] for idx in sent] for sent in X_train]
groundtruth_train = [[idx2la[idx] for idx in labels] for labels in y_train]


def _run_pass(step_on_batch, sentences, labels):
    """Feed the dialogs to the model one per batch.

    step_on_batch is model.train_on_batch or model.test_on_batch.
    Returns (loss averaged over the batches that were stepped, list of predicted
    class-id arrays, one per dialog).
    """
    total_loss, n_steps, pred_labels = 0.0, 0, []
    bar = progressbar.ProgressBar(maxval=len(sentences))
    for sent, label in bar(zip(sentences, labels)):
        label = np.eye(n_classes)[label][np.newaxis,:]  # one-hot, batch of one
        sent = np.array(sent)[np.newaxis,:]

        if sent.shape[1] > 1: #some bug in keras
            total_loss += step_on_batch(sent, label)
            n_steps += 1

        pred = model.predict_on_batch(sent)
        pred_labels.append(np.argmax(pred,-1)[0])

    # Average over the steps actually taken (single-token dialogs are skipped above).
    return total_loss / max(n_steps, 1), pred_labels


### Training
n_epochs = 10

train_f_scores = []
val_f_scores = []
best_val_f1 = 0

for i in range(n_epochs):
    print("Epoch {}".format(i))

    print("Training =>")
    avgLoss, train_pred_label = _run_pass(model.train_on_batch, X_train, y_train)

    predword_train = [[idx2la[idx] for idx in seq] for seq in train_pred_label]
    test = predword_train
    con_dict = conlleval(predword_train, groundtruth_train, words_train, 'r.txt')
    train_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))


    print("Testing =>")
    avgLoss, val_pred_label = _run_pass(model.test_on_batch, X_test, y_test)

    predword_val = [[idx2la[idx] for idx in seq] for seq in val_pred_label]
    # conlleval needs label and word strings, not the integer-encoded y_test / X_test.
    con_dict = conlleval(predword_val, groundtruth_val, words_val, 'r.txt')
    val_f_scores.append(con_dict['f1'])

    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

    # Keep the architecture and weights of the best epoch by validation F1.
    if con_dict['f1'] > best_val_f1:
        best_val_f1 = con_dict['f1']
        with open('model_architecture.json','w') as arch_file:
            arch_file.write(model.to_json())
        model.save_weights('best_model_weights.h5',overwrite=True)
        print("Best validation F1 score = {}".format(best_val_f1))
    print()

In [None]:
#con_dict = conlleval(predword_train, groundtruth_train, words_train, 'r.txt')
predword_train

In [None]:
# The dictionary baseline and the span metrics below call str.find / string slicing on each
# chat and .items() on each answer, so they need the raw chats and slot dictionaries
# (`data`, `ans`) rather than the integer-encoded X / y prepared for the network.
# Same positional split as the network: first 6000 dialogs for fitting, the rest for testing.
X_train, y_train, X_test, y_test = data[:6000], ans[:6000], data[6000:], ans[6000:]

In [None]:
class SimpleSolutionModel:
    """Lookup baseline: remembers slot texts seen in training and finds them verbatim."""

    def __init__(self):
        # slot text -> slot title; a text seen again keeps the latest title
        self._text_to_slot = {}

    def fit(self, X, y):
        """Memorise the text of every slot in y (X is not used)."""
        for slots in y:
            self._text_to_slot.update(
                (info['text'], title) for title, info in slots.items()
            )

    def predict(self, X):
        """Return, for each chat in X, a dict title -> {'start_pos', 'end_pos', 'text'}."""
        return [self._find_slots(chat) for chat in X]

    def _find_slots(self, chat):
        """Locate the first occurrence of every memorised text inside one chat."""
        found = {}
        for text, title in self._text_to_slot.items():
            start = chat.find(text)
            if start == -1:
                continue
            # A later matching text with the same title replaces an earlier one.
            found[title] = {
                'start_pos': start,
                'end_pos': start + len(text),
                'text': text
            }
        return found

In [None]:
# Fit the dictionary baseline.
# NOTE(review): this rebinds `model`, which the training cell above bound to the Keras network.
model = SimpleSolutionModel()

model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

# Evaluation

In [None]:
def tokenize(token):
    """Return the token with every '.' character removed."""
    return token.replace('.', '')

def q_distance(tokens_test, tokens_pred):
    """Overlap of two token lists: |shared| / |all distinct| after stripping dots.

    Duplicates inside a list are ignored. Raises ZeroDivisionError when both
    lists are empty.
    """
    expected = {tokenize(tok) for tok in tokens_test}
    predicted = {tokenize(tok) for tok in tokens_pred}

    return len(expected & predicted) / len(expected | predicted)

def precision_on_dataset(X, y, y_pred):
    """
    Token-overlap precision of the predicted slots.

    X - array of chats
    y - per-chat hash with true slots { 'SLOT_NAME': { 'start_pos': 123, 'end_pos': 135 }, ... }
    y_pred - per-chat hash with predicted slots, same layout

    Every predicted slot adds 1 to the denominator; when its title is also present in
    the truth it adds q_distance between the true and predicted spans (both cut from
    the chat and split on spaces) to the numerator.
    Returns 0.0 when there are no predicted slots at all.
    """

    q_sum = 0
    total = 0

    for x_item, y_item, y_pred_item in tqdm(zip(X, y, y_pred)):
        for slot_title, y_pred_slot_info in y_pred_item.items():
            if slot_title in y_item:
                y_pred_tokens = x_item[y_pred_slot_info['start_pos']:y_pred_slot_info['end_pos']].split(' ')
                y_tokens = x_item[y_item[slot_title]['start_pos']:y_item[slot_title]['end_pos']].split(' ')

                q_sum += q_distance(y_tokens, y_pred_tokens)

            total += 1

    # Nothing predicted: report zero instead of dividing by zero.
    if total == 0:
        return 0.0
    return q_sum / total

def recall_on_dataset(X, y, y_pred):
    """
    Token-overlap recall of the predicted slots.

    X - array of chats
    y - per-chat hash with true slots { 'SLOT_NAME': { 'start_pos': 123, 'end_pos': 135 }, ... }
    y_pred - per-chat hash with predicted slots, same layout

    Every true slot adds 1 to the denominator; when a slot with the same title was
    predicted it adds q_distance between the true and predicted spans (both cut from
    the chat and split on spaces) to the numerator.
    Returns 0.0 when there are no true slots at all.
    """

    q_sum = 0
    total = 0

    for x_item, y_item, y_pred_item in tqdm(zip(X, y, y_pred)):
        for slot_title, y_slot_info in y_item.items():
            if slot_title in y_pred_item:
                # The predicted span must come from the prediction; taking it from the
                # true slot compared the truth with itself and always scored 1.0.
                y_pred_slot_info = y_pred_item[slot_title]
                y_pred_tokens = x_item[y_pred_slot_info['start_pos']:y_pred_slot_info['end_pos']].split(' ')
                y_tokens = x_item[y_slot_info['start_pos']:y_slot_info['end_pos']].split(' ')

                q_sum += q_distance(y_tokens, y_pred_tokens)

            total += 1

    # No annotated slots: report zero instead of dividing by zero.
    if total == 0:
        return 0.0
    return q_sum / total

def f1_on_dataset(X, y, y_pred):
    """Harmonic mean of precision_on_dataset and recall_on_dataset.

    Returns 0.0 when both precision and recall are zero (e.g. no predicted slot
    overlaps any true slot), where the harmonic mean is otherwise undefined.
    """
    precision = precision_on_dataset(X, y, y_pred)
    recall = recall_on_dataset(X, y, y_pred)

    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

In [None]:
f1_on_dataset(X_test, y_test, y_pred)

In [None]:
for i in range(len(ans)):
    print (data[i], ans[i])

In [None]:
len(data)

In [None]:
w2v = np.load('word2vec/w2v_model_tfidf_size300_window5_mc2.w2v.syn0.npy')


In [None]:
# NOTE(review): `Word2Vec` is never imported in this notebook (the only import of it is in a
# commented-out cell), so this cell raises NameError on a fresh kernel; it also rebinds `model`.
model = Word2Vec.load('word2vec/w2v_model_tfidf_size300_window5_mc2.w2v')

In [None]:
model.wv.most_similar_cosmul(positive=['тинькофф'])

In [None]:
print(data)

In [None]:
ans[1]

In [None]:
ans[1]['ВАЛЮТА']['text']

In [None]:
# NOTE(review): `wordList` is only assigned in commented-out lines of this notebook, so this
# cell raises NameError on a fresh kernel.
wordList[0]