In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json

from matplotlib import pyplot as plt

%matplotlib inline

ImportError: No module named tqdm

# Load data

In [8]:
# Read the raw slot-filling dataset into `data`. UTF-8 is requested explicitly
# instead of relying on the platform default encoding.
# NOTE: the `encoding` keyword of the built-in open() requires Python 3.
with open('slotfilling-data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

TypeError: 'encoding' is an invalid keyword argument for this function

In [None]:
def process_data(data):
    """Turn raw records into parallel arrays of chats and slot annotations.

    Each record must provide a 'chat' entry and an 'entities' list whose
    elements carry 'title', 'start_pos', 'end_pos' and 'text'.

    Returns a pair (X, y) of numpy arrays: X holds the chat of every record,
    y holds one dict per record mapping entity title ->
    {'start_pos', 'end_pos', 'text'}. When a record repeats a title, the
    last entity with that title wins.
    """
    chats = [record['chat'] for record in data]

    slots_per_chat = [
        {
            entity['title']: {
                'start_pos': entity['start_pos'],
                'end_pos': entity['end_pos'],
                'text': entity['text'],
            }
            for entity in record['entities']
        }
        for record in data
    ]

    return np.array(chats), np.array(slots_per_chat)

In [None]:
# Split the raw records into two parallel arrays: X holds the chat texts and
# y the per-chat slot dictionaries produced by process_data.
X, y = process_data(data)
# Bare last expression: the notebook displays the number of chats.
len(X)

In [None]:
# Mix the data
# One shared permutation is applied to both arrays so every chat stays aligned
# with its labels. The permutation comes from a dedicated fixed-seed generator:
# the train/test split is cut from this order, so an unseeded shuffle would
# make the reported score change on every run.
# Caveat: X and y are overwritten, so re-running this cell alone shuffles the
# already shuffled arrays again; re-run from the loading cell for a clean state.
perm = np.random.RandomState(42).permutation(len(X))
X, y = X[perm], y[perm]

In [None]:
# Gather every distinct slot title that appears anywhere in the labels.
possible_slots = {slot_title for y_item in y for slot_title in y_item}
possible_slots

In [None]:
# Show the first (chat, slots) pair as a quick sanity check of the arrays.
(X[0], y[0])

# Solution

In [2]:
# Hold-out split: the first 6000 examples are used for training and everything
# after them is kept for evaluation.
X_train, X_test = X[:6000], X[6000:]
y_train, y_test = y[:6000], y[6000:]

NameError: name 'X' is not defined

In [3]:
class SimpleSolutionModel:
    """Dictionary-lookup baseline for slot filling.

    Training memorises every slot text together with its slot title;
    prediction reports each memorised text that occurs as a substring of a
    chat.
    """

    def __init__(self):
        # slot text -> slot title; a text seen again replaces its old title.
        self._text_to_slot = {}

    def fit(self, X, y):
        """Memorise the slot texts found in the labels `y`.

        `X` is accepted but not used. `y` is an iterable of dicts mapping
        slot title -> info dict that contains a 'text' entry.
        """
        self._text_to_slot.update(
            (info['text'], title)
            for labels in y
            for title, info in labels.items()
        )

    def predict(self, X):
        """Return one dict per chat in `X`: slot title -> span info.

        The span info has 'start_pos', 'end_pos' and 'text' and refers to the
        first occurrence of the memorised text inside the chat.
        """
        return [self._find_known_slots(chat) for chat in X]

    def _find_known_slots(self, chat):
        """Look up every memorised slot text inside a single chat."""
        found = {}
        for text, title in self._text_to_slot.items():
            start = chat.find(text)
            if start == -1:
                continue
            # Several texts may share a title; the one visited last wins.
            found[title] = {
                'start_pos': start,
                'end_pos': start + len(text),
                'text': text,
            }
        return found

In [4]:
# Create the lookup baseline and let it memorise the slot texts that occur in
# the training labels.
model = SimpleSolutionModel()

model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [5]:
# Predict one slot dictionary for every held-out chat.
y_pred = model.predict(X_test)

NameError: name 'X_test' is not defined

# Evaluation

In [6]:
def tokenize(token):
    """Return `token` with every '.' character removed."""
    return ''.join(filter(lambda char: char != '.', token))

def q_distance(tokens_test, tokens_pred):
    """Jaccard similarity between two token lists.

    Both lists are normalised with tokenize() and compared as sets, so
    duplicates and order do not matter.

    tokens_test - tokens of the reference span
    tokens_pred - tokens of the predicted span

    Returns |test & pred| / |test | pred| as a float in [0, 1]. Two empty
    token lists are treated as identical and score 1.0.
    """
    test_set = {tokenize(token) for token in tokens_test}
    pred_set = {tokenize(token) for token in tokens_pred}

    # The union size equals common + false positives + false negatives.
    union_size = len(test_set | pred_set)
    if union_size == 0:
        # Nothing on either side: avoid 0 / 0 and report a perfect match.
        return 1.0

    return len(test_set & pred_set) / union_size

def precision_on_dataset(X, y, y_pred):
    """Soft precision of the predicted slots over a dataset.

    Every predicted slot counts once in the denominator. A predicted slot
    whose title also exists in the ground truth of the same chat contributes
    the q_distance between the tokens of the true span and the tokens of the
    predicted span; a predicted slot with an unknown title contributes 0.

    X - array of chats
    y - per-chat hash with true slots { 'SLOT_NAME': { 'start_pos': 123, 'end_pos': 135 }, ... }
    y_pred - per-chat hash with predicted slots, same layout as y

    Returns the accumulated score divided by the number of predicted slots,
    or 0.0 when no slot was predicted at all.
    """
    q_sum = 0
    total = 0

    for x_item, y_item, y_pred_item in tqdm(zip(X, y, y_pred)):
        for slot_title, pred_slot in y_pred_item.items():
            total += 1
            if slot_title not in y_item:
                continue

            true_slot = y_item[slot_title]
            # Both spans are cut out of the chat itself and split on spaces.
            pred_tokens = x_item[pred_slot['start_pos']:pred_slot['end_pos']].split(' ')
            true_tokens = x_item[true_slot['start_pos']:true_slot['end_pos']].split(' ')

            q_sum += q_distance(true_tokens, pred_tokens)

    if total == 0:
        # No predictions: precision is undefined, report 0.0 instead of 0 / 0.
        return 0.0

    return q_sum / total

def recall_on_dataset(X, y, y_pred):
    """Soft recall of the true slots over a dataset.

    Every true slot counts once in the denominator. A true slot whose title
    was also predicted for the same chat contributes the q_distance between
    the tokens of the true span and the tokens of the predicted span; a true
    slot that was not predicted contributes 0.

    X - array of chats
    y - per-chat hash with true slots { 'SLOT_NAME': { 'start_pos': 123, 'end_pos': 135 }, ... }
    y_pred - per-chat hash with predicted slots, same layout as y

    Returns the accumulated score divided by the number of true slots, or
    0.0 when there is no true slot at all.
    """
    q_sum = 0
    total = 0

    for x_item, y_item, y_pred_item in tqdm(zip(X, y, y_pred)):
        for slot_title, true_slot in y_item.items():
            total += 1
            if slot_title not in y_pred_item:
                continue

            # The predicted span must come from y_pred_item. Cutting it with
            # the true positions would compare the true span with itself and
            # score every title match as a perfect 1.0.
            pred_slot = y_pred_item[slot_title]
            pred_tokens = x_item[pred_slot['start_pos']:pred_slot['end_pos']].split(' ')
            true_tokens = x_item[true_slot['start_pos']:true_slot['end_pos']].split(' ')

            q_sum += q_distance(true_tokens, pred_tokens)

    if total == 0:
        # No true slots: recall is undefined, report 0.0 instead of 0 / 0.
        return 0.0

    return q_sum / total

def f1_on_dataset(X, y, y_pred):
    """F1 score: harmonic mean of precision_on_dataset and recall_on_dataset.

    X - array of chats
    y - per-chat hash with true slots
    y_pred - per-chat hash with predicted slots

    Returns 2 * precision * recall / (precision + recall), or 0.0 when both
    precision and recall are 0 (the harmonic mean is undefined there).
    """
    precision = precision_on_dataset(X, y, y_pred)
    recall = recall_on_dataset(X, y, y_pred)

    if precision + recall == 0:
        # Nothing matched at all: report 0.0 instead of dividing by zero.
        return 0.0

    return 2 * precision * recall / (precision + recall)

In [7]:
# Score the baseline predictions against the held-out labels; the returned F1
# value is displayed as the cell output.
f1_on_dataset(X_test, y_test, y_pred)

NameError: name 'X_test' is not defined