In [406]:
import pandas as pd

In [407]:
from textblob import TextBlob

In [408]:
import logging
import pprint
from rasa_nlu.training_data import load_data
from rasa_nlu import config
from rasa_nlu.model import Trainer
from rasa_nlu.model import Interpreter
from rasa_nlu.test import run_evaluation

In [418]:
def _intents_to_markdown(data):
    """Render a ``{text: intent}`` mapping as Rasa NLU markdown training data.

    Each intent becomes a ``## intent:<name>`` section followed by one
    ``- <text>`` bullet per example. Sections are separated by a blank line
    and appear in sorted intent order (pandas ``groupby`` sorts its keys).
    """
    df_data = pd.DataFrame(list(data.items()), columns=['text', 'intent'])
    sections = [
        '## intent:' + intent + '\n' + '\n'.join('- ' + text for text in texts)
        for intent, texts in df_data.groupby('intent')['text']
    ]
    return '\n\n'.join(sections)


def build_rasa(basedir, data, config_str):
    """Train, persist, evaluate and load a Rasa NLU model from in-memory examples.

    Parameters
    ----------
    basedir : str
        Working directory. Receives ``nlu_model.log``, ``data/nlu.md``,
        ``config.yml`` and the ``models/`` tree. It is created if missing and
        may be given with or without a trailing slash.
    data : dict
        Mapping of example text -> intent name.
    config_str : str
        Contents of the Rasa NLU pipeline configuration (YAML).

    Returns
    -------
    Interpreter
        The interpreter loaded from the directory the trainer persisted to.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    # Function-scope import keeps this cell self-contained; the notebook's
    # import cells do not bring in `os`.
    import os

    if not data:
        raise ValueError('data must contain at least one text -> intent example')

    # Create the directory tree up front: neither open(..., "w") nor
    # logging.basicConfig(filename=...) creates missing parent directories.
    data_dir = os.path.join(basedir, 'data')
    os.makedirs(data_dir, exist_ok=True)
    nlu_path = os.path.join(data_dir, 'nlu.md')
    config_path = os.path.join(basedir, 'config.yml')

    logging.basicConfig(
        filename=os.path.join(basedir, 'nlu_model.log'),
        level=logging.INFO,
    )

    # Write explicitly as UTF-8: the examples may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to handle them.
    with open(nlu_path, 'w', encoding='utf-8') as text_file:
        print(_intents_to_markdown(data), file=text_file)

    with open(config_path, 'w', encoding='utf-8') as text_file:
        print(config_str, file=text_file)

    # train
    training_data = load_data(nlu_path)
    trainer = Trainer(config.load(config_path))
    trainer.train(training_data)
    model_directory = trainer.persist(
        os.path.join(basedir, 'models'),
        project_name='current',
        fixed_model_name='nlu',
    )

    # The evaluation runs on the same file that was used for training, so the
    # reported scores describe fit on the training set, not generalisation.
    run_evaluation(nlu_path, model_directory)

    # Load from the path the trainer reported instead of re-deriving it.
    return Interpreter.load(model_directory)

In [419]:
# Training examples as phrase -> intent, built from phrases grouped by the
# intent label they belong to. Insertion order: co:ltd, co:plc, sh.
dict_data = {
    phrase: intent
    for intent, phrases in (
        ('co:ltd', (
            'private company',
            'limited company',
            'private limited company',
            'limited liability company',
            'gmbh',
            'beschränkter haftung',
        )),
        ('co:plc', (
            'publicly traded company',
            'public company',
            'public limited company',
            'plc',
            'ag',
            'aktiengesellschaft',
        )),
        ('sh', (
            'shareholder',
            'stockholder',
            'mehrheitseigner',
        )),
    )
    for phrase in phrases
}

# Rasa NLU pipeline configuration, written verbatim to config.yml by build_rasa.
config_str = """
language: en

pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "CountVectorsFeaturizer"
- name: "CountVectorsFeaturizer"
  analyzer: "char_wb"
  min_ngram: 1
  max_ngram: 6
- name: "CountVectorsFeaturizer"
  analyzer: "word"
  min_ngram: 1
  max_ngram: 3
- name: "EmbeddingIntentClassifier"
"""

In [421]:
# Train on the phrase/intent examples under ./rasa/ and keep the loaded
# interpreter for the cells that follow.
interpreter = build_rasa(
    basedir='./rasa/',
    data=dict_data,
    config_str=config_str,
)


Epochs:   0%|          | 0/300 [00:00<?, ?it/s][A
Epochs:   0%|          | 0/300 [00:00<?, ?it/s, loss=1.462, acc=0.800][A
Epochs:   0%|          | 1/300 [00:00<02:42,  1.84it/s, loss=1.462, acc=0.800][A
Epochs:   0%|          | 1/300 [00:00<02:42,  1.84it/s, loss=1.156, acc=0.800][A
Epochs:   0%|          | 1/300 [00:00<02:42,  1.84it/s, loss=1.016, acc=0.800][A
Epochs:   0%|          | 1/300 [00:00<02:42,  1.84it/s, loss=0.750, acc=0.800][A
Epochs:   0%|          | 1/300 [00:00<02:42,  1.84it/s, loss=0.718, acc=0.800][A
Epochs:   2%|▏         | 5/300 [00:00<01:54,  2.58it/s, loss=0.718, acc=0.800][A
Epochs:   2%|▏         | 5/300 [00:00<01:54,  2.58it/s, loss=0.680, acc=0.800][A
Epochs:   2%|▏         | 5/300 [00:00<01:54,  2.58it/s, loss=0.600, acc=0.800][A
Epochs:   2%|▏         | 5/300 [00:00<01:54,  2.58it/s, loss=0.552, acc=0.800][A
Epochs:   2%|▏         | 5/300 [00:00<01:54,  2.58it/s, loss=0.541, acc=0.800][A
Epochs:   3%|▎         | 9/300 [00:00<01:21,  3.56it/s

Epochs:  25%|██▌       | 75/300 [00:03<00:07, 30.61it/s, loss=0.301, acc=1.000][A
Epochs:  25%|██▌       | 75/300 [00:03<00:07, 30.61it/s, loss=0.299, acc=1.000][A
Epochs:  25%|██▌       | 75/300 [00:03<00:07, 30.61it/s, loss=0.297, acc=1.000][A
Epochs:  25%|██▌       | 75/300 [00:03<00:07, 30.61it/s, loss=0.295, acc=1.000][A
Epochs:  26%|██▋       | 79/300 [00:03<00:07, 29.88it/s, loss=0.295, acc=1.000][A
Epochs:  26%|██▋       | 79/300 [00:03<00:07, 29.88it/s, loss=0.293, acc=1.000][A
Epochs:  26%|██▋       | 79/300 [00:03<00:07, 29.88it/s, loss=0.291, acc=1.000][A
Epochs:  26%|██▋       | 79/300 [00:03<00:07, 29.88it/s, loss=0.289, acc=1.000][A
Epochs:  26%|██▋       | 79/300 [00:03<00:07, 29.88it/s, loss=0.287, acc=1.000][A
Epochs:  28%|██▊       | 83/300 [00:03<00:07, 29.72it/s, loss=0.287, acc=1.000][A
Epochs:  28%|██▊       | 83/300 [00:03<00:07, 29.72it/s, loss=0.285, acc=1.000][A
Epochs:  28%|██▊       | 83/300 [00:03<00:07, 29.72it/s, loss=0.283, acc=1.000][A
Epoc

Epochs:  51%|█████     | 152/300 [00:05<00:04, 33.34it/s, loss=0.185, acc=1.000][A
Epochs:  51%|█████     | 152/300 [00:05<00:04, 33.34it/s, loss=0.184, acc=1.000][A
Epochs:  51%|█████     | 152/300 [00:05<00:04, 33.34it/s, loss=0.183, acc=1.000][A
Epochs:  52%|█████▏    | 156/300 [00:05<00:05, 28.45it/s, loss=0.183, acc=1.000][A
Epochs:  52%|█████▏    | 156/300 [00:05<00:05, 28.45it/s, loss=0.182, acc=1.000][A
Epochs:  52%|█████▏    | 156/300 [00:05<00:05, 28.45it/s, loss=0.185, acc=1.000][A
Epochs:  52%|█████▏    | 156/300 [00:05<00:05, 28.45it/s, loss=0.179, acc=1.000][A
Epochs:  52%|█████▏    | 156/300 [00:06<00:05, 28.45it/s, loss=0.178, acc=1.000][A
Epochs:  53%|█████▎    | 160/300 [00:06<00:04, 30.42it/s, loss=0.178, acc=1.000][A
Epochs:  53%|█████▎    | 160/300 [00:06<00:04, 30.42it/s, loss=0.181, acc=1.000][A
Epochs:  53%|█████▎    | 160/300 [00:06<00:04, 30.42it/s, loss=0.176, acc=1.000][A
Epochs:  53%|█████▎    | 160/300 [00:06<00:04, 30.42it/s, loss=0.175, acc=1.

Epochs:  76%|███████▌  | 228/300 [00:08<00:02, 33.04it/s, loss=0.117, acc=1.000][A
Epochs:  77%|███████▋  | 232/300 [00:08<00:02, 29.69it/s, loss=0.117, acc=1.000][A
Epochs:  77%|███████▋  | 232/300 [00:08<00:02, 29.69it/s, loss=0.117, acc=1.000][A
Epochs:  77%|███████▋  | 232/300 [00:08<00:02, 29.69it/s, loss=0.116, acc=1.000][A
Epochs:  77%|███████▋  | 232/300 [00:08<00:02, 29.69it/s, loss=0.118, acc=1.000][A
Epochs:  77%|███████▋  | 232/300 [00:08<00:02, 29.69it/s, loss=0.115, acc=1.000][A
Epochs:  79%|███████▊  | 236/300 [00:08<00:02, 30.78it/s, loss=0.115, acc=1.000][A
Epochs:  79%|███████▊  | 236/300 [00:08<00:02, 30.78it/s, loss=0.114, acc=1.000][A
Epochs:  79%|███████▊  | 236/300 [00:08<00:02, 30.78it/s, loss=0.114, acc=1.000][A
Epochs:  79%|███████▊  | 236/300 [00:08<00:02, 30.78it/s, loss=0.113, acc=1.000][A
Epochs:  79%|███████▊  | 236/300 [00:08<00:02, 30.78it/s, loss=0.112, acc=1.000][A
Epochs:  80%|████████  | 240/300 [00:08<00:01, 31.63it/s, loss=0.112, acc=1.

In [422]:
# Sample query fed to the trained interpreter.
# NOTE(review): "bh" and "ppublic" look like deliberate misspellings to probe
# the character n-gram features — confirm before "correcting" the text.
question = "Can we bh shareholders: ppublic limited ?"

# Last expression of the cell, so the parse result is displayed as the output.
interpreter.parse(question)

{'intent': {'name': 'co:plc', 'confidence': 0.7614661455154419},
 'entities': [],
 'intent_ranking': [{'name': 'co:plc', 'confidence': 0.7614661455154419},
  {'name': 'sh', 'confidence': 0.5860928893089294},
  {'name': 'co:ltd', 'confidence': 0.0}],
 'text': 'Can we bh shareholders: ppublic limited ?'}

In [186]:
# Minimum confidence an intent needs in order to be flagged in `out`.
min_intent_conf = 0.3

# One indicator slot per label, every slot initialised to 0.
out = pd.Series(0, index=['question', 'lang:en', 'lang:de', 'co:ltd', 'co:plc', 'sh'])

# NOTE(review): TextBlob.detect_language() relies on an external translation
# service and is reported as deprecated in newer TextBlob releases — confirm it
# is available in this environment.
det_lang = TextBlob(question).detect_language()
# The membership test previously used `df_out`, which is not defined anywhere
# in this notebook (NameError on a fresh kernel); the index to check is `out`'s.
if 'lang:' + det_lang in out.index:
    out['lang:' + det_lang] = 1

# Flag every intent whose confidence reaches the threshold.
df_intents = pd.DataFrame(interpreter.parse(question)['intent_ranking'])
pred_intents = df_intents.loc[df_intents['confidence'] >= min_intent_conf, 'name']
out.loc[pred_intents.tolist()] = 1

out

question    0
lang:en     1
lang:de     0
co:ltd      1
co:plc      0
sh          1
dtype: int64