In [8]:
import pandas as pd
import os

In [3]:
from textblob import TextBlob

In [4]:
import logging
import pprint
from rasa_nlu.training_data import load_data
from rasa_nlu import config
from rasa_nlu.model import Trainer
from rasa_nlu.model import Interpreter
from rasa_nlu.test import run_evaluation

In [37]:
class RasaClassifier:
    
    
    def __init__(self, base_dir, data, config_str, verbose=0):
        self.base_dir = base_dir
        self.data_dir = os.path.join(self.base_dir, 'data')
        self.models_dir = os.path.join(self.base_dir, 'models')
        self.project_name = 'current'
        self.model_name = 'nlu'
        self.last_model_dir = os.path.join(self.models_dir, self.project_name, self.model_name)
        self.log_file = os.path.join(self.base_dir, 'nlu_model.log')
        self.nlu_file = os.path.join(self.data_dir, 'nlu.md')
        self.config_file = os.path.join(self.base_dir, 'config.yml')

        try:
            for d in [self.data_dir, self.last_model_dir]:
                os.makedirs(d)
        except OSError:
            print("Creation of the directory structure {} failed".format(base_dir))
        else:
            if verbose > 0:
                print("Successfully created the base directory structure {}".format(base_dir))

        logging.basicConfig(filename=self.log_file, level=logging.INFO)

        df_data = pd.DataFrame.from_dict(list(data.items()))
        df_data.columns = ['text', 'intent']

        i = df_data['intent'].unique()[0]
        series_intents = df_data.groupby('intent')['text'].apply(
            lambda texts: '## intent:' + texts.name + '\n' + '\n'.join(['- ' + t for t in texts])
        )
        intents = '\n\n'.join(series_intents)
        with open(self.nlu_file, "w") as text_file:
            print(intents, file=text_file)

        with open(self.config_file, "w") as text_file:
            print(config_str, file=text_file)

    def train(self, verbose=0):
        training_data = load_data(self.nlu_file)
        trainer = Trainer(config.load(self.config_file))
        trainer.train(training_data)
        model_directory = trainer.persist(
            self.models_dir, 
            project_name=self.project_name, 
            fixed_model_name=self.model_name
        )

        return(Interpreter.load(self.last_model_dir))
    
o = RasaClassifier('./rasa_en/', data['en'], config_str)

In [38]:
o.train()

Epochs: 100%|██████████| 300/300 [00:04<00:00, 82.37it/s, loss=0.100, acc=1.000]
100%|██████████| 9/9 [00:00<00:00, 75.69it/s]


<rasa_nlu.model.Interpreter at 0x7fcafac4fdd8>

In [22]:
# This format avoids some text appearing twice, linked to either the same or other intent.
data_en = {
    'private company': 'co:ltd',
    'limited company': 'co:ltd',
    'private limited company': 'co:ltd',
    'limited liability company': 'co:ltd',
    
    'publicly traded company': 'co:plc',
    'public company': 'co:plc',
    'public limited company': 'co:plc',
    
    'shareholder': 'sh',
    'stockholder': 'sh'
}
data_de = {
    'gmbh': 'co:ltd',
    'beschränkter haftung': 'co:ltd',
    
    'plc': 'co:plc',
    'ag': 'co:plc',
    'aktiengesellschaft': 'co:plc',
    
    'mehrheitseigner': 'sh',
}
data = {'en': data_en, 'de': data_de}

config_str = '''
language: {}

pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "CountVectorsFeaturizer"
- name: "CountVectorsFeaturizer"
  analyzer: "char_wb"
  min_ngram: 1
  max_ngram: 6
- name: "CountVectorsFeaturizer"
  analyzer: "word"
  min_ngram: 1
  max_ngram: 3
- name: "EmbeddingIntentClassifier"
'''

In [23]:
interpreter = build_rasa('./rasa_en/', data['en'], config_str)

Successfully created the base directory structure ./rasa_en/


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Epochs: 100%|██████████| 300/300 [00:04<00:00, 72.42it/s, loss=0.098, acc=1.000]
100%|██████████| 9/9 [00:00<00:00, 72.64it/s]


In [24]:
question = "Can we bh shareholders: ppublic limited gmbh ?"

interpreter.parse(question)

{'intent': {'name': 'co:plc', 'confidence': 0.7435868978500366},
 'entities': [],
 'intent_ranking': [{'name': 'co:plc', 'confidence': 0.7435868978500366},
  {'name': 'sh', 'confidence': 0.5338120460510254},
  {'name': 'co:ltd', 'confidence': 0.016869664192199707}],
 'text': 'Can we bh shareholders: ppublic limited gmbh ?'}

In [186]:
min_intent_conf = 0.3

out = pd.Series(0, index=['question', 'lang:en', 'lang:de', 'co:ltd', 'co:plc', 'sh'])

det_lang = TextBlob(question).detect_language()
if 'lang:' + det_lang in df_out.index:
    out['lang:' + det_lang] = 1

df_intents = pd.DataFrame.from_dict(interpreter.parse(question)['intent_ranking'])
pred_intents = df_intents[df_intents['confidence'] >= min_intent_conf]['name']
out[pred_intents] = 1

out

question    0
lang:en     1
lang:de     0
co:ltd      1
co:plc      0
sh          1
dtype: int64