In [8]:
import pandas as pd
import os

In [3]:
from textblob import TextBlob

In [4]:
import logging
import pprint
from rasa_nlu.training_data import load_data
from rasa_nlu import config
from rasa_nlu.model import Trainer
from rasa_nlu.model import Interpreter
from rasa_nlu.test import run_evaluation

In [43]:
# Training phrases are stored as {text: intent}. Because dict keys are unique, a phrase
# cannot be listed twice, neither under the same intent nor under two different ones.
data = {
    'en': {
        # intent co:ltd
        'private company': 'co:ltd',
        'limited company': 'co:ltd',
        'private limited company': 'co:ltd',
        'limited liability company': 'co:ltd',

        # intent co:plc
        'publicly traded company': 'co:plc',
        'public company': 'co:plc',
        'public limited company': 'co:plc',

        # intent sh
        'shareholder': 'sh',
        'stockholder': 'sh',
    },
    'de': {
        # intent co:ltd
        'gmbh': 'co:ltd',
        'beschränkter haftung': 'co:ltd',

        # intent co:plc
        'plc': 'co:plc',
        'ag': 'co:plc',
        'aktiengesellschaft': 'co:plc',

        # intent sh
        'mehrheitseigner': 'sh',
    },
}
# Per-language aliases pointing at the very same dict objects held in `data`.
data_en = data['en']
data_de = data['de']

# Rasa NLU pipeline configuration template; `{lang}` is filled in with str.format().
config_str = '''
language: {lang}

pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "CountVectorsFeaturizer"
- name: "CountVectorsFeaturizer"
  analyzer: "char_wb"
  min_ngram: 1
  max_ngram: 6
- name: "CountVectorsFeaturizer"
  analyzer: "word"
  min_ngram: 1
  max_ngram: 3
- name: "EmbeddingIntentClassifier"
'''

In [63]:
class RasaClassifier:
    """Train and query a Rasa NLU intent classifier for one language.

    Constructing an instance writes the training examples and the pipeline
    configuration below ``base_dir``. ``train()`` fits and persists the model;
    ``predict_proba()`` and ``predict()`` then score a question against the
    intents found in the training data.
    """

    # Language codes reported by predict_proba() as ``lang:<code>`` indicator entries.
    LANGUAGES = ('en', 'de')

    def __init__(self, base_dir, data, config_str, verbose=0):
        """Create the directory layout and write the training and config files.

        Args:
            base_dir: Directory that receives ``data/nlu.md``, ``config.yml``,
                ``nlu_model.log`` and the ``models/`` tree.
            data: Mapping of example text -> intent name.
            config_str: Content written verbatim to ``config.yml``.
            verbose: When > 0, report successful directory creation.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if not data:
            raise ValueError("data must map at least one example text to an intent")

        self.base_dir = base_dir
        self.data_dir = os.path.join(self.base_dir, 'data')
        self.models_dir = os.path.join(self.base_dir, 'models')
        self.project_name = 'current'
        self.model_name = 'nlu'
        self.last_model_dir = os.path.join(self.models_dir, self.project_name, self.model_name)
        self.log_file = os.path.join(self.base_dir, 'nlu_model.log')
        self.nlu_file = os.path.join(self.data_dir, 'nlu.md')
        self.config_file = os.path.join(self.base_dir, 'config.yml')
        # Set by train(); the predict methods refuse to run while this is None.
        self.interpreter = None

        # TODO check if a model already exists and give the option to overwrite
        try:
            for d in (self.data_dir, self.last_model_dir):
                # exist_ok=True: constructing a classifier over an existing tree (e.g. after
                # re-running the notebook) must still create whatever is missing.
                os.makedirs(d, exist_ok=True)
        except OSError:
            print("Creation of the directory structure {} failed".format(base_dir))
        else:
            if verbose > 0:
                print("Successfully created the base directory structure {}".format(base_dir))

        # NOTE: basicConfig() does nothing when the root logger already has handlers,
        # so only the first classifier created in a process gets its own log file.
        logging.basicConfig(filename=self.log_file, level=logging.INFO)

        # Group the example texts by intent, keeping first-appearance order.
        examples_by_intent = {}
        for text, intent in data.items():
            examples_by_intent.setdefault(intent, []).append(text)
        # Intent labels reported by predict_proba(), in order of first appearance.
        self.intent_names = list(examples_by_intent)

        # One markdown section per intent, sections ordered by intent name.
        sections = [
            '## intent:' + intent + '\n' + '\n'.join('- ' + t for t in texts)
            for intent, texts in sorted(examples_by_intent.items())
        ]
        # Explicit UTF-8 so non-ASCII examples are not written in a platform-specific encoding.
        with open(self.nlu_file, "w", encoding="utf-8") as text_file:
            print('\n\n'.join(sections), file=text_file)

        with open(self.config_file, "w", encoding="utf-8") as text_file:
            print(config_str, file=text_file)

    def train(self, verbose=0):
        """Train on the written data, persist the model and load it for prediction.

        Args:
            verbose: When > 0, print the directory the model was persisted to.
        """
        training_data = load_data(self.nlu_file)
        trainer = Trainer(config.load(self.config_file))
        trainer.train(training_data)
        model_directory = trainer.persist(
            self.models_dir,
            project_name=self.project_name,
            fixed_model_name=self.model_name
        )
        if verbose > 0:
            print("Model persisted to {}".format(model_directory))
        self.interpreter = Interpreter.load(self.last_model_dir)

    def _detect_language(self, question):
        """Return the language code TextBlob detects for ``question``."""
        # NOTE(review): detect_language() appears to depend on an online service and to be
        # deprecated in newer TextBlob releases - confirm against the installed version.
        return TextBlob(question).detect_language()

    def predict_proba(self, question, verbose=0):
        """Score ``question`` against every known intent.

        Args:
            question: Text to classify.
            verbose: Currently unused.

        Returns:
            A float ``pd.Series`` indexed by ``'question'`` (placeholder, always 0),
            one ``'lang:<code>'`` indicator per entry of ``LANGUAGES`` (1 for the
            detected language, otherwise 0) and one confidence per intent.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        interpreter = getattr(self, 'interpreter', None)
        if interpreter is None:
            raise RuntimeError("No trained model available: call train() before predicting")

        labels = ['question'] + ['lang:' + code for code in self.LANGUAGES] + list(self.intent_names)
        # Start from float zeros so the confidences never have to be cast into an int Series.
        scores = dict.fromkeys(labels, 0.0)

        # detect language; languages outside LANGUAGES leave every indicator at 0
        lang_label = 'lang:' + self._detect_language(question)
        if lang_label in scores:
            scores[lang_label] = 1.0

        # get confidence; assigning by name keeps each value with its own intent label
        for ranked in interpreter.parse(question)['intent_ranking']:
            scores[ranked['name']] = float(ranked['confidence'])

        return pd.Series(scores, dtype=float)

    def predict(self, question, min_intent_conf, verbose=0):
        """Binarise the scores of ``predict_proba()``.

        Args:
            question: Text to classify.
            min_intent_conf: Threshold; entries >= this value become 1, all others 0.
            verbose: Forwarded to ``predict_proba()``.

        Returns:
            A float ``pd.Series`` of 0/1 values with the same index as ``predict_proba()``.
        """
        proba = self.predict_proba(question, verbose=verbose)
        # Single comparison against the untouched scores instead of two in-place passes.
        return (proba >= min_intent_conf).astype(float)
    

    # TODO score() function
    


In [64]:
# Build and train one classifier per language, keyed by language code.
cls = dict()
for lang in ('de', 'en'):
    lang_config = config_str.format(lang=lang)
    cls[lang] = RasaClassifier('./rasa_' + lang, data[lang], lang_config)
    cls[lang].train()

  self.MIN_EXAMPLES_PER_INTENT))
Epochs: 100%|██████████| 300/300 [00:04<00:00, 93.39it/s, loss=0.110, acc=1.000]
100%|██████████| 6/6 [00:00<00:00, 55.10it/s]
Epochs: 100%|██████████| 300/300 [00:04<00:00, 70.02it/s, loss=0.095, acc=1.000]
100%|██████████| 9/9 [00:00<00:00, 53.09it/s]


In [66]:
# Spot-check the German classifier: shows one score per output label for a sample question.
cls['de'].predict_proba('Kann ich Gesellschafter einer GmbH sein')

question    0.000000
lang:en     0.000000
lang:de     1.000000
co:ltd      0.787192
co:plc      0.422404
sh          0.213260
dtype: float64