In [314]:
import numpy as np
import pandas as pd
import os
import shutil

In [3]:
from textblob import TextBlob

In [4]:
import logging
import pprint
from rasa_nlu.training_data import load_data
from rasa_nlu import config
from rasa_nlu.model import Trainer
from rasa_nlu.model import Interpreter
from rasa_nlu.test import run_evaluation

In [43]:
# Training phrases per language, stored as {phrase: intent tag}.
# Keying on the phrase means a phrase cannot be listed twice, neither under
# the same intent nor under two different ones.
data = {
    'en': {
        # private / limited companies
        'private company': 'co:ltd',
        'limited company': 'co:ltd',
        'private limited company': 'co:ltd',
        'limited liability company': 'co:ltd',
        # public companies
        'publicly traded company': 'co:plc',
        'public company': 'co:plc',
        'public limited company': 'co:plc',
        # shareholders
        'shareholder': 'sh',
        'stockholder': 'sh',
    },
    'de': {
        # private / limited companies
        'gmbh': 'co:ltd',
        'beschränkter haftung': 'co:ltd',
        # public companies
        'plc': 'co:plc',
        'ag': 'co:plc',
        'aktiengesellschaft': 'co:plc',
        # shareholders
        'mehrheitseigner': 'sh',
    },
}

# Per-language aliases of the entries above.
data_en = data['en']
data_de = data['de']

# Rasa NLU pipeline template; `{lang}` is filled in with str.format() once per language.
config_str = """
language: {lang}

pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "CountVectorsFeaturizer"
- name: "CountVectorsFeaturizer"
  analyzer: "char_wb"
  min_ngram: 1
  max_ngram: 6
- name: "CountVectorsFeaturizer"
  analyzer: "word"
  min_ngram: 1
  max_ngram: 3
- name: "EmbeddingIntentClassifier"
"""

In [579]:
class RasaLangClassifier:
    """Intent classifier for a single language, backed by a Rasa NLU pipeline.

    On construction the classifier creates its working directory and writes
    the training phrases (Rasa markdown format) and the pipeline configuration
    to disk. `train()` fits and persists the model; `predict_proba()` and
    `predict()` return one row per question and one column per intent tag.

    Parameters
    ----------
    base_dir : str
        Directory that will hold the data, config, log and model files.
    data : dict
        Mapping ``{phrase: intent tag}`` used as training examples.
    config_str : str
        Contents of the Rasa NLU configuration file.
    overwrite : bool, default False
        If True, an existing `base_dir` is deleted and recreated.
    verbose : int, default 0
        Values > 0 print a confirmation once the directories exist.

    Raises
    ------
    OSError
        If `base_dir` already exists and `overwrite` is False.
    """

    def __init__(self, base_dir, data, config_str, overwrite=False, verbose=0):
        self.verbose = verbose
        self.base_dir = base_dir
        self.data_dir = os.path.join(self.base_dir, 'data')
        self.models_dir = os.path.join(self.base_dir, 'models')
        self.project_name = 'current'
        self.model_name = 'nlu'
        self.last_model_dir = os.path.join(self.models_dir, self.project_name, self.model_name)
        self.log_file = os.path.join(self.base_dir, 'nlu_model.log')
        self.nlu_file = os.path.join(self.data_dir, 'nlu.md')
        self.config_file = os.path.join(self.base_dir, 'config.yml')
        # Output columns: every intent tag present in the training data,
        # sorted so the column order is deterministic.
        self.tags = sorted(set(data.values()))
        # Set by train(); None marks the classifier as untrained.
        self.interpreter = None

        # check if a model already exists and pre-delete if overwrite flag active
        if os.path.exists(self.base_dir):
            if not overwrite:
                raise OSError('Base directory for classifier already exists.')
            shutil.rmtree(self.base_dir)
        for directory in (self.data_dir, self.last_model_dir):
            os.makedirs(directory)
        if self.verbose > 0:
            print("Successfully created base directory structure {}".format(base_dir))

        # Configure file logging only now: basicConfig opens the log file
        # immediately, so the directory must already exist (and must not be
        # wiped afterwards). It has no effect if the root logger already has
        # handlers.
        logging.basicConfig(filename=self.log_file, level=logging.INFO)

        # Write explicitly as UTF-8; phrases may contain non-ASCII characters.
        with open(self.nlu_file, "w", encoding="utf-8") as text_file:
            text_file.write(self._to_rasa_markdown(data) + '\n')

        # write configuration to config file
        with open(self.config_file, "w", encoding="utf-8") as text_file:
            text_file.write(config_str + '\n')

    @staticmethod
    def _to_rasa_markdown(data):
        """Render ``{phrase: intent}`` as Rasa NLU markdown, one section per intent.

        Intents are emitted in sorted order; phrases keep their order in `data`.
        """
        phrases_by_intent = {}
        for text, intent in data.items():
            phrases_by_intent.setdefault(intent, []).append(text)
        sections = [
            '## intent:' + intent + '\n' + '\n'.join('- ' + text for text in phrases_by_intent[intent])
            for intent in sorted(phrases_by_intent)
        ]
        return '\n\n'.join(sections)

    def train(self):
        """Train the pipeline on the written data, persist it and load the interpreter."""
        training_data = load_data(self.nlu_file)
        trainer = Trainer(config.load(self.config_file))
        trainer.train(training_data)
        model_directory = trainer.persist(
            self.models_dir,
            project_name=self.project_name,
            fixed_model_name=self.model_name
        )
        # Load from the path persist() reports rather than re-deriving it.
        self.interpreter = Interpreter.load(model_directory)

    def _intent_confidences(self, question):
        """Return ``{tag: confidence}`` for one question (0.0 for tags not ranked).

        Raises AttributeError if the classifier has not been trained.
        """
        interpreter = getattr(self, 'interpreter', None)
        if interpreter is None:
            raise AttributeError('The model needs to be trained first.')
        scores = dict.fromkeys(self.tags, 0.0)
        for ranked in interpreter.parse(question)['intent_ranking']:
            # Assign by name explicitly instead of relying on pandas
            # Series-to-Series assignment, whose alignment rules differ
            # between pandas versions. Names outside self.tags are ignored.
            if ranked['name'] in scores:
                scores[ranked['name']] = ranked['confidence']
        return scores

    def predict_proba(self, X):
        """Return intent confidences for one question or an iterable of questions.

        Returns a DataFrame with one row per question (index 0..n-1) and one
        column per entry of `self.tags`.

        Raises
        ------
        AttributeError
            If a question is scored before `train()` was called.
        """
        questions = [X] if np.isscalar(X) else list(X)
        rows = [self._intent_confidences(question) for question in questions]
        return pd.DataFrame(rows, columns=self.tags)

    def predict(self, X, min_intent_conf):
        """Return 1.0 where the confidence is >= `min_intent_conf`, else 0.0.

        Same layout as `predict_proba`: one row per question, indexed 0..n-1
        (rows are no longer all labelled 0), one column per tag. Empty input
        yields an empty frame instead of raising.
        """
        proba = self.predict_proba(X)
        return (proba >= min_intent_conf).astype(float)

In [590]:
class RasaClassifier:
    """Multi-language intent classifier that delegates to one `RasaLangClassifier` per language.

    Each question is language-detected and scored by the classifier of that
    language. The result has one row per question, in input order, holding
    the per-language classifier's columns followed by one indicator column
    ``lang:<code>`` per configured language.

    Parameters
    ----------
    base_dir : str
        Parent directory; each language gets the sub-directory ``<base_dir>/<lang>``.
    data : dict
        Mapping ``{lang: {phrase: intent tag}}``.
    config_str : str
        Configuration template; ``{lang}`` is substituted per language.
    langs : iterable of str
        Language codes to build classifiers for (must be keys of `data`).
    overwrite : bool, default False
        Forwarded to every `RasaLangClassifier`.
    verbose : int, default 0
        Forwarded to every `RasaLangClassifier`.
    """

    def __init__(self, base_dir, data, config_str, langs, overwrite=False, verbose=0):
        self.verbose = verbose
        self.base_dir = base_dir
        self.langs = list(langs)
        self.lang_dirs = {lang: os.path.join(self.base_dir, lang) for lang in self.langs}
        self.lang_classifiers = {
            lang: RasaLangClassifier(
                self.lang_dirs[lang],
                data[lang],
                config_str.format(lang=lang),
                overwrite=overwrite,
                verbose=verbose
            )
            for lang in self.langs
        }

    def train(self):
        """Train every per-language classifier."""
        for classifier in self.lang_classifiers.values():
            classifier.train()

    def _detect_language(self, question):
        """Return the language code TextBlob detects for `question`."""
        return TextBlob(question).detect_language()

    def _check_langs(self, det_lang):
        """Raise IndexError if any detected language has no classifier."""
        unknown_langs = sorted(set(det_lang).difference(self.langs))
        if len(unknown_langs) > 0:
            raise IndexError(
                'Unsupported languages detected: {unk}. Available: {langs}.'.format(
                    unk=unknown_langs, langs=self.langs
                )
            )

    def _internal_predict(self, X, func_name, **kwargs):
        """Run `func_name` of the matching per-language classifier for every question.

        Parameters
        ----------
        X : str or iterable of str
            One question or several.
        func_name : str
            Name of the `RasaLangClassifier` method to call.
        **kwargs
            Extra keyword arguments passed to that method.

        Returns
        -------
        pandas.DataFrame
            One row per question in input order (index 0..n-1). Empty input
            returns an empty frame.

        Raises
        ------
        IndexError
            If a detected language is not in `self.langs`.
        """
        questions = [X] if np.isscalar(X) else list(X)
        det_lang = [self._detect_language(question) for question in questions]
        self._check_langs(det_lang)
        if not questions:
            return pd.DataFrame()

        parts = []
        for lang in sorted(set(det_lang)):
            positions = [pos for pos, detected in enumerate(det_lang) if detected == lang]
            # Look the classifier up on this instance, not on a notebook global.
            func = getattr(self.lang_classifiers[lang], func_name)
            out_l = func([questions[pos] for pos in positions], **kwargs).copy()
            # Label each row with the position of its question so that the
            # batches of different languages can be interleaved back into
            # input order below.
            out_l.index = positions
            parts.append(out_l)
        out = pd.concat(parts).sort_index()

        lang_dummies = pd.get_dummies(pd.Series(['lang:' + lang for lang in det_lang]))
        # Always emit one column per configured language (sorted, zero-filled)
        # so the output schema does not depend on which languages happen to
        # occur in this batch; cast to int because newer pandas returns bools.
        lang_columns = sorted('lang:' + lang for lang in self.langs)
        lang_dummies = lang_dummies.reindex(columns=lang_columns, fill_value=0).astype(int)
        return pd.concat([out, lang_dummies], axis=1)

    def predict_proba(self, X):
        """Return per-intent confidences plus language indicator columns."""
        return self._internal_predict(X, 'predict_proba')

    def predict(self, X, min_intent_conf):
        """Return thresholded intent indicators plus language indicator columns."""
        return self._internal_predict(X, 'predict', min_intent_conf=min_intent_conf) # TODO pass extra params
    

#     def score(self, X, y):
    # TODO score() function

In [591]:
# Build one classifier per language found in `data` under ./rasa_full/
# (replacing any earlier run), then train all of them.
cls = RasaClassifier('./rasa_full/', data, config_str, list(data), overwrite=True)
cls.train()

Epochs: 100%|██████████| 300/300 [00:04<00:00, 69.32it/s, loss=0.103, acc=1.000]
100%|██████████| 9/9 [00:00<00:00, 64.35it/s]
  self.MIN_EXAMPLES_PER_INTENT))
Epochs: 100%|██████████| 300/300 [00:03<00:00, 75.39it/s, loss=0.100, acc=1.000]
100%|██████████| 6/6 [00:00<00:00, 47.37it/s]


In [592]:
# Sample questions used to try out the classifier: five English, then three German.
questions = [
    "Can I be shareholder of a limited company?",
    "I want to become shareholder of a limited company.",
    "Can I be shareholder of a limited public corporate?",
    "Can my company be shareholder of a limited company?",
    "Can my company be shareholder of a public corporate?",
    "Kann ich Gesellschafter einer GmbH sein?",
    "Wer kann Gesellschafter einer AG sein?",
    "Kann ich mit meiner GmbH Mehrheitseigner einer AG sein?",
]

In [593]:
# Display the per-intent confidences and the detected-language indicator columns for the sample questions.
cls.predict_proba(questions)

Unnamed: 0,co:ltd,co:plc,sh,lang:de,lang:en
0,0.06893,0.178491,0.760199,0,1
1,0.0,0.713084,0.534806,0,1
2,0.0,0.143489,0.912268,0,1
3,0.7724,0.0,0.450179,0,1
4,0.759448,0.0,0.456259,0,1
5,0.22356,0.390491,0.633656,1,0
6,0.885208,0.0,0.247214,1,0
7,0.151186,0.674754,0.450892,1,0
