In [None]:
import numpy as np
import pandas as pd

In [43]:
import os
import shutil
import logging
import wget

In [44]:
import fasttext

In [46]:
from rasa_nlu.training_data import load_data
from rasa_nlu import config
from rasa_nlu.model import Trainer
from rasa_nlu.model import Interpreter
from rasa_nlu.test import run_evaluation

In [45]:
import sklearn.metrics
from sklearn.model_selection import train_test_split

In [47]:
class RasaLangClassifier:
    """Intent classifier for a single language, backed by a Rasa NLU pipeline.

    All artefacts (training data, pipeline config, log file, persisted model)
    live under ``base_dir``, which is deleted and recreated on construction.

    Labels are multi-hot: each row of ``y`` may activate several tag columns.
    Active tags are joined with ``'+'`` into one composite intent name for
    training and split on ``'+'`` again when predicting.
    """

    def __init__(self, config_str, base_dir=None, verbose=0):
        """Prepare a clean working directory and write the pipeline config.

        Args:
            config_str: Rasa NLU pipeline configuration (YAML text).
            base_dir: Working directory. Any existing content is removed.
                When ``None`` a fresh temporary directory is used.
            verbose: Print progress information when greater than 0.
        """
        if base_dir is None:
            # Local import: only needed for this fallback.
            import tempfile
            base_dir = tempfile.mkdtemp(prefix='rasa_nlu_')

        self.verbose = verbose
        self.base_dir = base_dir
        self.data_dir = os.path.join(self.base_dir, 'data')
        self.models_dir = os.path.join(self.base_dir, 'models')
        self.project_name = 'current'
        self.model_name = 'nlu'
        self.last_model_dir = os.path.join(self.models_dir, self.project_name, self.model_name)
        self.log_file = os.path.join(self.base_dir, 'nlu_model.log')
        self.nlu_file = os.path.join(self.data_dir, 'nlu.md')
        self.config_file = os.path.join(self.base_dir, 'config.yml')

        # Build the directory tree BEFORE configuring logging: the log file
        # lives inside base_dir, so opening it first fails when the directory
        # does not exist yet, and the rmtree below would delete it anyway.
        if os.path.exists(self.base_dir):
            shutil.rmtree(self.base_dir)
        for d in [self.data_dir, self.last_model_dir]:
            os.makedirs(d)
        if self.verbose > 0:
            print("Successfully created base directory structure {}".format(base_dir))

        # NOTE: basicConfig is a no-op when the root logger already has
        # handlers, so only the first classifier created in a process
        # actually gets its own log file.
        logging.basicConfig(filename=self.log_file, level=logging.INFO)

        # write configuration to config file
        with open(self.config_file, "w") as text_file:
            print(config_str, file=text_file)

    @staticmethod
    def _to_nlu_markdown(X, y):
        """Render questions and multi-hot labels as Rasa NLU markdown.

        Args:
            X: pandas Series of question texts.
            y: DataFrame of tag columns sharing the index of ``X``; a tag is
                active for a row when its value is > 0.

        Returns:
            Tuple ``(markdown, tags)``: the markdown text (one
            ``## intent:<name>`` section per composite intent) and the array
            of unique composite intent names.
        """
        df_data = pd.DataFrame({'text': X})
        df_data['intent'] = y.apply(
            lambda row: '+'.join(row.index[(row > 0).values]), axis=1
        )
        sections = [
            '## intent:' + intent + '\n' + '\n'.join('- ' + t for t in texts)
            for intent, texts in df_data.groupby('intent')['text']
        ]
        return '\n\n'.join(sections), df_data['intent'].unique()

    def train(self, X, y):
        """Train and persist the Rasa NLU model, then load it for inference.

        Args:
            X: pandas Series of question texts.
            y: DataFrame of multi-hot tag columns aligned with ``X``.
        """
        # format data into Rasa NLU markdown file format
        intents, self.tags = self._to_nlu_markdown(X, y)
        with open(self.nlu_file, "w") as text_file:
            print(intents, file=text_file)

        training_data = load_data(self.nlu_file)
        trainer = Trainer(config.load(self.config_file))
        trainer.train(training_data)
        trainer.persist(
            self.models_dir,
            project_name=self.project_name,
            fixed_model_name=self.model_name
        )

        self.interpreter = Interpreter.load(self.last_model_dir)

    def _predict_conf_single(self, question):
        """Return a Series of confidences indexed by ``self.tags``."""
        try:
            interpreter = self.interpreter
        except AttributeError as error:
            raise AttributeError('The model needs to be trained first.') from error

        intent_ranking = interpreter.parse(question)['intent_ranking']
        # Build the result by label instead of assigning one Series into
        # another: Series-to-Series assignment aligns on the index in recent
        # pandas versions and would produce NaN here.
        confidences = pd.Series(
            {entry['name']: entry['confidence'] for entry in intent_ranking},
            dtype=float,
        )
        # Intents missing from the ranking get confidence 0.0; names that
        # were not seen during training are ignored.
        return confidences.reindex(self.tags, fill_value=0.0)

    def predict_conf(self, X):
        """Return per-intent confidences for one or several questions.

        Args:
            X: A single question string or an iterable of question strings.

        Returns:
            DataFrame with one row per question and one column per composite
            intent seen during training.

        Raises:
            AttributeError: If ``train`` has not been called yet.
        """
        X_ = [X] if np.isscalar(X) else X
        return pd.DataFrame([self._predict_conf_single(Xi) for Xi in X_])

    def predict(self, X):
        """Return multi-hot tag predictions for one or several questions.

        The composite intent with the highest confidence is selected and
        split on ``'+'``; every resulting tag is set to 1.0, all others 0.0.

        Args:
            X: A single question string or an iterable of question strings.

        Returns:
            DataFrame with one row per question and one column per individual
            tag, columns in sorted order.
        """
        X_ = [X] if np.isscalar(X) else X
        # Computed once per call; sorting keeps the column order stable
        # across runs (plain set iteration order is not).
        indiv_tags = sorted({
            item
            for tag in self.tags
            for item in tag.split('+')
        })
        rows = []
        for _, conf in self.predict_conf(X_).iterrows():
            s = pd.Series(0.0, index=indiv_tags)
            s[conf.idxmax().split('+')] = 1
            rows.append(s)
        return pd.DataFrame(rows)

In [48]:
class RasaClassifier:
    """Multi-language classifier that routes questions to per-language models.

    Training expects label columns named ``lang:<code>`` that mark the
    language of each question; one ``RasaLangClassifier`` is trained per
    language on the remaining (tag) columns. At prediction time the language
    of each question is detected with a fastText language-identification
    model and the question is passed to the matching per-language classifier.
    """

    LANG_MODEL_FILE = 'lid.176.ftz'
    LANG_MODEL_URL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'

    def __init__(self, config_str, base_dir=None, verbose=0):
        """Store the configuration; no model is built until ``train``.

        Args:
            config_str: Rasa NLU pipeline config containing a ``{lang}``
                placeholder that is filled in per language.
            base_dir: Directory that receives one sub-folder per language.
                When ``None`` a fresh temporary directory is used.
            verbose: Verbosity level forwarded to the per-language classifiers.
        """
        if base_dir is None:
            # Local import: only needed for this fallback.
            import tempfile
            base_dir = tempfile.mkdtemp(prefix='rasa_')
        self.config_str = config_str
        self.base_dir = base_dir
        self.verbose = verbose
        self._lang_model = None

    @staticmethod
    def _tag_columns(y):
        """Return ``y`` without the ``lang:*`` indicator columns.

        A negative lookahead is required here: ``'^[^lang:]'`` is a character
        class and would also drop every column whose name merely starts with
        one of the characters ``l``, ``a``, ``n``, ``g`` or ``:``.
        """
        return y.filter(regex='^(?!lang:)')

    def train(self, X, y):
        """Train one classifier per language found in the ``lang:*`` columns.

        Args:
            X: pandas Series of question texts.
            y: DataFrame aligned with ``X`` holding ``lang:<code>`` indicator
                columns plus multi-hot tag columns.
        """
        self.langs = [col[len('lang:'):] for col in y.filter(regex='^lang:').columns]
        self.lang_dirs = {lang: os.path.join(self.base_dir, lang) for lang in self.langs}
        self.lang_classifiers = {}
        for lang in self.langs:
            self.lang_classifiers[lang] = RasaLangClassifier(
                self.config_str.format(lang=lang),
                self.lang_dirs[lang],
                verbose=self.verbose
            )
        y_tags = self._tag_columns(y)
        for lang, classifier in self.lang_classifiers.items():
            in_lang = y['lang:' + lang] > 0
            classifier.train(X[in_lang], y_tags[in_lang])

    def _load_lang_model(self):
        """Return the fastText language-id model, loading it at most once."""
        model = getattr(self, '_lang_model', None)
        if model is None:
            # NOTE this would be part of the prediction package in final implementation
            if not os.path.exists(self.LANG_MODEL_FILE):
                wget.download(self.LANG_MODEL_URL, out=self.LANG_MODEL_FILE)
            model = fasttext.load_model(self.LANG_MODEL_FILE)
            self._lang_model = model
        return model

    def _check_langs(self, det_lang):
        """Raise ``IndexError`` if a detected language has no trained model."""
        unknown_langs = sorted(str(lang) for lang in set(det_lang).difference(self.langs))
        if len(unknown_langs) > 0:
            raise IndexError(
                'Unsupported languages detected: {unk}. Available: {langs}.'.format(
                    unk=unknown_langs, langs=self.langs
                )
            )

    def _internal_predict(self, X, func_name, **kwargs):
        """Detect languages, dispatch to per-language models, merge results.

        Args:
            X: A single question string or an iterable of question strings.
            func_name: Name of the ``RasaLangClassifier`` method to call
                (``'predict'`` or ``'predict_conf'``).
            **kwargs: Forwarded to that method.

        Returns:
            DataFrame with one row per input question (in input order,
            index ``0..n-1``): ``lang:<code>`` indicator columns followed by
            the columns returned by the per-language method. Columns unknown
            to a language's model are filled with 0.

        Raises:
            IndexError: If a detected language has no trained classifier.
        """
        questions = np.array([X] if np.isscalar(X) else X)
        if len(questions) == 0:
            return pd.DataFrame()

        lang_model = self._load_lang_model()
        # fastText predicts one line at a time and rejects embedded newlines.
        det_lang = np.array([
            lang_model.predict(q.replace('\n', ' '))[0][0].replace('__label__', '')
            for q in questions
        ])
        self._check_langs(det_lang)

        # Process the questions grouped by language (sorted), remembering the
        # original position of every row so the input order can be restored.
        parts, positions = [], []
        for lang in np.unique(det_lang):
            in_lang = det_lang == lang
            func = getattr(self.lang_classifiers[lang], func_name)
            parts.append(func(questions[in_lang], **kwargs))
            positions.append(np.flatnonzero(in_lang))
        order = np.concatenate(positions)

        out = pd.concat(parts, ignore_index=True, sort=False).fillna(0.0)
        lang_dummies = pd.get_dummies(
            pd.Series(['lang:' + lang for lang in det_lang[order]]),
            dtype='uint8',
        )
        merged = pd.concat([lang_dummies, out], axis=1)
        return merged.iloc[np.argsort(order)].reset_index(drop=True)

    def predict_conf(self, X):
        """Return language indicators plus per-intent confidences."""
        return self._internal_predict(X, 'predict_conf')

    def predict(self, X):
        """Return language indicators plus multi-hot tag predictions."""
        return self._internal_predict(X, 'predict')

    def score(self, X, y, func, **kwargs):
        """Score predictions for ``X`` against ``y`` with a metric function.

        Args:
            X: Questions to predict.
            y: DataFrame of true labels; its columns select and order the
                predicted columns.
            func: Metric called as ``func(y_true, y_pred, **kwargs)`` on
                float arrays.
            **kwargs: Extra arguments for ``func``.

        Returns:
            Whatever ``func`` returns.
        """
        y_hat = self._internal_predict(X, 'predict')
        # Labels in y that were never predicted count as "not predicted" (0)
        # rather than raising a KeyError.
        y_hat = y_hat.reindex(columns=y.columns, fill_value=0)
        return func(
            np.array(y, dtype='float'),
            np.array(y_hat, dtype='float'),
            **kwargs
        )

In [49]:
# Rasa NLU pipeline template. RasaClassifier.train fills the `{lang}`
# placeholder via str.format, so any literal curly brace added to this text
# must be doubled. `intent_split_symbol: "+"` matches the '+' that
# RasaLangClassifier uses to join multi-label intents into one intent name.
# NOTE(review): "CountVectorsFeaturizer" appears three times, the first with
# no explicit settings -- confirm this is intended.
config_str = '''
language: {lang}

pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "CountVectorsFeaturizer"
- name: "CountVectorsFeaturizer"
  analyzer: "char_wb"
  min_ngram: 1
  max_ngram: 6
- name: "CountVectorsFeaturizer"
  analyzer: "word"
  min_ngram: 1
  max_ngram: 3
- name: "EmbeddingIntentClassifier"
  intent_tokenization_flag: true
  intent_split_symbol: "+"
'''

In [51]:
# Load the labelled questions. The cells below use the 'Question' column as
# the text and every other column as a label; RasaClassifier.train looks for
# 'lang:<code>' columns among those labels.
Xy = pd.read_csv('co_sh_questions.csv')

In [53]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    Xy['Question'],
    Xy.drop(columns='Question'),
    test_size=0.2,
    random_state=27,
)

In [71]:
# Tip: put `%%capture` on the first line of this cell to silence the
# training output.
# Each per-language folder under ./rasa_full/ is deleted and rebuilt here.
cls = RasaClassifier(config_str=config_str, base_dir='./rasa_full/')
cls.train(X=X_train, y=y_train)

  self.MIN_EXAMPLES_PER_INTENT))
Epochs: 100%|██████████| 300/300 [00:13<00:00, 21.81it/s, loss=0.330, acc=1.000]
  self.MIN_EXAMPLES_PER_INTENT))
Epochs: 100%|██████████| 300/300 [00:12<00:00, 27.91it/s, loss=0.211, acc=1.000]


In [68]:
# Sample-averaged recall of the predicted labels on the held-out questions.
cls.score(X_test, y_test, func=sklearn.metrics.recall_score, average='samples')




0.94

In [56]:
# Show the first few test questions next to their predicted labels; both
# sides get a fresh 0..n-1 index so they line up row by row.
pd.concat(
    [X_test.reset_index(drop=True), cls.predict(X_test).reset_index(drop=True)],
    axis=1,
).head()




Unnamed: 0,Question,lang:de,lang:en,Unnamed: 4,co:ltd,sh,co:plc
0,How many shareholders does a public company have?,0,1,0.0,0.0,0.0,1.0
1,Can I give shares to my son?,0,1,0.0,0.0,1.0,0.0
2,What are some examples of private limited comp...,0,1,0.0,1.0,0.0,0.0
3,At what age can you own shares?,0,1,0.0,0.0,1.0,0.0
4,Wie hoch ist das Stammkapital einer GmbH?,1,0,0.0,0.0,1.0,0.0
