In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
from rasa_classifier import RasaClassifier

In [4]:
# The questions were scraped from Google's "People Also Ask" section with the Scraper Chrome extension.
# See https://chrome.google.com/webstore/detail/scraper/mbigbapnjcgaffohmbkdlecaccepngjd

# One row per question: the text is in the 'Question' column, the remaining columns are the targets.
Xy = pd.read_csv('co_sh_questions.csv')

In [5]:
# Sanity check: (rows, columns) of the loaded data.
Xy.shape

(239, 6)

In [6]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
# Features are the raw question strings, targets are every other column of the frame.
X_train, X_test, y_train, y_test = train_test_split(
    Xy['Question'],
    Xy.drop(columns='Question'),
    test_size=0.2,
    random_state=27,
)

In [7]:
# Base pipeline template (YAML). It always contains one default CountVectorsFeaturizer;
# {char} and {word} are placeholders that are filled either with an extra featurizer
# entry (see the two templates below) or with an empty string to leave it out.
config = '''
pipeline:
- name: "WhitespaceTokenizer"
- name: "RegexFeaturizer"
- name: "CRFEntityExtractor"
- name: "EntitySynonymMapper"
- name: "CountVectorsFeaturizer"
{char} 
{word}
- name: "EmbeddingIntentClassifier"
  intent_tokenization_flag: true
  intent_split_symbol: "+"
'''
# Optional extra featurizer with the "char_wb" analyzer; {max} sets max_ngram.
char_featurizer = '''
- name: "CountVectorsFeaturizer"
  analyzer: "char_wb"
  min_ngram: 1
  max_ngram: {max}
'''
# Optional extra featurizer with the "word" analyzer; {max} sets max_ngram.
word_featurizer = '''
- name: "CountVectorsFeaturizer"
  analyzer: "word"
  min_ngram: 1
  max_ngram: {max}
'''

In [8]:
%%capture

# Candidate pipelines for the grid search: the base config on its own, plus every combination
# of the optional char-level (max n-gram 4 or 6) and word-level (max n-gram 3) featurizers.
# NOTE: if this pipeline is kept, the classifier parameters could represent the pipeline steps
# directly instead of one YAML string, which would make cross-validation much simpler.
params = {
    'config_str': [
        config.format(
            char=char_featurizer.format(max=char_max) if char_max else '',
            word=word_featurizer.format(max=word_max) if word_max else '',
        )
        # (char max n-gram, word max n-gram); None leaves that optional featurizer out.
        # Each combination appears exactly once: the word-only candidate was previously
        # listed twice, which only added redundant cross-validation fits.
        for char_max, word_max in [
            (None, None),
            (4, None),
            (6, None),
            (4, 3),
            (6, 3),
            (None, 3),
        ]
    ]
}

# 5-fold cross-validated search over the candidate configs, using two parallel workers.
# fit() returns the fitted search object itself, so construction and fitting can be chained.
cvm = GridSearchCV(
    estimator=RasaClassifier(),
    param_grid=params,
    cv=5,
    n_jobs=2,
).fit(X_train, y_train)

In [9]:
# Pipeline configuration selected by the cross-validated search.
cvm.best_params_

{'config_str': '\npipeline:\n- name: "WhitespaceTokenizer"\n- name: "RegexFeaturizer"\n- name: "CRFEntityExtractor"\n- name: "EntitySynonymMapper"\n- name: "CountVectorsFeaturizer"\n \n\n- name: "CountVectorsFeaturizer"\n  analyzer: "word"\n  min_ngram: 1\n  max_ngram: 3\n\n- name: "EmbeddingIntentClassifier"\n  intent_tokenization_flag: true\n  intent_split_symbol: "+"\n'}

In [10]:
# Evaluate the refit best estimator on the held-out test split.
# No `scoring` was passed to GridSearchCV, so this delegates to RasaClassifier's own score method.
cvm.score(X_test, y_test)




0.9282738095238096

In [11]:
# Preview the first three test rows of predict_conf from the best estimator
# (presumably one confidence per intent combination — confirm in rasa_classifier).
cvm.best_estimator_.predict_conf(X_test).head(3)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,lang:de,lang:en,co:ltd,co:ltd+co:plc,co:ltd+co:plc+sh,co:ltd+sh,co:plc,co:plc+sh,sh
13,0,1,0.619702,0.129281,0.064542,0.524644,0.0,0.0,0.0
14,0,1,0.0,0.249244,0.0,0.0,0.718785,0.512952,0.364036
15,0,1,0.871631,0.318964,0.0,0.02985,0.0,0.0,0.0


In [12]:
# Preview the predicted labels for the first three test rows.
cvm.predict(X_test).head(3)




Unnamed: 0,lang:de,lang:en,co:ltd,sh,co:plc
13,0,1,1.0,0.0,0.0
14,0,1,0.0,0.0,1.0
15,0,1,1.0,0.0,0.0
