# Importing Libraries

In [1]:
import json
import yaml
import os
import rasa
import nest_asyncio
import asyncio
import random
import pandas as pd

nest_asyncio.apply()

# Loading the dataset

In [2]:
def load_data(dataset_dir, load_dataset_path, no_intents=None, entities=True):
    """Convert a directory of per-intent JSON files into one Rasa NLU YAML file.

    Every ``<intent>.json`` file in ``dataset_dir`` is expected to hold a
    top-level key equal to the intent name (the file name without ``.json``)
    whose value is a list of examples. Each example carries a ``"data"`` list
    of chunks; a chunk has a ``"text"`` field and optionally an ``"entity"``
    field.

    Args:
        dataset_dir: Directory containing the per-intent JSON files. Entries
            that do not end in ``.json`` are ignored.
        load_dataset_path: Path of the YAML file to write (overwritten).
        no_intents: Number of examples to sample per intent. ``None`` keeps
            every example of every intent (still in shuffled order). A value
            larger than the number of available examples is clamped to that
            number instead of raising.
        entities: When True, chunks that carry an ``"entity"`` are written in
            the ``[text](entity)`` annotation form; otherwise plain text only.

    Returns:
        None. The result is the file written at ``load_dataset_path``.
    """
    rasa_version = "3.1"
    json_suffix = ".json"

    # Sorted so the intent order in the output does not depend on the
    # filesystem's directory listing order.
    intent_files = sorted(
        name for name in os.listdir(dataset_dir) if name.endswith(json_suffix)
    )

    with open(load_dataset_path, 'w') as yaml_file:
        yaml_file.write(f"version: {rasa_version}")
        yaml_file.write("\n\n")
        yaml_file.write("nlu:\n")

        for intent_file_name in intent_files:
            intent_name = intent_file_name[:-len(json_suffix)]
            intent_path = os.path.join(dataset_dir, intent_file_name)
            with open(intent_path) as json_file:
                intent_examples = json.load(json_file)[intent_name]

            # Compute the sample size per intent. The caller's ``no_intents``
            # must not be overwritten: doing so made the size of the first
            # intent leak into all following intents.
            if no_intents is None:
                sample_size = len(intent_examples)
            else:
                sample_size = min(no_intents, len(intent_examples))
            sampled_examples = random.sample(intent_examples, sample_size)

            yaml_file.write(f"- intent: {intent_name}\n")
            yaml_file.write("  examples: |\n")
            for data in sampled_examples:
                pieces = []
                for text in data["data"]:
                    word = text["text"]
                    if entities and "entity" in text:
                        word = f"[{word}]({text['entity']})"
                    pieces.append(word)
                yaml_file.write(f"    - {''.join(pieces)}\n")
            yaml_file.write("\n")

In [3]:
# Directories holding the per-intent JSON files that load_data() converts to YAML.
train_dataset_dir = "./dataset/Train/"
val_dataset_dir = "./dataset/Validate/"

In [11]:
# load_data() picks and orders examples with random.sample(), so the sampler is
# reseeded before every export. This makes the files reproducible across runs
# and, because both validation exports start from the same RNG state, lets the
# entity and no-entity validation files list their examples in the same order.
SAMPLING_SEED = 42

# Training subsets: one file per per-intent sample size.
for sample_size in (20, 100, 200):
    random.seed(SAMPLING_SEED)
    load_data(
        train_dataset_dir,
        f"./model_data/train_{sample_size}_data.yml",
        no_intents=sample_size,
        entities=True,
    )

# Validation set, exported once with entity annotations and once as plain text.
for val_path, with_entities in (
    ("./model_data/val_data_entities.yml", True),
    ("./model_data/val_data_no_entities.yml", False),
):
    random.seed(SAMPLING_SEED)
    load_data(val_dataset_dir, val_path, entities=with_entities)

In [4]:
# NOTE: this cell repeats three exports from the cell above and overwrites
# those files. load_data() orders examples with random.sample(), so the sampler
# is reseeded before each call: re-running yields identical files, and the two
# validation files keep their examples in the same order.
SAMPLING_SEED = 42

random.seed(SAMPLING_SEED)
load_data(train_dataset_dir, "./model_data/train_20_data.yml", no_intents=20, entities=True)
random.seed(SAMPLING_SEED)
load_data(val_dataset_dir, "./model_data/val_data_entities.yml", entities=True)
random.seed(SAMPLING_SEED)
load_data(val_dataset_dir, "./model_data/val_data_no_entities.yml", entities=False)

# Training model

In [24]:
from rasa import model_training

In [27]:
# Inputs and output location for the NLU training run.
nlu_config_path = "./configs/config-origin.yml"
nlu_train_data_path = "./model_data/train_20_data.yml"
model_output_dir = "./models/"

# Train an NLU-only model; the call's return value is shown as the cell output.
rasa.model_training.train_nlu(nlu_config_path, nlu_train_data_path, model_output_dir)

  More info at https://rasa.com/docs/rasa/components#countvectorsfeaturizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Your Rasa model is trained and saved at 'models\nlu-20221220-110132-fast-regression.tar.gz'.


'models\\nlu-20221220-110132-fast-regression.tar.gz'

# Evaluating the models

In [28]:
from rasa.core.agent import Agent

# Base name of the model archive to evaluate; also used to name the CSV logs
# written by the evaluation cells.
model_name = "spacy-nlp"

# NOTE(review): the archive is expected at ./models/<model_name>.gz, which does
# not match the timestamped name printed by the training cell — confirm the
# file was renamed/copied by hand.
model_path = f"./models/{model_name}.gz"
# Load the model into a Rasa Agent; get_prediction() uses this global.
nlu_agent = Agent.load(model_path=model_path)

def get_prediction(query):
    """Parse ``query`` with the globally loaded ``nlu_agent``.

    ``Agent.parse_message`` is awaited to completion via ``asyncio.run`` so the
    helper can be called like a regular synchronous function.

    Args:
        query: The user utterance to parse.

    Returns:
        Whatever ``nlu_agent.parse_message`` resolves to for this utterance.
    """
    parse_coroutine = nlu_agent.parse_message(message_data=query)
    return asyncio.run(parse_coroutine)

In [23]:
# Leftover inspection cell: evaluates to the CRF entity extractor module object,
# which is displayed as the cell output. It has no effect on the analysis.
rasa.nlu.extractors.crf_entity_extractor

<module 'rasa.nlu.extractors.crf_entity_extractor' from 'c:\\Darshan\\contextual-chatbot\\env\\lib\\site-packages\\rasa\\nlu\\extractors\\crf_entity_extractor.py'>

In [33]:
# Spot-check the loaded model on one utterance; the parse result is the cell output.
get_prediction("add the artist Pete Murray to my relaxing playlist")

{'text': 'add the artist Pete Murray to my relaxing playlist',
 'intent': {'name': 'AddToPlaylist', 'confidence': 0.997940942789076},
 'entities': [{'entity': 'music_item',
   'start': 8,
   'end': 14,
   'confidence_entity': 0.7597190824015085,
   'value': 'artist',
   'extractor': 'CRFEntityExtractor'},
  {'entity': 'playlist_owner',
   'start': 30,
   'end': 32,
   'confidence_entity': 0.8044529980000107,
   'value': 'my',
   'extractor': 'CRFEntityExtractor'}],
 'text_tokens': [(0, 3),
  (4, 7),
  (8, 14),
  (15, 19),
  (20, 26),
  (27, 29),
  (30, 32),
  (33, 41),
  (42, 50)],
 'intent_ranking': [{'name': 'AddToPlaylist', 'confidence': 0.997940942789076},
  {'name': 'SearchCreativeWork', 'confidence': 0.0009855166099354347},
  {'name': 'PlayMusic', 'confidence': 0.0008531223411077812},
  {'name': 'SearchScreeningEvent', 'confidence': 9.37053682549645e-05},
  {'name': 'RateBook', 'confidence': 6.285805722360564e-05}]}

In [5]:
import rasa.shared.nlu.training_data.loading as nlu_loading

# Validation examples as plain dicts, read from the export without entity markup.
val_data = [
    message.as_dict()
    for message in nlu_loading.load_data("./model_data/val_data_no_entities.yml").intent_examples
]

# Validation examples from the export that carries [text](entity) annotations.
# NOTE(review): `entity_examples` presumably contains only messages that have
# entity annotations, so this list may be shorter than `val_data` — confirm
# before pairing the two lists by index.
val_data_entity = [
    message.as_dict()
    for message in nlu_loading.load_data("./model_data/val_data_entities.yml").entity_examples
]

In [32]:
columns = ["Query", "Expected Entity", "Predicted Entity", "Confidence"]

# One record per entity start offset found in either the prediction or the
# annotation of a query. Records are collected in a list and turned into a
# DataFrame once, instead of growing the frame with pd.concat on every row.
entity_rows = []

for example in val_data_entity:
    # Take the query and its expected entities from the SAME example. The
    # entity and no-entity validation files are written by separate
    # load_data() calls that each shuffle with random.sample(), so pairing
    # val_data[i] with val_data_entity[i] compares unrelated examples.
    query = example["text"]
    expected_entities = example.get("entities", [])
    predicted_entities = get_prediction(query)["entities"]

    # Keyed by character start offset so a prediction and an annotation that
    # begin at the same position end up in the same record.
    token_point = {}
    for pred_entity in predicted_entities:
        token_point[pred_entity["start"]] = {
            "Expected Entity": "",
            "Predicted Entity": pred_entity["entity"],
            "Confidence": pred_entity["confidence_entity"],
        }
    for expec_entity in expected_entities:
        # An annotated entity the model missed gets an empty prediction.
        slot = token_point.setdefault(
            expec_entity["start"],
            {"Predicted Entity": "", "Confidence": 0},
        )
        slot["Expected Entity"] = expec_entity["entity"]

    for token in token_point.values():
        entity_rows.append({
            "Query": query,
            "Expected Entity": token["Expected Entity"],
            "Predicted Entity": token["Predicted Entity"],
            "Confidence": token["Confidence"],
        })

result = pd.DataFrame(entity_rows, columns=columns)
result.to_csv(f"./logs/entity-{model_name}.csv", index=False)

In [10]:
# Spot-check a second utterance; the parse result is the cell output.
get_prediction("play some music by imagine dragons")

{'text': 'play some music by imagine dragons',
 'intent': {'name': 'PlayMusic', 'confidence': 0.9769954244727165},
 'entities': [],
 'text_tokens': [(0, 4), (5, 9), (10, 15), (16, 18), (19, 26), (27, 34)],
 'intent_ranking': [{'name': 'PlayMusic', 'confidence': 0.9769954244727165},
  {'name': 'SearchScreeningEvent', 'confidence': 0.008738545910329966},
  {'name': 'SearchCreativeWork', 'confidence': 0.0072978377319007975},
  {'name': 'AddToPlaylist', 'confidence': 0.004068762565745417},
  {'name': 'RateBook', 'confidence': 0.0013848580976571585}]}

In [30]:
# Fixed columns followed by one confidence column per intent; intent names are
# the training file names with their 5-character suffix removed.
columns = ["Query", "Expected Intent", "Predicted Intent"]+[intent[:-5] for intent in os.listdir("./dataset/Train/")]

# Number of validation queries whose top predicted intent equals the expected one.
true_positive = 0

# Records are collected in a list and converted once; concatenating a
# one-row DataFrame per query is quadratic in the number of queries.
intent_rows = []

for data in val_data:
    query = data["text"]
    expected_intent = data["intent"]
    pred = get_prediction(query)
    predicted_intent = pred["intent"]["name"]

    row = {
        "Query": query,
        "Expected Intent": expected_intent,
        "Predicted Intent": predicted_intent,
    }
    if predicted_intent == expected_intent:
        true_positive += 1
    # Confidence of every ranked intent goes into the column named after it.
    for pred_intent in pred["intent_ranking"]:
        row[pred_intent["name"]] = pred_intent["confidence"]
    intent_rows.append(row)

result = pd.DataFrame(intent_rows)
# Keep the declared column order; ranked intents that have no training file
# (if any) are appended after the declared columns rather than dropped.
extra_columns = [name for name in result.columns if name not in columns]
result = result.reindex(columns=columns + extra_columns)

result.to_csv(f"./logs/intent-{model_name}.csv")

In [31]:
# Intent accuracy: share of validation queries whose predicted intent matched
# the expected intent (counted by the intent evaluation cell).
print(true_positive/len(val_data))

0.8852223816355811
