In [1]:
#! pip3 install --user -U textblob

In [2]:
import json  # used to load our chat-bot intents file
import os
import random

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer  # for english
from nltk.stem.snowball import FrenchStemmer  # for french
from spellchecker import SpellChecker
from textblob import TextBlob  # to detect the language used => 'fr' or 'en'

In [3]:
def detect_language(sentence):
    """Return the language code detected for ``sentence``.

    Inputs of three characters or fewer are not sent to the detector and are
    reported as ``"en"``. Longer inputs are wrapped in a ``TextBlob`` and the
    result of its ``detect_language()`` call is returned unchanged.

    NOTE(review): ``TextBlob.detect_language`` is believed to be deprecated /
    removed in recent TextBlob releases -- confirm against the installed version.
    """
    # Guard clause: too little text to detect anything, default to English.
    if len(sentence) <= 3:
        return "en"
    blob = TextBlob(sentence)
    return blob.detect_language()

In [4]:
def spell_correction(tokenized_sentence, spell=None):
    """Replace misspelled tokens with their suggested correction, in place.

    Parameters
    ----------
    tokenized_sentence : list of str
        Tokens to check. The list is modified in place.
    spell : object, optional
        Spell checker exposing ``unknown(words)`` and ``correction(word)``.
        When omitted a new ``SpellChecker()`` is created. Passing a shared
        instance avoids rebuilding the checker on every call and allows a
        checker configured for another language to be used.

    Returns
    -------
    list of str
        The same list object that was passed in, with every token reported
        as unknown replaced by its correction. A token is left untouched when
        the checker has no suggestion (``correction`` returns ``None``).
    """
    # Nothing to check: skip building a checker altogether.
    if not tokenized_sentence:
        return tokenized_sentence

    if spell is None:
        spell = SpellChecker()

    # NOTE(review): pyspellchecker appears to lower-case the words it returns
    # from ``unknown``, so capitalised tokens never match below -- confirm
    # whether that is the intended behaviour.
    misspelled = spell.unknown(tokenized_sentence)

    # Walk by position instead of list.index(): no rescans of the list and no
    # dependence on which duplicate happens to be found first.
    for position, token in enumerate(tokenized_sentence):
        if token not in misspelled:
            continue
        suggestion = spell.correction(token)
        # Keep the original token rather than inserting None, which would
        # break later string operations such as lower-casing/stemming.
        if suggestion is not None:
            tokenized_sentence[position] = suggestion
    return tokenized_sentence

In [5]:
def tokenization(sentence):
    """Split ``sentence`` into tokens using ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(sentence)
    return tokens

In [6]:
def stopwords_list():
    """Return the French and English NLTK stop-words plus a few punctuation marks.

    Each language's list is de-duplicated on its own; the result is the French
    entries, followed by the English entries, followed by the punctuation
    tokens ``? ! . ; ,``.
    """
    french_words = list(set(stopwords.words('french')))
    english_words = list(set(stopwords.words('english')))
    punctuation = ["?", "!", ".", ";", ","]
    return french_words + english_words + punctuation

In [7]:
def stemming(sentence):
    """Lower-case ``sentence`` and return the result of Lancaster stemming it.

    The argument is passed to the stemmer as a single string, so callers in
    this notebook invoke it once per token.
    """
    # Language-aware stemming is currently disabled. The previous approach
    # picked the stemmer from the detected language:
    #     lang = detect_language(sentence)
    #     stemmer = LancasterStemmer() if lang == "en" else FrenchStemmer()
    lowered = sentence.lower()
    return LancasterStemmer().stem(lowered)

In [8]:
def get_data(file):
    """Load and return the JSON document stored at ``file``.

    Parameters
    ----------
    file : str
        Path of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON content (whatever ``json.load`` produces).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Read as UTF-8 explicitly: the platform default encoding may differ and
    # would garble accented (e.g. French) text in the patterns.
    with open(file, encoding="utf-8") as json_data:
        return json.load(json_data)

In [19]:
def pre_processing(file):
    """Build the vocabulary, labels and tokenized corpus from a patterns file.

    The JSON file is expected to hold, under a top-level key equal to the file
    name up to its first dot (``"intents.json"`` -> ``"intents"``), a list of
    rows that each carry a ``'tag'`` and a list of ``'patterns'``.

    Parameters
    ----------
    file : str
        Path to the JSON file. Directory components are ignored when deriving
        the top-level key, so ``"data/intents.json"`` also maps to
        ``"intents"``.

    Returns
    -------
    dict
        ``"words"``: sorted, de-duplicated stems of every token that is not a
        stop-word (compared case-insensitively);
        ``"classes"``: sorted, de-duplicated tags;
        ``"documents"``: list of ``(tokens, tag)`` pairs, one per pattern, with
        tokens spell-corrected but neither stemmed nor stop-word filtered.
    """
    data = get_data(file)
    # A set gives O(1) membership tests; the distinct name avoids shadowing
    # the imported ``stopwords`` corpus module.
    stopword_set = set(stopwords_list())

    # Derive the key from the base name only, so a path such as
    # "./intents.json" or "data/intents.json" does not produce a wrong key.
    root_key = os.path.basename(file).split(".")[0]

    words = []       # every token seen, before filtering/stemming
    classes = set()  # labels (intent or entity)
    documents = []   # (tokens, label) for each pattern

    for row in data[root_key]:  # intent or entity
        tag = row['tag']
        for pattern in row['patterns']:
            # Tokenize, then spell-correct the tokens.
            tokens = spell_correction(tokenization(pattern))
            words.extend(tokens)
            documents.append((tokens, tag))
            classes.add(tag)

    # Stop-words are lower-case, so compare the lower-cased token; otherwise
    # capitalised stop-words ("How", "What", ...) leak into the vocabulary.
    vocabulary = {stemming(w) for w in words if w.lower() not in stopword_set}

    return {
        "words": sorted(vocabulary),
        "classes": sorted(classes),
        "documents": documents,
    }

In [20]:
# Run the preprocessing on the intents file; as the cell's last expression,
# the returned dict (classes, documents, words) is displayed below.
pre_processing("intents.json")

{'classes': ['effectiveDate',
  'goodbye',
  'greeting',
  'hours',
  'mopeds',
  'opentoday',
  'payments',
  'rental',
  'salary',
  'sex',
  'thanks',
  'today',
  'translation',
  'weather'],
 'documents': [(['Hi'], 'greeting'),
  (['How', 'are', 'you'], 'greeting'),
  (['Is', 'anyone', 'there', '?'], 'greeting'),
  (['Hello'], 'greeting'),
  (['Good', 'day'], 'greeting'),
  (['Tell', 'me', 'my', 'sex'], 'sex'),
  (['I', 'am', 'a', 'female', 'or', 'a', 'male'], 'sex'),
  (['sex'], 'sex'),
  (['Tell', 'me', 'my', 'effectiveDate'], 'effectiveDate'),
  (['effectiveDate'], 'effectiveDate'),
  (['Tell', 'me', 'my', 'salary'], 'salary'),
  (['what', 'is', 'my', 'salary'], 'salary'),
  (['Bye'], 'goodbye'),
  (['See', 'you', 'later'], 'goodbye'),
  (['Goodbye'], 'goodbye'),
  (['Thanks'], 'thanks'),
  (['Thank', 'you'], 'thanks'),
  (['That', 'is', 'helpful'], 'thanks'),
  (['What', 'hours', 'are', 'you', 'open', '?'], 'hours'),
  (['What', 'are', 'your', 'hours', '?'], 'hours'),
  (['Whe

In [10]:
def train_x_train_y(file, seed=None):
    """Turn the patterns in ``file`` into shuffled bag-of-words training data.

    Parameters
    ----------
    file : str
        Path of the JSON patterns file, forwarded to ``pre_processing``.
    seed : int, optional
        When given, the shuffle uses a dedicated ``random.Random(seed)`` so the
        ordering is reproducible. When omitted, the module-level
        ``random.shuffle`` is used, as before.

    Returns
    -------
    dict
        ``"train_x"``: list of bags, one per pattern; each bag is a list with
        one entry per vocabulary word holding the number of times that word's
        stem occurs in the pattern.
        ``"train_y"``: list of one-hot rows (one entry per class) in the same
        order as ``"train_x"``.
    """
    vocabulary = pre_processing(file)
    words = vocabulary["words"]
    classes = vocabulary["classes"]
    documents = vocabulary["documents"]

    # Look up class positions once instead of scanning the list per document.
    class_position = {label: idx for idx, label in enumerate(classes)}

    training = []
    for pattern_tokens, label in documents:
        # Stem the pattern's tokens so they are comparable with the vocabulary.
        stems = [stemming(token) for token in pattern_tokens]
        # Occurrence count of each vocabulary word in this pattern.
        bag = [stems.count(w) for w in words]
        # One-hot row: 1 at the position of this pattern's class, 0 elsewhere.
        output_row = [0] * len(classes)
        output_row[class_position[label]] = 1
        training.append((bag, output_row))

    # Shuffle (bag, label) pairs together so features and labels stay aligned.
    if seed is None:
        random.shuffle(training)
    else:
        random.Random(seed).shuffle(training)

    # Split with plain lists: the bag and the one-hot row generally have
    # different lengths, and building a NumPy array from such ragged pairs
    # raises on recent NumPy versions.
    train_x = [bag for bag, _ in training]         # features
    train_y = [one_hot for _, one_hot in training]  # labels
    return {"train_x": train_x, "train_y": train_y}