In [4]:
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
import os
from zipfile import ZipFile

from langdetect import detect
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc

sns.set(color_codes=True) # adds a nice background to the graphs

## time
import time

%matplotlib inline

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
import random

In [7]:
# !pip install pycaret

In [8]:
def read_csv(file):
    """Load a CSV file into a DataFrame.

    :param file: file path (anything ``pd.read_csv`` accepts as its first argument)
    :return: the DataFrame returned by ``pd.read_csv``
    """
    return pd.read_csv(file)

In [9]:
def read_info(df):
    """Print a structural summary of a DataFrame.

    Printed, in order:
        - shape, number of rows, number of columns
        - column names
        - dtype of every column
        - numeric columns (reported as "non-categorical")
        - object-dtype columns (reported as "categorical")

    :param df: DataFrame to describe
    :return: None; everything is written to stdout
    """
    df_shape = df.shape
    all_cols = np.asarray(df.columns)
    numeric_cols = np.asarray(df.select_dtypes('number').columns)
    # only object-dtype columns are selected here; other non-numeric dtypes are not listed
    object_cols = np.asarray(df.select_dtypes('object').columns)

    print(f'shape of dataset: {df_shape} ; number of rows: {df_shape[0]} ; number of columns: {df_shape[1]}')
    print('')
    print(f'cloumns: {all_cols}')
    print('')
    print('feature types::')
    print(df.dtypes)
    print('')
    print(f'non-categorical features: {numeric_cols}')
    print(f'categorical features: {object_cols}')
    print('')

In [10]:
def print_missing_values(df):
    """Print the count and the percentage of missing values per column.

    :param df: DataFrame to inspect (not modified)
    :return: None; everything is written to stdout
    """
    # count the nulls once and reuse the result for both reports
    null_counts = df.isnull().sum()
    print('null values - info')
    print(null_counts)
    print('')

    # missing values percentage
    percent_missing = null_counts * 100 / len(df)
    missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
    print('missing values percentage::')
    print(missing_value_df)
    print('')

In [11]:
def handle_missing_values(df, threshold):
    """Report and resolve the missing values of a DataFrame, column by column.

    For every column that contains at least one missing value:
        - if its missing percentage is greater than ``threshold``, the whole
          column is dropped;
        - otherwise the gaps are imputed: with the column mean for numeric
          columns, with the most frequent value for any other dtype (a mean
          is not defined for text columns).

    The DataFrame is modified in place and is also returned.

    :param df: DataFrame to clean (mutated in place)
    :param threshold: missing-value percentage above which a column is dropped
    :return: the same DataFrame object after dropping / imputing
    """
    # check for null values
    null_counts = df.isnull().sum()
    print('null values - info')
    print(null_counts)
    print('')

    # missing values percentage
    percent_missing = round(null_counts * 100 / len(df), 2)
    missing_value_df = pd.DataFrame({'percent_missing': percent_missing}).sort_values(by=['percent_missing'], ascending=False)
    print('missing values percentage (in desc order )::')
    print(missing_value_df)
    print('')

    cols_need_to_drop = []
    for col in df.columns:
        print('column:::', col, ';; percent_missing:: ', percent_missing[col])
        # decide on the raw count: a tiny fraction can round down to 0.00 percent
        if null_counts[col] == 0:
            continue
        if percent_missing[col] > threshold:
            cols_need_to_drop.append(col)
        elif pd.api.types.is_numeric_dtype(df[col]):
            # assign back instead of fillna(inplace=True) on a column selection,
            # which is not guaranteed to write through to the parent frame
            df[col] = df[col].fillna(df[col].mean())
        else:
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # cols that are need to drop
    print('dropping columns:::', cols_need_to_drop)
    df.drop(cols_need_to_drop, axis=1, inplace=True)
    print('')
    print('null/missing values handled')
    return df

In [12]:
def handle_duplicates(df):
    """Drop fully identical rows (duplicates) from a DataFrame.

    The DataFrame is modified in place and is also returned.

    :param df: DataFrame to de-duplicate (mutated in place)
    :return: the same DataFrame object without duplicated rows
    """
    # .any() stops at the first duplicate instead of summing every flag in Python
    if df.duplicated().any():
        df.drop_duplicates(inplace=True)
    print('duplicates handled')
    print('')
    return df

In [13]:
def list_unique_values(df):
    """Print the unique values of every column of a DataFrame.

    :param df: DataFrame to inspect (not modified)
    :return: None; everything is written to stdout
    """
    # unique values in each column
    for col in df.columns:
        print(f'unique values in {col} : ')
        print(df[col].unique())
        print('')

In [14]:
def value_counts(df):
    """Print the value counts of every column of a DataFrame.

    :param df: DataFrame to inspect (not modified)
    :return: None; everything is written to stdout
    """
    # value counts in each column
    for col in df.columns:
        # the label used to read 'unique values in', copied from list_unique_values
        print(f'value counts in {col} : ')
        print(df[col].value_counts())
        print('')

In [15]:
#Performance metrics -MICRO/MACRO/WEIGHTED/SAMPLES
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

average_types = ['micro', 'weighted', 'macro']


def print_performance_scores(y_val, predicted):
    """Print accuracy, recall, F1 and precision for each averaging strategy.

    One section is printed per entry of the module-level ``average_types``
    list. ``precision_score`` is not imported in this cell; it comes from the
    notebook's main import cell.

    :param y_val: true labels
    :param predicted: predicted labels
    :return: None; everything is written to stdout
    """
    # accuracy does not depend on the averaging strategy, so compute it once
    accuracy = accuracy_score(y_val, predicted)
    for average_type in average_types:
        print('----------------'+average_type+'--------------------------')
        print('Accuracy score: ', accuracy)
        print('Recall score: ', recall_score(y_val, predicted, average=average_type))
        print('F1 score: ', f1_score(y_val, predicted, average=average_type))
        print('Precision score: ', precision_score(y_val, predicted, average=average_type))

In [16]:
%pwd

'/content/drive/MyDrive/AI&ML/NLP/project-1'

In [80]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import random

In [81]:
# Resources needed by nltk.word_tokenize and WordNetLemmatizer.
# 'punkt_tab' is the tokenizer data newer NLTK releases look up for
# word_tokenize; 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [82]:
# Reading JSON data
# Open inside the `with` so the handle cannot outlive the block, and decode as
# UTF-8 explicitly instead of relying on the platform default encoding.
# Undecodable bytes are still skipped (errors='ignore'), as before.
with open('GL+Bot.json', 'r', encoding='utf-8', errors='ignore') as file:
    data = json.load(file)

In [83]:
# Pretty Printing json data

print(json.dumps(data, indent=4, sort_keys=True))

{
    "intents": [
        {
            "context_set": "",
            "patterns": [
                "hi",
                "how are you",
                "is anyone there",
                "hello",
                "whats up",
                "hey",
                "yo",
                "listen",
                "please help me",
                "i am learner from",
                "i belong to",
                "aiml batch",
                "aifl batch",
                "i am from",
                "my pm is",
                "blended",
                "online",
                "i am from",
                "hey ya",
                "talking to you for first time"
            ],
            "responses": [
                "Hello! how can i help you ?"
            ],
            "tag": "Intro"
        },
        {
            "context_set": "",
            "patterns": [
                "thank you",
                "thanks",
                "cya",
                "see you",
              

In [84]:
# Print 'yes' once for every intent that defines more than one response
for x in data['intents']:
    if len(x['responses']) <= 1:
        continue
    print('yes')

In [85]:
# Reference DataFrame: one row per (tag, pattern) pair, carrying the first
# response of the intent.
# The rows are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
reference_rows = [
    {'tag': i['tag'], 'patterns': pattern, 'response': i['responses'][0]}
    for i in data['intents']
    for pattern in i['patterns']
]
df_ref = pd.DataFrame(reference_rows, columns=['tag', 'patterns', 'response'])

In [86]:
df_ref.head(10)

Unnamed: 0,tag,patterns,response
0,Intro,hi,Hello! how can i help you ?
1,Intro,how are you,Hello! how can i help you ?
2,Intro,is anyone there,Hello! how can i help you ?
3,Intro,hello,Hello! how can i help you ?
4,Intro,whats up,Hello! how can i help you ?
5,Intro,hey,Hello! how can i help you ?
6,Intro,yo,Hello! how can i help you ?
7,Intro,listen,Hello! how can i help you ?
8,Intro,please help me,Hello! how can i help you ?
9,Intro,i am learner from,Hello! how can i help you ?


In [87]:
df_ref.shape

(128, 3)

In [88]:
df_ref.sample(10)

Unnamed: 0,tag,patterns,response
14,Intro,my pm is,Hello! how can i help you ?
107,Bot,hours of operation,I am your virtual learning assistant
8,Intro,please help me,Hello! how can i help you ?
94,NN,otimizer,Link: Neural Nets wiki
39,Olympus,olympus window not working,Link: Olympus wiki
102,Bot,what is your name,I am your virtual learning assistant
55,SL,i am not able to understand knn imputer,Link: Machine Learning wiki
19,Intro,talking to you for first time,Hello! how can i help you ?
12,Intro,aifl batch,Hello! how can i help you ?
86,NN,artificial intelligence,Link: Neural Nets wiki


In [89]:
# Accumulators for the corpus: vocabulary tokens, intent tags, (tokens, tag) pairs
words, classes, documents = [], [], []
# Tokens that are excluded when the vocabulary is built
ignore_words = ['?', '!']

In [90]:
# Tokenize each pattern and add it, with its tag, to the corpus.
# The accumulators are reset here so that re-running this cell does not append
# the whole corpus a second time.
words = []
classes = []
documents = []

intents = data['intents']
for intent in intents:
    tag = intent['tag']
    for pattern in intent['patterns']:
        # tokenize each word
        wrd = nltk.word_tokenize(pattern)
        words.extend(wrd)
        # add documents in the corpus
        documents.append((wrd, tag))

    # add to our classes list (only intents that contributed a pattern, as before)
    if intent['patterns'] and tag not in classes:
        classes.append(tag)

In [91]:
# Vocabulary: lower-case and lemmatize every token, skip the ignored ones,
# then de-duplicate and sort
lemmatizer = WordNetLemmatizer()
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
})

# Intent tags: de-duplicate and sort
classes = sorted(set(classes))

# documents = (pattern tokens, intent tag) pairs
print(len(documents), "documents")

# classes = intent tags
print(len(classes), "classes", classes)

# words = the whole vocabulary
print(len(words), "unique words", words)

128 documents
8 classes ['Bot', 'Exit', 'Intro', 'NN', 'Olympus', 'Profane', 'SL', 'Ticket']
158 unique words ['a', 'able', 'access', 'activation', 'ada', 'adam', 'aifl', 'aiml', 'am', 'an', 'ann', 'anyone', 'are', 'artificial', 'backward', 'bad', 'bagging', 'batch', 'bayes', 'belong', 'best', 'blended', 'bloody', 'boosting', 'bot', 'buddy', 'classification', 'contact', 'create', 'cross', 'cya', 'day', 'deep', 'did', 'diffult', 'do', 'ensemble', 'epoch', 'explain', 'first', 'for', 'forest', 'forward', 'from', 'function', 'good', 'goodbye', 'gradient', 'great', 'hate', 'have', 'hell', 'hello', 'help', 'helped', 'hey', 'hi', 'hidden', 'hour', 'how', 'hyper', 'i', 'imputer', 'in', 'intelligence', 'is', 'jerk', 'joke', 'knn', 'later', 'layer', 'learner', 'learning', 'leaving', 'link', 'listen', 'logistic', 'lot', 'machine', 'me', 'ml', 'my', 'naive', 'name', 'nb', 'net', 'network', 'neural', 'no', 'not', 'of', 'olympus', 'olypus', 'on', 'online', 'operation', 'opertions', 'otimizer', 'para

In [92]:
# Persist the vocabulary and the class labels for inference.
# The context managers guarantee both files are flushed and closed before they
# are read back later in the notebook.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

In [93]:
# create our training data: one [bag_of_words, one_hot_tag] pair per document
training = []

# template for the one-hot output row: one slot per class
output_empty = [0] * len(classes)

for pattern_tokens, tag in documents:
    # lemmatize each token of the pattern so it is comparable with the vocabulary
    pattern_lemmas = {lemmatizer.lemmatize(word.lower()) for word in pattern_tokens}

    # bag of words: 1 where the vocabulary word occurs in the pattern, else 0
    bag = [1 if w in pattern_lemmas else 0 for w in words]

    # output is a '0' for each tag and '1' for the current tag
    output_row = list(output_empty)
    output_row[classes.index(tag)] = 1

    training.append([bag, output_row])

# shuffle the (features, label) pairs together
random.shuffle(training)

# Split into features and labels directly from the list. Packing the pairs
# into a single np.array is ragged (len(words) vs len(classes)) and raises
# ValueError on NumPy >= 1.24. `training` therefore stays a plain list.
# X - patterns, Y - intents
train_x = [features for features, _ in training]
train_y = [label for _, label in training]
print("Training data created")

Training data created


In [94]:
# Feed-forward classifier with three Dense layers:
#   128 relu units -> dropout -> 64 relu units -> dropout -> softmax output
# The input width is the length of one training bag and the output width is
# the length of one one-hot label row.
model = Sequential([
    Dense(128, input_shape=(len(train_x[0]),), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(train_y[0]), activation='softmax'),
])

In [95]:
# Compile model. SGD with Nesterov accelerated gradient gives good results for this model
# `lr` and `decay` are legacy SGD arguments that are deprecated or removed in
# current Keras releases. The InverseTimeDecay schedule below reproduces the
# legacy decay rule: lr = 0.01 / (1 + 1e-6 * step).
from tensorflow.keras.optimizers.schedules import InverseTimeDecay

lr_schedule = InverseTimeDecay(initial_learning_rate=0.01, decay_steps=1, decay_rate=1e-6)
sgd = SGD(learning_rate=lr_schedule, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [96]:
# fitting and saving the model
# Model.save's second positional argument is `overwrite`, not the training
# history, so `hist` is no longer passed to it; it stays available in `hist`.
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
model.save('chatbot.h5')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [97]:
from tensorflow.keras.models import load_model
model = load_model('chatbot.h5')
# NOTE: pickle.load can execute arbitrary code from the file; only load the
# pickles written by this notebook. Context managers close the handles.
with open('words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open('classes.pkl', 'rb') as classes_file:
    classes = pickle.load(classes_file)

In [98]:
def clean_up_sentence(sentence):
    """Tokenize a sentence and return its lower-cased, lemmatized tokens.

    Uses ``nltk.word_tokenize`` and the notebook-level ``lemmatizer``.

    :param sentence: raw text to process
    :return: list of lemmatized tokens, in sentence order
    """
    return [
        lemmatizer.lemmatize(token.lower())
        for token in nltk.word_tokenize(sentence)
    ]

In [99]:
def bag_of_words(sentence, words, show_details=True):
    """Encode a sentence as a 0/1 vector over the vocabulary ``words``.

    Position ``i`` of the result is 1 when ``words[i]`` equals one of the
    tokens returned by ``clean_up_sentence(sentence)``, otherwise 0.

    :param sentence: raw text to encode
    :param words: vocabulary; fixes the length and order of the vector
    :param show_details: when true, print every vocabulary word that matched
    :return: numpy array with one entry per vocabulary word
    """
    tokens = clean_up_sentence(sentence)

    # start from an all-zero vector, one slot per vocabulary word
    bag = [0] * len(words)
    for token in tokens:
        matches = [(idx, vocab_word) for idx, vocab_word in enumerate(words) if vocab_word == token]
        for idx, vocab_word in matches:
            # mark the vocabulary position of the matched token
            bag[idx] = 1
            if show_details:
                print ("found in bag: %s" % vocab_word)
    return np.array(bag)


In [100]:
def predict_class(sentence, model, error_threshold=0.25):
    """Predict the intents of a sentence, most probable first.

    The sentence is encoded with ``bag_of_words`` over the notebook-level
    ``words`` vocabulary and passed to ``model.predict``. Indices of the
    prediction are mapped to tags through the notebook-level ``classes``.

    :param sentence: raw user text
    :param model: object exposing ``predict``; called with a single-row array
    :param error_threshold: predictions with a score not greater than this
        value are discarded (default 0.25, the previously hard-coded value)
    :return: list of ``{"intent": tag, "probability": str(score)}`` dicts,
        sorted by descending score; empty when nothing exceeds the threshold
    """
    p = bag_of_words(sentence, words, show_details=False)
    res = model.predict(np.array([p]))[0]
    # filter out predictions below the threshold
    results = [(i, r) for i, r in enumerate(res) if r > error_threshold]
    # sort by strength of probability
    results.sort(key=lambda pair: pair[1], reverse=True)
    return [{"intent": classes[i], "probability": str(r)} for i, r in results]

In [101]:
def get_response(ints, intents_json, default_response="Sorry, I did not understand that. Could you please rephrase?"):
    """Pick a response for the top predicted intent.

    :param ints: list of prediction dicts with an ``'intent'`` key; only the
        first entry is used
    :param intents_json: dict with an ``'intents'`` list whose items carry
        ``'tag'`` and ``'responses'``
    :param default_response: text returned when ``ints`` is empty, when no
        intent carries the predicted tag, or when the matching intent has no
        responses. These cases used to raise IndexError / UnboundLocalError.
    :return: one response string
    """
    if not ints:
        return default_response

    tag = ints[0]['intent']
    for intent in intents_json['intents']:
        if intent['tag'] == tag:
            responses = intent['responses']
            # only the first intent carrying the tag is considered
            return random.choice(responses) if responses else default_response
    return default_response

In [102]:
def chatbot_response(msg):
    """Return the bot's reply to a user message.

    Predicts the intents of ``msg`` with the notebook-level ``model`` and looks
    the response up in the notebook-level ``data``.

    :param msg: raw user text
    :return: whatever ``get_response`` returns for the predicted intents
    """
    return get_response(predict_class(msg, model), data)

In [103]:

# Text chat utility function
def chat():
    """Run an interactive console chat until the user types ``quit``.

    Prints a greeting, then repeatedly reads a line from stdin and prints the
    reply from ``chatbot_response``. The ``quit`` check is case-insensitive.
    """
    greeting = (
        "Hello I am olympus, your virtual assistant for great learning.",
        "How can I help you ?",
        "Chat with olympus (type: quit to stop)",
    )
    for line in greeting:
        print(line)

    keep_chatting = True
    while keep_chatting:
        inp = input("\n\nYou:")
        keep_chatting = inp.lower() != "quit"
        if keep_chatting:
            print("olympus: " + chatbot_response(inp) + '\n\n')

In [104]:
# HI
# whats up
# useless piece of shit
# explain me how machine learning works
# lot of problems with olympus
# naive bayes
# listen
# hours of operation
# goodbye
# quit
chat()

Hello I am olympus, your virtual assistant for great learning.
How can I help you ?
Chat with olympus (type: quit to stop)


You:HI
olympus: Hello! how can i help you ?




You:whats up
olympus: Hello! how can i help you ?




You:useless piece of shit
olympus: Please use respectful words




You:explain me how machine learning works
olympus: Link: Machine Learning wiki 




You:lot of problems with olympus
olympus: Link: Olympus wiki




You:naive bayes
olympus: Link: Machine Learning wiki 




You:listen
olympus: Hello! how can i help you ?




You:hours of operation
olympus: I am your virtual learning assistant




You:goodbye
olympus: I hope I was able to assist you, Good Bye




You:quit
