In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import json
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt
import seaborn as sns
from wordcloud import WordCloud,STOPWORDS
import missingno as msno

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from tensorflow.keras.preprocessing import text
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau

from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer

import torch
from torch.utils.data import Dataset

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import BertForSequenceClassification, BertTokenizerFast
# from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from transformers import BertTokenizer#, TFBertForSequenceClassification, BertConfig
from transformers import TrainingArguments, Trainer

torch.cuda.is_available()

from sklearn.metrics import classification_report, confusion_matrix




In [3]:
def load_json_file(filename):
    with open(filename) as f:
        file = json.load(f)
    return file

filename = 'intents_exp.json'

intents = load_json_file(filename)

In [4]:
def create_df():
    df = pd.DataFrame({
        'Pattern' : [],
        'Tag' : []
    })
    
    return df

df = create_df()
df

Unnamed: 0,Pattern,Tag


In [5]:
def extract_json_info(json_file, df):
    
    for intent in json_file['intents']:
        
        for pattern in intent['patterns']:
            
            sentence_tag = [pattern, intent['tag']]
            df.loc[len(df.index)] = sentence_tag
                
    return df

df = extract_json_info(intents, df)
df.head()

Unnamed: 0,Pattern,Tag
0,Hi,greeting
1,How are you?,greeting
2,Is anyone there?,greeting
3,Hello,greeting
4,Good day,greeting


In [6]:

# Ensure NLTK resources are downloaded
# nltk.download('wordnet')
from nltk.corpus import wordnet

# nltk.download('omw-1.4')

def extract_json_info(json_file, df):
    
    for intent in json_file['intents']:
        
        for pattern in intent['patterns']:
            
            sentence_tag = [pattern, intent['tag']]
            df.loc[len(df.index)] = sentence_tag
                
    return df

df = extract_json_info(intents, df)
# df.head()

# Function to get synonyms for a word
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return synonyms

# Function to create new patterns with synonyms
def expand_patterns_with_synonyms(df):
    expanded_data = []
    for _, row in df.iterrows():
        pattern = row['Pattern']
        tag = row['Tag']
        words = pattern.split()
        new_patterns = [pattern]
        
        # Generate new patterns by replacing words with synonyms
        for word in words:
            synonyms = get_synonyms(word)
            for synonym in synonyms:
                new_patterns.append(pattern.replace(word, synonym))
        
        # Add all generated patterns with the same tag to the dataframe
        for new_pattern in set(new_patterns):
            expanded_data.append({'Pattern': new_pattern, 'Tag': tag})
    
    return pd.DataFrame(expanded_data)

df = expand_patterns_with_synonyms(df)

In [7]:
df2 = df.copy()
# df2.head()

In [8]:
def print_shape_df(df, ds_name="df"):
    print(f"{ds_name} dataset has {df.shape[0]} rows and {df.shape[1]} columns")
    
print_shape_df(df, "Chatbot")

Chatbot dataset has 48630 rows and 2 columns


In [9]:
def print_dfInfo(df, ds_name="df"):
    print(f"The info of {ds_name} dataset\n")
    print(df.info())
    
print_dfInfo(df, "Chatbot")

The info of Chatbot dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48630 entries, 0 to 48629
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Pattern  48630 non-null  object
 1   Tag      48630 non-null  object
dtypes: object(2)
memory usage: 760.0+ KB
None


In [10]:
def num_classes(df, target_col, ds_name="df"):
    print(f"The {ds_name} dataset has {len(df[target_col].unique())} classes")
    
num_classes(df, 'Tag', "Chatbot")

The Chatbot dataset has 38 classes


In [11]:
def check_null(df, ds_name='df'):
    print(f"Null Values in each col in the {ds_name} dataset:\n")
    print(df.isnull().sum())
    
check_null(df, "Chatbot")

Null Values in each col in the Chatbot dataset:

Pattern    0
Tag        0
dtype: int64


In [12]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ghostmaga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
stemmer = PorterStemmer()
ignore_words=['?', '!', ',', '.']

def preprocess_pattern(pattern, stemmer, ignore_words):
    words = word_tokenize(pattern.lower())
    stemmed_words = [stemmer.stem(word) for word in words if word not in ignore_words]
    return " ".join(stemmed_words)  

df['Pattern'] = df['Pattern'].apply(lambda x: preprocess_pattern(x, stemmer, ignore_words))

In [15]:
print(df['Pattern'].isnull().sum())

0


In [16]:
def get_corpus(series):
    words = []
    for text in series:
        for word in text.split():
            words.append(word.strip())
    return words

corpus = get_corpus(df.Pattern)
corpus[:5]

['hello', 'hawaii', 'howdi', 'aloha', 'state']

In [17]:
print(f"dataset contains {len(corpus)} words")

dataset contains 263580 words


In [18]:
from collections import Counter
counter = Counter(corpus)
most_common = counter.most_common(10)
most_common = dict(most_common)
most_common

{'the': 13272,
 'you': 9518,
 'i': 7798,
 'what': 6520,
 'for': 5572,
 'is': 5538,
 'can': 5020,
 'a': 4834,
 'colleg': 4682,
 'how': 4670}

In [19]:
def get_top_text_ngrams(corpus, n,g):
    vec = CountVectorizer(ngram_range=(1, 1)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [20]:
labels = df2['Tag'].unique().tolist()
labels = [s.strip() for s in labels]
labels

['greeting',
 'goodbye',
 'creator',
 'name',
 'hours',
 'number',
 'course',
 'fees',
 'location',
 'hostel',
 'event',
 'document',
 'floors',
 'syllabus',
 'library',
 'infrastructure',
 'canteen',
 'menu',
 'placement',
 'ithod',
 'computerhod',
 'extchod',
 'principal',
 'sem',
 'admission',
 'scholarship',
 'facilities',
 'college intake',
 'uniform',
 'committee',
 'random',
 'swear',
 'vacation',
 'sports',
 'salutaion',
 'task',
 'ragging',
 'head']

In [21]:
num_labels = len(labels)
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label:id for id, label in enumerate(labels)}

In [22]:
id2label

{0: 'greeting',
 1: 'goodbye',
 2: 'creator',
 3: 'name',
 4: 'hours',
 5: 'number',
 6: 'course',
 7: 'fees',
 8: 'location',
 9: 'hostel',
 10: 'event',
 11: 'document',
 12: 'floors',
 13: 'syllabus',
 14: 'library',
 15: 'infrastructure',
 16: 'canteen',
 17: 'menu',
 18: 'placement',
 19: 'ithod',
 20: 'computerhod',
 21: 'extchod',
 22: 'principal',
 23: 'sem',
 24: 'admission',
 25: 'scholarship',
 26: 'facilities',
 27: 'college intake',
 28: 'uniform',
 29: 'committee',
 30: 'random',
 31: 'swear',
 32: 'vacation',
 33: 'sports',
 34: 'salutaion',
 35: 'task',
 36: 'ragging',
 37: 'head'}

In [23]:
label2id

{'greeting': 0,
 'goodbye': 1,
 'creator': 2,
 'name': 3,
 'hours': 4,
 'number': 5,
 'course': 6,
 'fees': 7,
 'location': 8,
 'hostel': 9,
 'event': 10,
 'document': 11,
 'floors': 12,
 'syllabus': 13,
 'library': 14,
 'infrastructure': 15,
 'canteen': 16,
 'menu': 17,
 'placement': 18,
 'ithod': 19,
 'computerhod': 20,
 'extchod': 21,
 'principal': 22,
 'sem': 23,
 'admission': 24,
 'scholarship': 25,
 'facilities': 26,
 'college intake': 27,
 'uniform': 28,
 'committee': 29,
 'random': 30,
 'swear': 31,
 'vacation': 32,
 'sports': 33,
 'salutaion': 34,
 'task': 35,
 'ragging': 36,
 'head': 37}

In [24]:
df2['labels'] = df2['Tag'].map(lambda x: label2id[x.strip()])
df2.head()

Unnamed: 0,Pattern,Tag,labels
0,hello,greeting,0
1,Hawaii,greeting,0
2,howdy,greeting,0
3,Aloha State,greeting,0
4,hi,greeting,0


In [25]:
X = list(df2['Pattern'])
X[:5]

['hello', 'Hawaii', 'howdy', 'Aloha State', 'hi']

In [26]:
y = list(df2['labels'])
y[:5]

[0, 0, 0, 0, 0]

In [27]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 123)

In [28]:
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = max(len(seq) for seq in X_train_seq)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Build custom model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(num_labels, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
history = model.fit(
    X_train_padded, np.array(y_train),
    validation_data=(X_test_padded, np.array(y_test)),
    epochs=5,
    batch_size=32
)

# Evaluate the model
y_pred = model.predict(X_test_padded).argmax(axis=1)
print("Classification Report:")
# print(classification_report(y_test, y_pred, target_names=labels))

# Save the model and tokenizer
model.save("chatbot_model11.h5")

import pickle
with open('tokenizer11.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Predict function
def predict(text, tokenizer, model):
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')
    prediction = model.predict(padded).argmax(axis=1)
    return id2label[prediction[0]]

# Chat function



Epoch 1/5
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 31ms/step - accuracy: 0.3127 - loss: 2.3111 - val_accuracy: 0.9074 - val_loss: 0.3426
Epoch 2/5
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 32ms/step - accuracy: 0.9306 - loss: 0.2610 - val_accuracy: 0.9626 - val_loss: 0.1310
Epoch 3/5
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 30ms/step - accuracy: 0.9717 - loss: 0.1026 - val_accuracy: 0.9821 - val_loss: 0.0684
Epoch 4/5
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 30ms/step - accuracy: 0.9815 - loss: 0.0657 - val_accuracy: 0.9851 - val_loss: 0.0594
Epoch 5/5
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 31ms/step - accuracy: 0.9897 - loss: 0.0395 - val_accuracy: 0.9863 - val_loss: 0.0515
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step




Classification Report:


In [29]:
def chat(model, tokenizer):
     print("Chatbot: Hi! I am your virtual assistant. Type 'quit' to exit.\n")
     text = input("User: ").strip().lower()

     while text != 'quit':
          pred_tag = predict(text, tokenizer, model)
          responses = [intent['responses'] for intent in intents['intents'] if intent['tag'] == pred_tag]
          response = random.choice(responses[0]) if responses else "I'm sorry, I didn't understand that."
          
          seq = tokenizer.texts_to_sequences([text])
          padded = pad_sequences(seq, maxlen=max_len, padding='post')
          
          if max(model.predict(padded)[0]) > 0.7:
               print(f"Chatbot: {response}\n")
          else:
               print("Chatbot: I'm sorry, I didn't understand that.\n")
          text = input("User: ").strip().lower()

# Load and start chatting
chat(model, tokenizer)

Chatbot: Hi! I am your virtual assistant. Type 'quit' to exit.



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Chatbot: I'm sorry, I didn't understand that.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Chatbot: Good to see you again!

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Chatbot: Goodbye!

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Chatbot: Nazarbayev University offers courses in Engineering, Sciences, Social Sciences, Business, Medicine, and more. For details, visit the official course catalog on the NU website.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/