## Installing Packages
- To install necessary python modules for NLP chat bot

In [17]:
!pip install nltk numpy scikit-learn

Defaulting to user installation because normal site-packages is not writeable


You should consider upgrading via the 'C:\Program Files\Python39\python.exe -m pip install --upgrade pip' command.


## Importing all the packages

In [30]:
import nltk
import spacy
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Download necessary NLTK data
- downloading punctuations, stopword, and wordnet to increase the accuracy of the chatbot and avoid training of stop words and punctuations like `.` and `,` etc.

In [31]:

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kunal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kunal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kunal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading the spacy model for english
- Load spaCy model using `spacy.load()`.

In [32]:
nlp = spacy.load("en_core_web_sm")

## Load and preprocess the dataset
- Here we first create 2 empty arrays, `questions` and `answers`.
- We know that out data is in the format `Consists of two columns: question \t answer \n . Suitable for simple chatbots. Contains 3725 items`.
- The dataset is uploaded along with the `.ipynb` file or you can direct use the kaggle link [here](https://www.kaggle.com/datasets/grafstor/simple-dialogs-for-chatbot).

In [33]:
def load_dataset(file_path):
    questions = []
    answers = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            q, a = line.strip().split('\t')
            questions.append(q)
            answers.append(a)
    return questions, answers

questions, answers = load_dataset('data.txt')

## Preprocess text
- Tokenize
- Remove stopwords and punctuation
- Lemmatize

In [34]:

def preprocess(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

## Preprocess questions
- Create TF-IDF vectorizer

In [35]:


processed_questions = [preprocess(q) for q in questions]

vectorizer = TfidfVectorizer()
question_vectors = vectorizer.fit_transform(processed_questions)

## Function to get the most similar question
- We first preprocess the user input.
- Transform the input vector into `TF-IDF` vectors.
- We find the similarties array using the `cosine_similarity` function between the `input_vectors` and `question_vectors`.
- We find the similarity index using the `argmax` function.


In [36]:
def get_most_similar_question(user_input):
    processed_input = preprocess(user_input)
    input_vector = vectorizer.transform([processed_input])
    similarities = cosine_similarity(input_vector, question_vectors)
    most_similar_idx = similarities.argmax()
    return most_similar_idx, similarities[0][most_similar_idx]

# Function to generate response using spaCy
- First we set a threshold to similarity (`0.5`) which defines wether the chatbot answers the questioon or not.
- Use spaCy for advanced NLP processing.
- Check for negation in the document.
- Identify the question type asked by the user.
- Modify response based on NLP analysis

In [37]:
def generate_response(user_input):
    idx, similarity = get_most_similar_question(user_input)
    
    if similarity < 0.5: 
        return "I'm sorry, I don't understand. Could you please rephrase your question?"
    
    doc = nlp(user_input)
    
    negation = any(token.dep_ == 'neg' for token in doc)
    
    question_words = ['what', 'when', 'where', 'who', 'why', 'how']
    question_type = next((token.text.lower() for token in doc if token.text.lower() in question_words), None)
    
    response = answers[idx]
    if negation:
        response = "I understand you're asking a negative question. " + response
    if question_type:
        response = f"To answer your {question_type} question: " + response
    
    return response


# Chat loop

In [38]:
print("Chatbot: Hello! How can I help you today? (Type 'quit' to exit)")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        print("Chatbot: Goodbye!")
        break
    response = generate_response(user_input)
    print("User:", user_input)
    print("Chatbot:", response)

Chatbot: Hello! How can I help you today? (Type 'quit' to exit)
User: Hi, how are you
Chatbot: To answer your how question: i'm fine. how about yourself?
User: hiiii then
Chatbot: I'm sorry, I don't understand. Could you please rephrase your question?
User: what do you do in your free time
Chatbot: To answer your what question: it starts at 8 o'clock.
User: which school do you attend
Chatbot: i'm attending Amrita Vishwa Vidyapeetham right now.
Chatbot: Goodbye!


### Evaluate the model


In [40]:
def evaluate_model(test_size=0.2, random_state=42):
    from sklearn.model_selection import train_test_split
    
    # Split the data into training and testing sets
    train_questions, test_questions, train_answers, test_answers = train_test_split(
        questions, answers, test_size=test_size, random_state=random_state
    )
    
    # Preprocess and vectorize the training questions
    processed_train_questions = [preprocess(q) for q in train_questions]
    vectorizer = TfidfVectorizer()
    train_question_vectors = vectorizer.fit_transform(processed_train_questions)
    
    # Initialize counters
    correct = 0
    total = len(test_questions)
    
    for test_q, test_a in zip(test_questions, test_answers):
        # Get the model's response
        response = generate_response(test_q)
        
        # Compare the response to the actual answer
        if response.lower() == test_a.lower():
            correct += 1
    
    # Calculate accuracy
    accuracy = correct / total
    return accuracy

# Evaluate the model
accuracy = evaluate_model()
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.60
