# **Data Preprocessing**

In [1]:
# Mount Google Drive at /content/drive; the corpus archive and the intents file
# used later in this notebook are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import zipfile
import os
import pandas as pd
import ast
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Location of the corpus archive on the mounted Drive (one line per collaborator)
# zip_path = '/content/drive/MyDrive/Data/movie_corpus.zip' # Kasha~af
zip_path = '/content/drive/MyDrive/Internship/ChatBot.zip' # Ahmed S.

# Directory that receives the unpacked corpus files
extracted_path = '/content/movie_corpus'

# Unpack every member of the archive into that directory
with zipfile.ZipFile(zip_path, 'r') as corpus_archive:
    corpus_archive.extractall(extracted_path)

# List what was extracted as a quick sanity check
print(os.listdir(extracted_path))

['movie_titles_metadata.txt', 'movie_characters_metadata.txt', '.DS_Store', 'movie_lines.txt', 'raw_script_urls.txt', 'chameleons.pdf', 'movie_conversations.txt', 'README.txt']


In [4]:
# Locate the two corpus files needed to rebuild the dialogues
lines_path = os.path.join(extracted_path, 'movie_lines.txt')
conversations_path = os.path.join(extracted_path, 'movie_conversations.txt')

# Fields in both files are delimited by the literal token "+++$+++".
# pandas treats a multi-character 'sep' as a regular expression (hence
# engine='python'), so '+' and '$' must be escaped. The pattern is a raw string:
# in a normal string literal "\+" and "\$" are invalid escape sequences, which
# newer Python versions warn about.
# Note: the pattern does not consume the spaces around the token, so parsed
# fields keep their surrounding whitespace.
FIELD_SEPARATOR = r'\+\+\+\$\+\+\+'

# Read movie_lines.txt
# 'header=None' : the file has no header row
# 'names'       : column headers supplied by hand
lines = pd.read_csv(
    lines_path,
    sep=FIELD_SEPARATOR,
    header=None,
    engine='python',
    names=['lineID', 'characterID', 'movieID', 'character', 'text'],
    encoding='latin-1',
)

# Read movie_conversations.txt (same separator and options, different columns)
conversations = pd.read_csv(
    conversations_path,
    sep=FIELD_SEPARATOR,
    header=None,
    engine='python',
    names=['character1ID', 'character2ID', 'movieID', 'utteranceIDs'],
    encoding='latin-1',
)

In [5]:
# Map every line ID to the text of that line.
# Sample of the resulting shape:
# {
#    1: 'Hello, how are you?',
#    2: "I'm fine, thank you.",
#    3: 'Good morning!',
#    4: 'Hi there.'
# }
id2line = dict(zip(lines['lineID'], lines['text']))

# The parsed line IDs carry an extra space, so rebuild the mapping with every
# space removed from the keys (str.replace is a no-op when there is none).
id2line_new = {raw_id.replace(" ", ""): line_text for raw_id, line_text in id2line.items()}
id2line = id2line_new

In [6]:
# Rebuild each conversation as the ordered list of its line texts
conversations_data = []

for row in conversations.itertuples():
    # utteranceIDs holds the string form of a list; parse it into a real list of IDs
    line_ids = ast.literal_eval(row.utteranceIDs)
    # Look up the text for every known ID; IDs missing from id2line are skipped
    texts = [id2line[line_id] for line_id in line_ids if line_id in id2line]
    # Keep the conversation only when at least one line was found
    if len(texts) > 0:
        conversations_data.append(texts)

In [7]:
# (pattern, replacement) pairs used to expand contractions, applied in order.
# Word boundaries keep the rules from firing inside other words or on quoted
# words such as 'do.
_CONTRACTIONS = (
    (r"\bi'm\b", "i am"),
    (r"\bhe's\b", "he is"),
    (r"\bshe's\b", "she is"),
    (r"\bthat's\b", "that is"),
    (r"\bwhat's\b", "what is"),
    (r"\bwhere's\b", "where is"),
    (r"\bwon't\b", "will not"),
    (r"\bcan't\b", "can not"),
    (r"'ll\b", " will"),
    (r"'ve\b", " have"),
    (r"'re\b", " are"),
    (r"'d\b", " would"),
)


def preprocess_text(text):
    """
    Normalises one line of dialogue for tokenisation.

    Steps, in order:
        1. Missing values (None or float NaN, which pandas uses for empty fields)
           become an empty string instead of the literal words "none" / "nan".
        2. The value is converted to str and lowercased.
        3. Contractions are expanded ("i'm" -> "i am", "can't" -> "can not", ...).
           This must happen BEFORE punctuation is stripped: once the apostrophe
           is gone, none of the contraction patterns can match.
        4. Every run of characters other than a-z / 0-9 is replaced by one space.
        5. Leading and trailing spaces are removed.

    Args:   text: The raw line; usually a str, but any value is accepted.

    Returns:  str: The cleaned, lowercase text containing only a-z, 0-9 and single spaces.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    # Expand contractions while the apostrophes are still present
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    # Collapse punctuation and any other non-alphanumeric run into a single space
    text = re.sub(r"[^a-z0-9]+", " ", text)

    # Remove leading and trailing spaces
    return text.strip()

# Clean every line of every conversation with preprocess_text,
# keeping the conversation / line nesting intact
preprocessed_conversations = [
    [preprocess_text(utterance) for utterance in dialogue]
    for dialogue in conversations_data
]

In [8]:
# Build (input, target) pairs: every line is an input and the line that
# follows it in the same conversation is its target.
input_texts = []
target_texts = []

for dialogue in preprocessed_conversations:
    # zip stops at the shorter argument, so a conversation with fewer than
    # two lines contributes no pairs
    for prompt, reply in zip(dialogue, dialogue[1:]):
        input_texts.append(prompt)   # input text
        target_texts.append(reply)   # target text

In [9]:
# Build one shared vocabulary over inputs and targets.
# Fitting splits the texts into words, counts how often each word occurs and
# assigns each word a unique integer, e.g.
# 1: i (most frequent, for example), 2: am (second most frequent, for example)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)

# Encode every text as a sequence of integers, e.g.
# 'hello how are you'   becomes [6, 7, 8, 5]
# 'i am fine thank you' becomes [1, 2, 3, 4, 5]
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# Length of the longest sequence on either side
longest_input = max(map(len, input_sequences))
longest_target = max(map(len, target_sequences))
max_length = max(longest_input, longest_target)

# Pad shorter sequences with zeros at the end so all rows share max_length
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_length, padding='post')


In [10]:
# TODO: from here on, split the data into training and testing sets,
# then choose the network architecture (e.g. LSTM or Transformer)

# **Natural Language Understanding**

NLU is a subfield of AI that focuses on enabling computers to understand the meaning and intent behind human language.

NLU is used in this project to:
1.  Understand the user's goals and intentions.
2.  Extract relevant information from user input.
3.  Respond in a way that aligns with the identified intent and context.


In [11]:
import spacy
import json
import random

## **Intent Recognition**

In [12]:
INTENT_PATH = '/content/drive/MyDrive/Internship/intents.json'

# Open the file and parse the JSON data.
# The encoding is passed explicitly: without it open() uses a platform-dependent
# default, which can mis-decode non-ASCII characters in patterns or responses.
with open(INTENT_PATH, encoding='utf-8') as f:
    # Keep only the list stored under the top-level 'intents' key
    intents = json.load(f)['intents']

In [13]:
# The intents.json file holds a list of intent dictionaries, each with the following structure:
# {
#     "tag": "unique_intent_identifier",  # e.g., "greeting", "goodbye"
#     "patterns": [ List of user phrases that trigger this intent ],
#     "responses": [ List of possible responses the chatbot can give ],
#     "context_set": ""   used for context-aware chatbots
# }


def get_intent(user_input):
    """
    Searches for a matching intent based on user input.

    Iterates through the intents in the loaded `intents` list in order and, for
    each one, through its patterns (user phrases). A pattern matches when it
    occurs in the user input as a whole word or phrase, compared
    case-insensitively: it must not be directly preceded or followed by a word
    character. A plain substring test would let the pattern "hi" match inside
    "this" or "nothing".

    Blank patterns are skipped; an empty string would otherwise match every
    input and hide all intents that come after it.

    Args:   user_input (str): The user's input phrase.

    Returns:  dict or None: The first matching intent dictionary, otherwise None.
    """
    # Lowercase once instead of once per pattern
    text = user_input.lower()

    for intent in intents:
        for pattern in intent['patterns']:
            phrase = pattern.lower().strip()
            if not phrase:
                continue
            # re.escape makes punctuation in the pattern (e.g. "?") literal;
            # the look-arounds require a non-word character or a string edge on both sides
            if re.search(r"(?<!\w)" + re.escape(phrase) + r"(?!\w)", text):
                return intent
    return None


def get_response(intent):
    """
    Picks one reply at random for the given intent.

    The `intent` dictionary is expected to carry a "responses" key holding a
    list of candidate replies; one of them is chosen with `random.choice`.

    Args:   intent (dict): The intent dictionary containing user patterns and responses.

    Returns:  str: One entry of the intent's "responses" list, chosen at random.
    """
    candidates = intent['responses']
    chosen = random.choice(candidates)
    return chosen

## **Named Entity Recognition**

In [14]:
# Load the spaCy English small model once at module level;
# extract_entities() calls this `nlp` object on every user input
nlp = spacy.load('en_core_web_sm')

def extract_entities(user_input):
    """
    Returns the named entities spaCy finds in the user's input.

    The text is run through the module-level spaCy pipeline (`nlp`); every
    entity in the resulting document's `ents` is reported as a pair of its
    text and its label (the type of entity).

    Example:
        User Input: Barack Obama was the president of United States of America
        Entities:
            1) Barack Obama -> Person
            2) United States of America -> GPE (GeoPolitical Entity)

    Args:   user_input (str): The user's input text.

    Returns:  list: A list of tuples where each tuple contains (entity_text, entity_label).
    """
    parsed = nlp(user_input)

    found = []
    for entity in parsed.ents:
        found.append((entity.text, entity.label_))
    return found

## **Context Management**

In [15]:
# Module-level conversational context; starts as an empty string and is
# rebound through set_context()
current_context = ""

# Getter and Setter functions for context
def get_context():
    """Return the current value of the module-level `current_context`."""
    return current_context

def set_context(context):
    """Rebind the module-level `current_context` to `context`."""
    # `global` makes the assignment target the module variable instead of creating a local
    global current_context
    current_context = context

In [16]:
# Look-up table used when setting the context: intent tag -> the intent's
# 'context_set' value, or None when that value is empty/falsy
intent_context_map = {
    entry['tag']: (entry['context_set'] or None)
    for entry in intents
}

In [17]:
def handle_input(user_input):
    """
    Turns one user message into a reply plus the entities found in it.

    `get_intent` is asked which intent the message matches.

    When nothing matches, a fixed fallback reply is returned together with an
    empty entity list; the conversational context is left as it was.

    When an intent matches:
        - `get_response` supplies a random reply from that intent.
        - `extract_entities` pulls the named entities out of the message.
        - The intent's tag is looked up in `intent_context_map`; a truthy value
          is stored with `set_context`, anything else resets the context to None.

    Args:   user_input (str): The user's input text.

    Returns:  tuple: A tuple containing (response_text, list_of_entities).
    """
    matched = get_intent(user_input)

    # Guard clause: no intent recognised
    if not matched:
        fallback = "I'm not quite sure what you mean by that. Can you try explaining it in a different way?"
        return fallback, []

    reply = get_response(matched)
    found_entities = extract_entities(user_input)

    # Store the mapped context when there is one, otherwise clear it
    mapped_context = intent_context_map[matched['tag']]
    set_context(mapped_context if mapped_context else None)

    return reply, found_entities

In [18]:
# Testing NLU: interactive loop that ends once the user types "bye"

while True:
    user_input = input("You: ")
    response, entities = handle_input(user_input)
    print("Bot:", response)

    # "bye" (in any casing) still gets its reply above, then the loop stops
    # without printing the diagnostics below
    if user_input.lower() == "bye":
        break

    print('Entities:', entities)
    print("Context:", get_context())
    print('\n'+'='*50+'\n')

You: Hello
Bot: Hello!
Entities: []
Context: None


You: donald trump made america great?
Bot: I'm not quite sure what you mean by that. Can you try explaining it in a different way?
Entities: []
Context: None


You: bye
Bot: Goodbye!


# **Response Generation**