In [3]:
import random
import numpy as np
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
import pickle


In [4]:
# Shared WordNet lemmatizer instance; later cells call `lemmetizer.lemmatize(...)`
# to reduce tokens to their base form when building the vocabulary and the
# bag-of-words rows.
# NOTE(review): WordNetLemmatizer needs the NLTK "wordnet" corpus installed
# locally, and no visible cell calls nltk.download - confirm the environment
# provides it.
lemmetizer = WordNetLemmatizer()

In [5]:
# Location of the raw chatbot dataset.
# NOTE(review): absolute, machine-specific path - consider a configurable data dir.
chatbot_csv_path = r'C:\projects\internship_project\chatbot_data.csv'

# Read the three text columns as strings; adjust column names and types as needed.
chatbot_column_dtypes = {'categories': str, 'title': str, 'abstract': str}

chat_df = pd.read_csv(chatbot_csv_path, dtype=chatbot_column_dtypes, low_memory=False)

In [6]:
# Accumulators filled by the tokenization cell:
#   words     - every token seen in the titles (later reduced to the vocabulary)
#   classes   - distinct category labels
#   documents - (token_list, category) pairs, one per row
words, classes, documents = [], [], []

# Punctuation tokens that are dropped when the vocabulary is built.
ignore_letters = list("?!.,")

In [7]:
# Tokenize every title and pair its tokens with the row's category.
#
# Iterating the two columns directly avoids the per-row Series construction of
# DataFrame.iterrows(), and a set gives O(1) "already seen" checks while
# `classes` keeps first-seen order.
seen_classes = set(classes)

for category, pattern in zip(chat_df['categories'], chat_df['title']):
    # Missing cells are read as NaN (a float) even with dtype=str.
    # word_tokenize() requires a string, and a NaN label would later break
    # sorted(set(classes)), so incomplete rows are skipped.
    if not isinstance(category, str) or not isinstance(pattern, str):
        continue

    word_list = nltk.word_tokenize(pattern)
    words.extend(word_list)
    documents.append((word_list, category))

    if category not in seen_classes:
        seen_classes.add(category)
        classes.append(category)

In [8]:
# Build the final vocabulary: drop punctuation tokens, lowercase, lemmatize,
# de-duplicate and sort.
#
# Lowercasing matters: the bag-of-words cell lowercases each token before
# lemmatizing it, so the vocabulary must be normalised the same way or every
# capitalised entry becomes a feature that can never match.
words = sorted({
    lemmetizer.lemmatize(word.lower())
    for word in words
    if word not in ignore_letters
})

# Class labels are kept as-is, just de-duplicated and sorted for a stable order.
classes = sorted(set(classes))

In [9]:
# Persist the vocabulary and class labels to the working directory.
# Context managers guarantee the files are flushed and closed even if
# pickling fails; the original left both handles open.
with open("words.pkl", "wb") as words_file:
    pickle.dump(words, words_file)

with open("classes.pkl", "wb") as classes_file:
    pickle.dump(classes, classes_file)

In [10]:
# Collect one [category, pattern, response] triple per dataset row, taking the
# values from the 'categories', 'title' and 'abstract' columns respectively.
intent_data = [
    [category, pattern, response]
    for category, pattern, response in zip(
        chat_df['categories'], chat_df['title'], chat_df['abstract']
    )
]

In [11]:
# Turn the collected triples into a table and export it as CSV (no index column).
intent_columns = ["category", "pattern", "response"]
intent_df = pd.DataFrame(intent_data, columns=intent_columns)

intent_csv_path = r"C:\projects\internship_project\chatbot_intents.csv"
intent_df.to_csv(intent_csv_path, index=False)

print("Intent data saved to chatbot_intents.csv")

Intent data saved to chatbot_intents.csv


In [None]:
# Build the training matrix: one row per document, made of a bag-of-words
# vector over `words` followed by a one-hot vector over `classes`.
training = []
output_empty = [0] * len(classes)

# Label -> column position, so each row needs a dict lookup instead of a
# linear classes.index() scan.
class_position = {label: position for position, label in enumerate(classes)}

for token_list, label in documents:
    # A set makes the per-vocabulary-word membership test O(1).
    lemmas = {lemmetizer.lemmatize(token.lower()) for token in token_list}

    bag = [1 if word in lemmas else 0 for word in words]

    output_row = list(output_empty)
    output_row[class_position[label]] = 1

    training.append(bag + output_row)

In [None]:
# Shuffle the rows and split them into training and held-out test sets.
#
# The evaluation cells further down use `testX` / `testY`, so the hold-out
# split has to be created here, before the model is trained.
SPLIT_SEED = 42        # fixed seed so the split is reproducible
TEST_FRACTION = 0.2    # share of rows reserved for evaluation

# np.array() copies, and rows are reordered by indexing with a permutation.
# random.shuffle() on a 2-D ndarray (which is what `training` is when the cell
# is re-run) swaps row views and silently duplicates rows.
training = np.array(training)
row_order = np.random.default_rng(SPLIT_SEED).permutation(len(training))
training = training[row_order]

num_test_rows = int(round(len(training) * TEST_FRACTION))
split_at = len(training) - num_test_rows

# Each row is [bag-of-words | one-hot label]; the first len(words) columns are features.
num_features = len(words)

trainX = training[:split_at, :num_features]
trainY = training[:split_at, num_features:]
testX = training[split_at:, :num_features]
testY = training[split_at:, num_features:]

In [None]:
# Feed-forward classifier: two ReLU hidden layers (64 and 32 units), each
# followed by 50% dropout, and a softmax output with one unit per label column.
num_inputs = len(trainX[0])
num_outputs = len(trainY[0])

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, input_shape=(num_inputs,), activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_outputs, activation="softmax"),
])

# SGD with Nesterov momentum; labels are one-hot, hence categorical cross-entropy.
sgd = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])

features = np.array(trainX)
targets = np.array(trainY)
hist = model.fit(features, targets, epochs=100, batch_size=32, verbose=1)

# Save the trained network to the working directory.
model.save("chatbot_model.h5")
print("Model training complete and saved!")

In [None]:
# Score the trained model on the test arrays.
# Expects `testX` / `testY` to have been created by an earlier cell.
evaluation = model.evaluate(np.array(testX), np.array(testY), verbose=1)
loss, accuracy = evaluation
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Recompute test accuracy by hand: take the arg-max class of each prediction
# and of each one-hot label row, then measure the fraction that agree.
predictions = model.predict(np.array(testX))

true_classes = np.argmax(testY, axis=1)
predicted_classes = np.argmax(predictions, axis=1)

matches = predicted_classes == true_classes
accuracy = np.mean(matches)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Per-class precision / recall / F1 for the predictions computed in the
# previous cell (classes are reported by their integer index).
from sklearn.metrics import classification_report

report_text = classification_report(true_classes, predicted_classes)
print(report_text)
