In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Dropout, Conv1D, MaxPooling1D, Activation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, BertTokenizerFast, BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer, pipeline
import pandas as pd
import random
from matplotlib import pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import missingno as msno
import json
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')




In [None]:
import json
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Load JSON file
def load_json_file(filename):
    with open(filename) as f:
        file = json.load(f)
    return file

# Create a DataFrame
def create_df():
    return pd.DataFrame({'Pattern': [], 'Tag': []})

# Extract information from JSON and populate the DataFrame
def extract_json_info(json_file, df):
    for intent in json_file['intents']:
        for pattern in intent['patterns']:
            df.loc[len(df.index)] = [pattern, intent['tag']]
    return df

# Preprocess text data
def preprocess_pattern(pattern, stemmer, ignore_words):
    words = word_tokenize(pattern.lower())
    stemmed_words = [stemmer.stem(word) for word in words if word not in ignore_words]
    return " ".join(stemmed_words)

# Load intents JSON
filename = 'intents_exp.json'
intents = load_json_file(filename)

# Initialize DataFrame and extract patterns and tags
df = create_df()
df = extract_json_info(intents, df)

# Initialize stemmer and ignore list
stemmer = PorterStemmer()
ignore_words = stopwords.words('english') + ['?', '!', ',', '.']

# Preprocess patterns
df['Pattern'] = df['Pattern'].apply(lambda x: preprocess_pattern(x, stemmer, ignore_words))

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Load stopwords
stop_words = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Contraction map
CONTRACTIONS = {
    "don't": "do not",
    "can't": "cannot",
    "i'm": "i am",
    "it's": "it is",
    "you're": "you are",
    "isn't": "is not",
    # Add more contractions as needed
}

def clean_text(text):
    """
    Clean and normalize text.
    - Expands contractions
    - Removes special characters, numbers, and punctuations
    - Converts to lowercase
    """
    # Expand contractions
    for contraction, expansion in CONTRACTIONS.items():
        text = re.sub(r'\b' + contraction + r'\b', expansion, text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower().strip()
    return text

def remove_stopwords(tokens):
    """
    Removes stopwords from a list of tokens.
    """
    return [word for word in tokens if word not in stop_words]

def lemmatize_tokens(tokens):
    """
    Lemmatizes tokens to their base form.
    """
    return [lemmatizer.lemmatize(word) for word in tokens]

def preprocess_text(text):
    """
    Full preprocessing pipeline for a single text input.
    """
    # Step 1: Clean text
    cleaned_text = clean_text(text)
    
    # Step 2: Tokenize
    tokens = word_tokenize(cleaned_text)
    
    # Step 3: Remove stopwords
    tokens = remove_stopwords(tokens)
    
    # Step 4: Lemmatize tokens
    lemmatized_tokens = lemmatize_tokens(tokens)
    
    # Step 5: Join tokens back to a single string
    return " ".join(lemmatized_tokens)

# Apply the preprocessing pipeline to the DataFrame
df['Processed_Pattern'] = df['Pattern'].apply(preprocess_text)

# Optional: Extract additional features for analysis (e.g., sentence length)
df['Length'] = df['Pattern'].apply(lambda x: len(x.split()))


# Prepare labels
labels = df['Tag'].unique().tolist()
labels = [label.strip() for label in labels]

# Map labels to integers
label2id = {label: id for id, label in enumerate(labels)}
id2label = {id: label for id, label in enumerate(labels)}
df['labels'] = df['Tag'].map(lambda x: label2id[x.strip()])

# Prepare data for training
X = list(df['Processed_Pattern'])
y = list(df['labels'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
max_len = 100
X_train_pad = pad_sequences(X_train_seq, padding='post', maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, padding='post', maxlen=max_len)

# Build the model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(labels), activation='softmax')
])

# Compile the model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

# Train the model
history = model.fit(
    X_train_pad, np.array(y_train),
    validation_data=(X_test_pad, np.array(y_test)),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping, lr_scheduler]
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, np.array(y_test))
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Save the model
model.save("intent_classifier_model.h5")

# Example prediction
def predict_intent(text, tokenizer, model, label_map):
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, padding='post', maxlen=max_len)
    pred = model.predict(padded)
    return label_map[np.argmax(pred)]

example_text = "How can I reset my password?"
predicted_label = predict_intent(example_text, tokenizer, model, id2label)
print(f"Predicted Intent: {predicted_label}")


Epoch 1/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 97ms/step - accuracy: 0.0137 - loss: 3.6392 - val_accuracy: 0.0432 - val_loss: 3.6254 - learning_rate: 0.0010
Epoch 2/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.0481 - loss: 3.6183 - val_accuracy: 0.0719 - val_loss: 3.5971 - learning_rate: 0.0010
Epoch 3/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.0614 - loss: 3.5864 - val_accuracy: 0.0432 - val_loss: 3.5663 - learning_rate: 0.0010
Epoch 4/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.0561 - loss: 3.5989 - val_accuracy: 0.0432 - val_loss: 3.5401 - learning_rate: 0.0010
Epoch 5/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.0646 - loss: 3.5213 - val_accuracy: 0.0647 - val_loss: 3.3837 - learning_rate: 0.0010
Epoch 6/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 



Test Loss: 1.8581576347351074
Test Accuracy: 0.6187050342559814
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 412ms/step
Predicted Intent: goodbye
