In [141]:
import json
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
with open ("dataset.json", "r") as f:
    data = json.load(f)

features, targets = [], []
classes = set()
responses = dict()

# print(nlp.pipe_names)       
def clean_text(text, nlp):
    doc = nlp(text)
    cleaned_doc = [token.lemma_ for token in doc if not token.is_punct and not token.is_space and token.is_alpha]
    return " ".join(cleaned_doc)

for item in data:
    classes.add(item["intent"])
    for t in item["text"]:
        features.append(t)
        targets.append(item["intent"])
    responses[item["intent"]] = []
    for res in item["responses"]:
        responses[item["intent"]].append(res)

le = LabelEncoder()
labeled_target = le.fit_transform(targets)
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'attribute_ruler', 'ner'])
vectorise_model = TfidfVectorizer( lowercase=True, ngram_range=(1,2))

df = pd.DataFrame({"target": targets, "feature": features})
df["labeled_target"] = labeled_target
df["cleaned_feature"] = df["feature"].apply(lambda text: clean_text(text, nlp))
vectorised_text = vectorise_model.fit_transform(df["cleaned_feature"])
vectorised_text = vectorised_text.toarray()
num_vectorised_features = vectorised_text.shape[1]
vectorised_columns = [f"vectorised_feature_{i+1}" for i in range(num_vectorised_features)]
vectorised_df = pd.DataFrame(vectorised_text, columns=vectorised_columns)
df = pd.concat([df, vectorised_df], axis=1)
df.to_csv("cleaned_dataset.csv", index=False)




In [142]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB

# Ensure that df["vectorised_feature"] is a list of lists (convert it to a proper NumPy array)
x = df.drop(columns=["target", "feature", "cleaned_feature"])# Stack the vectors into a 2D NumPy array
y = df["labeled_target"].values  # Convert to NumPy array

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

# Define hyperparameter grid
params = {'alpha': [0.01, 0.5, 0.07, 0.1, 0.5, 1.0, 10.0]}

# Initialize and fit GridSearchCV
nb_grid = GridSearchCV(MultinomialNB(), params, n_jobs=-1, cv=5, verbose=5)
nb_grid.fit(x_train, y_train)
# Print best parameters
print('Best parameter value(s): {}'.format(nb_grid.best_params_))
model = nb_grid.best_estimator_




Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best parameter value(s): {'alpha': 0.01}


In [None]:
from sklearn import metrics


train_preds = model.predict(x_train)
print('Training - Accuracy Score: {}'.format(metrics.accuracy_score(y_train, train_preds)))

val_preds = model.predict(x_test)
print('Training - Accuracy Score: {}'.format(metrics.accuracy_score(y_test, val_preds)))

# Training - Accuracy Score: 1.0
# Training - Accuracy Score: 0.8620689655172413

Training - Accuracy Score: 1.0
Training - Accuracy Score: 0.8620689655172413
