In [None]:
import json
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import os
import shutil

if os.path.exists("datasets"):
    shutil.rmtree("datasets")
    print(f"Folder '{"datasets"}' already existed and was deleted.")

os.makedirs("datasets")
    
    
with open ("datasets/dataset.json", "r") as f:
    data = json.load(f)

features, targets = [], []
classes = set()
responses = dict()

# print(nlp.pipe_names)       
def clean_text(text, nlp):
    doc = nlp(text)
    cleaned_doc = [token.lemma_ for token in doc if not token.is_punct and not token.is_space and token.is_alpha]
    return " ".join(cleaned_doc)

def vectoriser(text, nlp):
    doc = nlp(text)
    return doc.vector

for item in data:
    classes.add(item["intent"])
    for t in item["text"]:
        features.append(t)
        targets.append(item["intent"])
    responses[item["intent"]] = []
    for res in item["responses"]:
        responses[item["intent"]].append(res)

le = LabelEncoder()
labeled_target = le.fit_transform(targets)

vectorize_model = TfidfVectorizer()

df = pd.DataFrame({"target": targets, "feature": features})
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'attribute_ruler', 'ner'])
df["labeled_target"] = labeled_target
df["cleaned_feature"] = df["feature"].apply(lambda text: clean_text(text, nlp))
vectorized_features = df["cleaned_feature"].apply(lambda text: nlp(text).vector)
vectorized_array = np.vstack(vectorized_features.values)  # Stack into a NumPy 2D array
num_features = vectorized_array.shape[1]  # Get number of vector dimensions

# Generate column names dynamically
vector_column_names = [f"vectorised_feature_{i+1}" for i in range(num_features)]

# Convert to DataFrame with correct column names
vectorized_df = pd.DataFrame(vectorized_array, columns=vector_column_names)

# Merge vectorized data into original DataFrame
df = pd.concat([df, vectorized_df], axis=1)

# Save to CSV
df.to_csv("cleaned_vectorized_dataset.csv", index=False)

print("Data processing completed and saved to cleaned_vectorized_dataset.csv")





Data processing completed and saved to cleaned_vectorized_dataset.csv


In [172]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Ensure that df["vectorised_feature"] is a list of lists (convert it to a proper NumPy array)
x = df.drop(columns=["target", "feature", "cleaned_feature"])# Stack the vectors into a 2D NumPy array
y = df["labeled_target"].values  # Convert to NumPy array

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

# Define hyperparameter grid
params = {'alpha': [0.01, 0.5, 0.07, 0.1, 0.5, 1.0, 10.0]}

# # Initialize and fit GridSearchCV
# nb_grid = GridSearchCV(MultinomialNB(), params, n_jobs=-1, cv=5, verbose=5)
# nb_grid.fit(x_train, y_train)
# # Print best parameters
# print('Best parameter value(s): {}'.format(nb_grid.best_params_))
# model = nb_grid.best_estimator_

model = SVC()
model.fit(x_train, y_train,)

In [173]:
from sklearn import metrics


train_preds = model.predict(x_train)
print('Training - Accuracy Score: {}'.format(metrics.accuracy_score(y_train, train_preds)))

val_preds = model.predict(x_test)
print('Training - Accuracy Score: {}'.format(metrics.accuracy_score(y_test, val_preds)))

# Training - Accuracy Score: 0.44
# Training - Accuracy Score: 0.06

Training - Accuracy Score: 0.4473684210526316
Training - Accuracy Score: 0.06896551724137931
