In [18]:
import pandas as pd

# Load the intents dataset from a JSON file. The file is read into a DataFrame
# whose 'intents' column holds one dict-like entry per intent.
file_path = r'C:\Users\chana\Desktop\sentiment_analysis\data\intents.json'
df = pd.read_json(file_path)

# Flatten every intent into one (text, label) row per pattern.
# Rows are collected in a plain list and the DataFrame is built once at the
# end: calling pd.concat inside the loop copies the whole frame on every
# iteration, which is quadratic in the number of patterns.
rows = []
for intent in df['intents']:
    tag = intent['tag']  # required key; a missing tag raises KeyError
    # 'patterns' is optional: an intent without it contributes no rows.
    rows.extend({'text': pattern, 'label': tag} for pattern in intent.get('patterns', []))

# Passing `columns` keeps the 'text'/'label' schema even when no rows exist.
sentiment_df = pd.DataFrame(rows, columns=['text', 'label'])

# Display the first few rows of the new DataFrame
print(sentiment_df.head())

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the examples for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
_split = train_test_split(
    sentiment_df['text'],
    sentiment_df['label'],
    test_size=0.2,
    random_state=42,
)
train_data, test_data, train_labels, test_labels = _split

# Turn the raw text into TF-IDF feature vectors, ignoring English stop words.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
train_vectors = tfidf_vectorizer.fit_transform(train_data)
test_vectors = tfidf_vectorizer.transform(test_data)

# Train and evaluate models


def _fit_and_report(name, model):
    """Fit *model* on the training vectors and print its test-set metrics.

    Args:
        name: Human-readable model name, used as the prefix of the printed
            accuracy and classification-report lines.
        model: Unfitted classifier exposing ``fit`` and ``predict``.

    Returns:
        A ``(predictions, accuracy)`` tuple computed on the test set.
    """
    model.fit(train_vectors, train_labels)
    predictions = model.predict(test_vectors)
    accuracy = accuracy_score(test_labels, predictions)

    print(name + " Accuracy:", accuracy)
    # zero_division=1 silences the undefined-metric warning for labels that
    # have no true or no predicted samples in the test split; such undefined
    # precision/recall values are then reported as 1.0, so read rows with
    # support 0 with care.
    print(
        name + " Classification Report:\n",
        classification_report(test_labels, predictions, zero_division=1),
    )
    return predictions, accuracy


# Logistic Regression
lr_model = LogisticRegression()
lr_predictions, lr_accuracy = _fit_and_report("Logistic Regression", lr_model)

# Naive Bayes
nb_model = MultinomialNB()
nb_predictions, nb_accuracy = _fit_and_report("Naive Bayes", nb_model)

# Support Vector Machine (SVM)
svm_model = SVC()
svm_predictions, svm_accuracy = _fit_and_report("SVM", svm_model)


import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing; exist_ok avoids failing on re-runs.
os.makedirs('models', exist_ok=True)

# Save the model chosen as "best" (Logistic Regression here; this is a manual
# choice and is not derived from the accuracies computed above).
joblib.dump(lr_model, 'models/best_model.pkl')

# Save the fitted vectorizer alongside the model: new text must be transformed
# with the same vocabulary and IDF weights before it can be classified.
joblib.dump(tfidf_vectorizer, 'models/tfidf_vectorizer.pkl')


           text     label
0         Hello  greeting
1            Hi  greeting
2           Hey  greeting
3     Greetings  greeting
4  Good morning  greeting
Logistic Regression Accuracy: 0.5454545454545454
Logistic Regression Classification Report:
                                precision    recall  f1-score   support

            Adventure stories       0.00      1.00      0.00         0
             American fiction       1.00      1.00      1.00         1
          Body, Mind & Spirit       1.00      1.00      1.00         1
         Business & Economics       1.00      1.00      1.00         1
           Children's stories       1.00      0.00      0.00         2
                    Computers       1.00      1.00      1.00         1
Detective and mystery stories       1.00      1.00      1.00         1
                    Education       1.00      1.00      1.00         1
       Family & Relationships       1.00      1.00      1.00         1
                      Fiction       0.00

['models/tfidf_vectorizer.pkl']