# Preprocessing the dataset

In [42]:
import json
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer
from textblob import TextBlob

# Load data from file
with open('mental_health_conversational_dataset.json') as f:
    data = json.load(f)

# Preprocess each intent
for intent in data['intents']:
    # Get patterns for the intent
    patterns = [pattern.lower() for pattern in intent['patterns']]
    
    # Tokenization
    tokenized_patterns = [word_tokenize(pattern) for pattern in patterns]
    
    # Removing punctuation
    table = str.maketrans('', '', punctuation)
    patterns_no_punct = [[word.translate(table) for word in pattern] for pattern in tokenized_patterns]
    
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    patterns_no_stop = [[word for word in pattern if word not in stop_words] for pattern in patterns_no_punct]
    
    # Stemming
    stemmer = PorterStemmer()
    patterns_stemmed = [[stemmer.stem(word) for word in pattern] for pattern in patterns_no_stop]
    
    # Removing HTML tags
    patterns_no_html = [[re.sub(r'<[^>]+>', '', word) for word in pattern] for pattern in patterns_stemmed]
    
    # Lowercasing
    patterns_lower = [[word.lower() for word in pattern] for pattern in patterns_no_html]
    
    # Removing numbers
    patterns_no_num = [[re.sub(r'\d+', '', word) for word in pattern] for pattern in patterns_lower]
    
    # Removing special characters
    patterns_no_spec = [[re.sub(r'\W+', '', word) for word in pattern] for pattern in patterns_no_num]
    
    # Removing URLs
    patterns_no_url = [[re.sub(r'http\S+', '', word) for word in pattern] for pattern in patterns_no_spec]
    
    # Spell correction
    patterns_corrected = [[str(TextBlob(word).correct()) for word in pattern] for pattern in patterns_no_url]
    
    # Update patterns in the intent
    intent['patterns'] = patterns_corrected
    
    # preprocess of responses are quite not required as my dataset are optimized 
    responses = intent['responses']
    
#print(data)

#save preprocessed data to file 
with open('preprocessed_dataset.json','w')as f:
    json.dump(data,f)

{'intents': [{'intent': 'greeting', 'patterns': [['hi', ''], ['hello', ''], ['hey', ''], ['good', 'more', ''], ['good', 'afternoon', ''], ['greet', ''], [''], ['how', '']], 'responses': ['Hi! How can I help you today?', 'Hello there! How are you feeling?', 'Hey! How can I assist you today?', 'Good morning! How can I be of service?', 'Good afternoon! How are you doing?', 'Greetings! How can I support you today?', "What's up? How can I assist you?", 'Howdy! How can I help you feel better?']}, {'intent': 'help', 'patterns': [['help', ''], ['need', 'help', ''], ['help', ''], [''], ['assist', ''], ['struggle', '', 'help', '']], 'responses': ["Sure, I'm here to help. What can I do for you?", 'Of course, what do you need help with?', "I'll do my best to assist you. What do you need help with?", "Yes, I can help. What's the problem?", 'Absolutely, how can I be of assistance?', "I'm here to help. What's going on?"]}, {'intent': 'goodbye', 'patterns': [['goodby', ''], ['see', 'later', ''], ['bye

### Preprocessing of the dataset has been done

# Analsying the Sentiment of the preprocessed dataset

In [3]:
!pip install nltk



### Algorithm used to sentiment analyses of the patterns

The sentiment analysis algorithm used in this code is the Sentiment Intensity Analyzer (SIA) from the Natural Language Toolkit (NLTK) library based on a machine learning approach in Python. The SIA algorithm uses a lexicon-based approach to calculate the sentiment of a piece of text by assigning a score to each word in the text based on its polarity (positive, negative, or neutral) and intensity (strongly positive or negative, or weakly positive or negative). The scores of individual words are then combined to calculate an overall sentiment score for the text.

However, it is not based on deep learning or neural networks, but rather a rule-based system.

The SentimentIntensityAnalyzer (SIA) from the NLTK library is . 

In [45]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

# Convert data.json to a pandas dataframe
with open('preprocessed_dataset.json') as f:
    data = json.load(f)
rows = []
for intent in data['intents']:
    for pattern in intent['patterns']:
        rows.append({
            'intent': intent['intent'],
            'pattern': ' '.join(pattern),
            'response': ' '.join(intent['responses'])
        })
df = pd.DataFrame(rows)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define a function to calculate sentiment score for each pattern
# Define a function to calculate sentiment score for each pattern
def calculate_sentiment_score(pattern):
    if "end life" in pattern or "feel reason continue live" in pattern or "difficult live" in pattern or "can't take it anymore" in pattern or "die" in pattern or "don't want to live" in pattern or "hurt others" in pattern or "kill myself" in pattern or "end life" in pattern or "leave earth" in pattern or "run away" in pattern or "suicide" in pattern or "suicidal thoughts" in pattern:
        return 1.0
    else:
        return sia.polarity_scores(pattern)['compound']


# Apply sentiment analysis to each pattern and store the scores in a new column
df['sentiment_score'] = df['pattern'].apply(calculate_sentiment_score)

# Group patterns by intent and calculate average sentiment score
sentiment_by_intent = df.groupby('intent')['sentiment_score'].mean()

# Print the results
print(sentiment_by_intent)


intent
Anxiety                           -0.000737
Bipolar Disorder                  -0.232924
Depression                        -0.281847
Depression_conversations          -0.125777
PTSD                              -0.059121
PTSD_symptoms                     -0.417664
anxiety_symptoms                  -0.368192
bipolar_disorder_symptoms          0.128810
depression_symptoms                0.364385
disdisruptive_behavior_disorder   -0.348436
eating_disorder                   -0.112400
goodbye                            0.148544
greeting                           0.149875
help                               0.218067
mental_health_concern             -0.222845
neurodevelopmental_disorder       -0.203400
sad                               -0.211090
schizophrenia                      0.022079
thanks                             0.306589
Name: sentiment_score, dtype: float64


In [38]:
# Print the results
#print(df[['intent', 'pattern', 'sentiment_score']])

In [44]:
# Convert data.json to a pandas dataframe
with open('preprocessed.json') as f:
    data = json.load(f)
rows = []
for intent in data['intents']:
    pattern_list = []
    for pattern in intent['patterns']:
        sentiment_score = calculate_sentiment_score(' '.join(pattern))
        pattern_dict = {"pattern": ' '.join(pattern), "sentiment_score": sentiment_score}
        pattern_list.append(pattern_dict)
    rows.append({
        'intent': intent['intent'],
        'patterns': pattern_list,
        'responses': intent['responses']
    })
df = pd.DataFrame(rows)

# Write updated data to json file
with open('sentiment_analysed_dataset.json', 'w') as f:
    json.dump({'intents': rows}, f, indent=4)
