## Applying Sentiment Analysis to the "Sentiment Analysis for Mental Health" Dataset

In [2]:
# Step 1: Load the Dataset
import pandas as pd

# Load the dataset
df = pd.read_csv(r'C:\Users\User\Documents\Brain Station\Data Science\Capstone\capstone_social_listening_GC\data\Mental Health\Combined Data.csv',index_col=0)

# Display the first few rows of the dataset
print(df.head())

                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety


In [4]:
# Step 2: Text Preprocessing
# To not count the attributeError: 'float' object has no attribute 'split', suggests that some of the entries in the statement column are not strings but float types. Which typically represent missing or null values.
# To fix it we will need to handle this issue by first converting all entries in the statement column to strings and then applying the split function.
# Filter the rows where 'statement' is a float
#float_values = df[df['statement'].apply(lambda x: isinstance(x, float))]

# Check if all the float values are NaN
#all_nan = float_values['statement'].isna().all()

# Print the result
#print("Are all float values NaN?:", all_nan)

# Convert all entries in the 'statement' column to strings
df['statement'] = df['statement'].astype(str)

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Example of text cleaning function
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    words = word_tokenize(text)  # Tokenize the text
    #words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

# Apply the text preprocessing function
df['cleaned_text'] = df['statement'].apply(preprocess_text)
df[['cleaned_text','status']]

Unnamed: 0,cleaned_text,status
0,oh my gosh,Anxiety
1,trouble sleeping confused mind restless heart ...,Anxiety
2,all wrong back off dear forward doubt stay in ...,Anxiety
3,i ve shifted my focus to something else but i ...,Anxiety
4,i m restless and restless it s been a month no...,Anxiety
...,...,...
53038,nobody takes me seriously i ve 24m dealt with ...,Anxiety
53039,selfishness i don t feel very good it s like i...,Anxiety
53040,is there any way to sleep better i can t sleep...,Anxiety
53041,public speaking tips hi all i have to give a p...,Anxiety


In [6]:
# Step 3: Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
X

<53043x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3094150 stored elements in Compressed Sparse Row format>

In [8]:
# Step 4: Model Training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming 'sentiment' is the label column
X_train, X_test, y_train, y_test = train_test_split(X, df['status'], test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                      precision    recall  f1-score   support

             Anxiety       0.81      0.77      0.79       779
             Bipolar       0.90      0.69      0.78       580
          Depression       0.71      0.73      0.72      3100
              Normal       0.87      0.96      0.91      3327
Personality disorder       0.65      0.52      0.58       248
              Stress       0.75      0.46      0.57       557
            Suicidal       0.68      0.67      0.67      2018

            accuracy                           0.77     10609
           macro avg       0.76      0.69      0.72     10609
        weighted avg       0.77      0.77      0.77     10609



In [None]:
# Step 5: Sentiment Prediction (Apply the trained model to new or unseen data to predict sentiment.)

In [None]:
# Step 6: Analysis and Insights (Visualize Results: Use visualizations like bar charts or word clouds to present the results of your sentiment analysis. Trend Analysis: If your data includes timestamps, analyze how sentiment trends over time.)