In [1]:
#Sentiment analysis using Random Forest Classifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,  precision_recall_fscore_support

In [2]:
df = pd.read_csv('datasets/Combined Data.csv')
print(df.columns)
df = df.dropna(subset=["statement"])

Index(['No', 'statement', 'status'], dtype='object')


In [3]:
negative_labels = ['Anxiety', 'Depression', 'Suicidal', 'Stress', 'Bipolar', 'Personality disorder']
positive_labels = ['Normal']
df['sentiment'] = df['status'].apply(lambda x: 'Negative' if x in negative_labels else 'Positive')

In [4]:
def clean_text(statement):
    statement = statement.lower()
    statement = re.sub(r"http\S+|www\S+|https\S+", '', statement, flags=re.MULTILINE)  
    statement = re.sub(r'\@w+|\#', '', statement) 
    statement = re.sub(r'[^A-Za-z\s]', '', statement)  
    statement = re.sub(r'\s+', ' ', statement).strip()  
    return statement

In [5]:
if 'statement' in df.columns:
    df['cleaned_text'] = df['statement'].apply(clean_text)
else:
    print("Column 'text' not found!")

In [6]:
label_encoder = LabelEncoder()
df['encoded_sentiment'] = label_encoder.fit_transform(df['sentiment'])

In [7]:
print("Label Encoding:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

Label Encoding: {'Negative': np.int64(0), 'Positive': np.int64(1)}


In [8]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['encoded_sentiment']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model = RandomForestClassifier()

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred)

In [None]:
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"Precision: {precision.mean():.2f}")
print(f"Recall:    {recall.mean():.2f}")
print(f"F1 Score:  {f1.mean():.2f}")

Accuracy: 93.85%
Precision: 0.93
Recall:    0.93
F1 Score:  0.93


In [None]:
user_input = input("\nEnter a review statement: ")
user_cleaned = clean_text(user_input)
user_vector = vectorizer.transform([user_cleaned])
user_prediction = model.predict(user_vector)[0]
sentiment_label = "Positive" if user_prediction == 1 else "Negative"
print("\nReview Sentiment:", sentiment_label)


Review Sentiment: Negative
