In [1]:
#Sentiment analysis using linear regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

In [2]:
df = pd.read_csv('datasets/Combined Data.csv') 
df.sample(5)

Unnamed: 0,No,statement,status
9009,9009,I feel done. I think I am done.Nothing seems e...,Depression
9995,9995,i have severe depression and my episodes can l...,Suicidal
24282,24282,"as a borderline, I have never had a single dec...",Suicidal
35778,35778,I'm scared of breaking my neck by turning it t...,Anxiety
7679,7679,My grandmother told me you are not the grandda...,Suicidal


In [3]:
df = df.dropna(subset=["statement"])

In [4]:
negative_labels = ['Anxiety', 'Depression', 'Suicidal', 'Stress', 'Bipolar', 'Personality disorder']
positive_labels = ['Normal']
df['sentiment'] = df['status'].apply(lambda x: 'Negative' if x in negative_labels else 'Positive')

In [5]:
def clean_text(statement):
    statement = statement.lower()
    statement = re.sub(r"http\S+|www\S+|https\S+", '', statement, flags=re.MULTILINE)  
    statement = re.sub(r'\@w+|\#', '', statement) 
    statement = re.sub(r'[^A-Za-z\s]', '', statement)  
    statement = re.sub(r'\s+', ' ', statement).strip()  
    return statement

In [6]:
if 'statement' in df.columns:
    df['cleaned_text'] = df['statement'].apply(clean_text)
else:
    print("Column 'text' not found!")

In [7]:
label_encoder = LabelEncoder()
df['encoded_sentiment'] = label_encoder.fit_transform(df['sentiment'])

In [8]:
print("Label Encoding:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))) 

Label Encoding: {'Negative': np.int64(0), 'Positive': np.int64(1)}


In [9]:
df.sample(5)

Unnamed: 0,No,statement,status,sentiment,cleaned_text,encoded_sentiment
35083,35083,would my may2018 ECG of picked this up? worrie...,Anxiety,Negative,would my may ecg of picked this up worried so ...,0
15368,15368,Today I was supposed to get my marks but I end...,Suicidal,Negative,today i was supposed to get my marks but i end...,0
10910,10910,So basically me and my ex broke up in December...,Depression,Negative,so basically me and my ex broke up in december...,0
4343,4343,"hype mamamoo, purple kiss is here. There, crea...",Normal,Positive,hype mamamoo purple kiss is here there create ...,1
43794,43794,completing report and meeting this morning int...,Normal,Positive,completing report and meeting this morning int...,1


Fit panra apo vantha error ( Unable to allocate 29.0 GiB for an array with shape (52681, 74007) )  ithukaaga thaa max_features add panen

In [10]:
vectorizer = TfidfVectorizer(max_features=5000)  
X = vectorizer.fit_transform(df['cleaned_text']).toarray()
y = df['encoded_sentiment']

Since the dateset is imbalanced, we use RandomUnderSampler to balance the dataset

X, y is resampled as X_resampled, y_resampled


In [11]:

from imblearn.under_sampling import RandomUnderSampler

# Initialize the undersampler
rus = RandomUnderSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = rus.fit_resample(X.reshape(-1, 1), y)

# Check the new distribution
print(pd.Series(y_resampled).value_counts())


encoded_sentiment
0    16343
1    16343
Name: count, dtype: int64




In [12]:
print(y.value_counts())

encoded_sentiment
0    36338
1    16343
Name: count, dtype: int64


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

#X , y ku bathulaa X_resampled, y_resampled

In [14]:
model = LinearSVC()

In [15]:
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"Precision: {precision.mean():.2f}")
print(f"Recall:    {recall.mean():.2f}")
print(f"F1 Score:  {f1.mean():.2f}")

Accuracy: 49.57%
Precision: 0.25
Recall:    0.50
F1 Score:  0.33


In [18]:
user_input = input("\nEnter a review statement: ")
user_cleaned = clean_text(user_input)
user_vector = vectorizer.transform([user_cleaned])
print(user_vector.shape)


(1, 5000)


In [19]:
print(model.coef_.shape)


(1, 1)


In [20]:
# If you're taking only the first feature from the 5000 available features
user_vector = user_vector[:, 0].reshape(1, 1)


In [21]:

user_vector = user_vector.reshape(1, 1)
user_prediction = model.predict(user_vector)[0] 
sentiment_label = "Positive" if user_prediction == 1 else "Negative"
print("\nReview Sentiment:", sentiment_label) 


Review Sentiment: Positive
