In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Load preprocessed and SMOTE-balanced data
balanced_df = pd.read_csv("/content/drive/MyDrive/Dessertation/balanced_sentiment_data.csv", encoding="latin1")

# Features and labels
X = balanced_df['processed_text']
y = balanced_df['sentiment_label']

# Handle missing values in 'processed_text' by replacing them with an empty string
# drop rows with missing values instead, use:
# balanced_df.dropna(subset=['processed_text'], inplace=True)
X = X.fillna('')

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, solver='liblinear')
lr_model.fit(X_train, y_train)

# Predictions
y_pred = lr_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"🔹 Logistic Regression Accuracy: {accuracy:.4f}\n")
print("🔹 Classification Report:\n")
print(classification_report(y_test, y_pred))

🔹 Logistic Regression Accuracy: 0.8269

🔹 Classification Report:

              precision    recall  f1-score   support

    Negative       0.87      0.74      0.80      1076
     Neutral       0.79      0.91      0.85      1077
    Positive       0.83      0.83      0.83      1077

    accuracy                           0.83      3230
   macro avg       0.83      0.83      0.83      3230
weighted avg       0.83      0.83      0.83      3230

