In [2]:
!pip install imbalanced-learn scikit-learn

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
# ---------------------------------------------------------------------------
# Balance the sentiment classes of the labelled dataset with SMOTE and write
# the balanced (text, label) table back to Drive.
#
# Steps: load CSV -> TF-IDF features -> integer labels -> SMOTE ->
#        map features/labels back to text/label strings -> save -> report.
# ---------------------------------------------------------------------------

# Read the sentiment results file (decoded as latin1).
df = pd.read_csv(
    "/content/drive/MyDrive/Dessertation/sentiment_results.csv",
    encoding="latin1",
)

# Show the per-class row counts as loaded, before any resampling.
print("Sentiment distribution before SMOTE:")
print(df['sentiment_label'].value_counts())

# Rows without processed text get '' so the vectorizer only sees strings.
df['processed_text'] = df['processed_text'].fillna('')

# TF-IDF representation of the text, capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['processed_text'])

# Integer-encode the sentiment labels; the encoder is kept so the resampled
# labels can be mapped back to their original strings below.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['sentiment_label'])

# Oversample with SMOTE (fixed seed for reproducibility) to balance classes.
# NOTE(review): resampling is applied to the whole dataset here; if a
# train/test split happens later, confirm this does not leak into the test set.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y_encoded)

# Integer codes -> original sentiment label strings.
y_resampled_labels = label_encoder.inverse_transform(y_resampled)

# Rebuild a text column from the resampled feature matrix: each row's tokens
# (as returned by the vectorizer) are joined with single spaces.
# NOTE(review): presumably inverse_transform only yields the vocabulary terms
# with non-zero weight, so original word order/repeats are not restored --
# confirm this is acceptable for whatever consumes the CSV.
balanced_df = pd.DataFrame({
    'processed_text': [
        ' '.join(tokens)
        for tokens in vectorizer.inverse_transform(X_resampled)
    ],
    'sentiment_label': y_resampled_labels,
})

# Persist the balanced table (no index column).
balanced_df.to_csv(
    "/content/drive/MyDrive/Dessertation/balanced_sentiment_data.csv",
    index=False,
)

# Show the per-class row counts after resampling.
print("\nSentiment distribution after SMOTE:")
print(pd.Series(y_resampled_labels).value_counts())

Sentiment distribution before SMOTE:
sentiment_label
Negative    5382
Positive    3234
Neutral     2583
Name: count, dtype: int64

Sentiment distribution after SMOTE:
Negative    5382
Neutral     5382
Positive    5382
Name: count, dtype: int64
