In [2]:
import pandas as pd
import numpy as np
import joblib
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from scipy.sparse import hstack, csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown for
# the rest of the session.
# NOTE(review): this also hides deprecation notices coming from the libraries
# imported above — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

In [3]:
# Read the review table that the following cells pull the text column,
# the numeric columns and the target from.
df = pd.read_csv(
    'processed_reviews_with_features.csv',
)

# Quick sanity check on the number of rows / columns that were read.
print(f"Dataset shape: {df.shape}")

Dataset shape: (4915, 27)


In [4]:
# Carve the frame into the three modelling inputs: the review text, the
# numeric side-features, and the target column.
X_text = df['cleaned_text']

meta_columns = [
    'polarity',
    'subjectivity',
    'word_count',
    'char_count',
    'helpful_ratio',
    'wilson_lower_bound',
    'score_average_rating',
    'exclamation_count',
    'question_count',
    'uppercase_ratio',
]
X_meta = df[meta_columns]

y = df['sentiment']

In [5]:
# Restore the TF-IDF vectorizer and the label encoding object from disk.
# joblib.load unpickles arbitrary Python objects — only load files you trust.
tfidf, label_encoder = (
    joblib.load(artifact_path)
    for artifact_path in ('tfidf_vectorizer.joblib', 'label_encoder.joblib')
)

In [6]:
# Replace missing reviews with empty strings, write the cleaned column back
# to the frame, then vectorise it with the already-loaded TF-IDF model.
X_text = df['cleaned_text'].fillna("")
df['cleaned_text'] = X_text
X_text_tfidf = tfidf.transform(X_text)

In [7]:
# Standardise the numeric side-features column by column.
# NOTE(review): the scaler is fitted on every row, including the rows that the
# later train/test split assigns to the test set, so test-set statistics leak
# into the training features. Fit on the training rows only if a strict
# hold-out estimate is required.
scaler = StandardScaler()
X_meta_scaled = scaler.fit(X_meta).transform(X_meta)

In [8]:
# Place the scaled numeric columns to the right of the TF-IDF block so every
# row carries both kinds of features in a single sparse matrix.
meta_sparse = csr_matrix(X_meta_scaled)
X_combined = hstack([X_text_tfidf, meta_sparse])

In [9]:
# Encode labels
# `label_encoder` comes from a joblib file, so its concrete type is not visible
# here. Series.map only accepts a mapping / callable; a fitted scikit-learn
# LabelEncoder exposes `.transform` instead, so support both.
if hasattr(label_encoder, 'transform'):
    y_encoded = pd.Series(label_encoder.transform(y), index=y.index, name=y.name)
else:
    y_encoded = y.map(label_encoder)

# Series.map silently produces NaN for any label missing from the mapping;
# fail here with a readable message instead of deep inside the split or
# np.bincount calls further down.
_unencoded = y_encoded.isna()
if _unencoded.any():
    raise ValueError(
        f"Labels without an encoding: {sorted(y[_unencoded].astype(str).unique())}"
    )

In [10]:
# Hold out 20% of the rows for testing; stratifying on the encoded labels
# keeps the class proportions the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y_encoded,
    **split_options,
)

In [11]:
# Rebalance the classes of the training split only; the test split is left
# untouched so it keeps its original class mix.
from imblearn.combine import SMOTETomek

smote = SMOTETomek(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Report the resulting shapes and per-class counts, one line each.
print(
    f"Training set shape: {X_train_bal.shape}",
    f"Test set shape: {X_test.shape}",
    f"Class distribution - Train: {np.bincount(y_train_bal)}",
    f"Class distribution - Test: {np.bincount(y_test)}",
    sep="\n",
)


Training set shape: (10677, 3010)
Test set shape: (983, 3010)
Class distribution - Train: [3559 3559 3559]
Class distribution - Test: [ 65  28 890]


In [12]:
# Model 1: Ensemble of Traditional ML Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Section banner: a blank line, a 50-character rule, the title, and the rule again.
rule = "="*50
print("\n" + rule)
print("ENSEMBLE MODEL TRAINING")
print(rule)


ENSEMBLE MODEL TRAINING


In [13]:
# Define individual models
# Five candidate classifiers, keyed by display name. All are seeded with
# random_state=42. Every model except XGBoost is additionally given
# class_weight='balanced'.
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    ),
    'XGBoost': XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        # `use_label_encoder` is intentionally not passed: the argument is
        # deprecated in current XGBoost releases and only triggers an
        # "unused parameter" warning. Labels are already integer-encoded.
        eval_metric='mlogloss',
        random_state=42
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=200,
        max_depth=7,
        learning_rate=0.05,
        num_leaves=31,
        class_weight='balanced',
        random_state=42
    ),
    'SVM': SVC(
        kernel='rbf',
        C=1.0,
        # probability=True is required because the evaluation loop calls
        # predict_proba on every model.
        probability=True,
        class_weight='balanced',
        random_state=42
    ),
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        C=0.1,
        random_state=42
    )
}

In [14]:
# Train and evaluate individual models
# Each model is fitted on the rebalanced training data and scored on the
# untouched test split. `results` maps the display name to a dict holding the
# weighted metrics and the fitted estimator itself.
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    # Calculate metrics. A single call yields support-weighted precision,
    # recall and F1; zero_division=0 makes the score for never-predicted
    # classes explicit instead of relying on the warn-and-zero default.
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )

    # For multi-class ROC-AUC (One-vs-Rest). Only the ValueError that
    # roc_auc_score raises for unusable inputs is tolerated; anything else
    # (including KeyboardInterrupt) propagates instead of being swallowed.
    try:
        roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
    except ValueError as exc:
        print(f"  ROC-AUC could not be computed: {exc}")
        roc_auc = np.nan

    results[name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc': roc_auc,
        'model': model
    }

    print(f"{name} Results:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  ROC-AUC: {roc_auc:.4f}")

    # Detailed classification report (per-class view; the weighted averages
    # above are dominated by the majority class).
    print("\n  Classification Report:")
    print(classification_report(y_test, y_pred,
                                target_names=['negative', 'neutral', 'positive'],
                                zero_division=0))



Training Random Forest...
Random Forest Results:
  Accuracy: 0.9105
  F1-Score: 0.8971
  Precision: 0.8841
  Recall: 0.9105
  ROC-AUC: 0.8548

  Classification Report:
              precision    recall  f1-score   support

    negative       0.50      0.51      0.50        65
     neutral       0.00      0.00      0.00        28
    positive       0.94      0.97      0.95       890

    accuracy                           0.91       983
   macro avg       0.48      0.49      0.49       983
weighted avg       0.88      0.91      0.90       983


Training XGBoost...
XGBoost Results:
  Accuracy: 0.9207
  F1-Score: 0.9062
  Precision: 0.8933
  Recall: 0.9207
  ROC-AUC: 0.8952

  Classification Report:
              precision    recall  f1-score   support

    negative       0.62      0.49      0.55        65
     neutral       0.00      0.00      0.00        28
    positive       0.94      0.98      0.96       890

    accuracy                           0.92       983
   macro avg       0.

In [15]:
# Combine the three tree-based models into one soft-voting classifier,
# fit it on the rebalanced training data and score the test split.
rule = "="*50
print("\n" + rule)
print("CREATING VOTING ENSEMBLE")
print(rule)

voting_members = [
    (alias, models[display_name])
    for alias, display_name in (
        ('rf', 'Random Forest'),
        ('xgb', 'XGBoost'),
        ('lgbm', 'LightGBM'),
    )
]
ensemble = VotingClassifier(estimators=voting_members, voting='soft')

ensemble.fit(X_train_bal, y_train_bal)

# Hard predictions for the reports, probabilities kept for later use.
y_pred_ensemble = ensemble.predict(X_test)
y_pred_proba_ensemble = ensemble.predict_proba(X_test)


CREATING VOTING ENSEMBLE
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.130708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152617
[LightGBM] [Info] Number of data points in the train set: 10677, number of used features: 2314
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [16]:
# Score the voting ensemble on the test split: accuracy, weighted F1 and the
# per-class report.
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
f1_ensemble = f1_score(y_test, y_pred_ensemble, average='weighted')
ensemble_report = classification_report(
    y_test,
    y_pred_ensemble,
    target_names=['negative', 'neutral', 'positive'],
)

print(f"Ensemble Results:")
print(f"  Accuracy: {accuracy_ensemble:.4f}")
print(f"  F1-Score: {f1_ensemble:.4f}")
print("\n  Classification Report:")
print(ensemble_report)

Ensemble Results:
  Accuracy: 0.9278
  F1-Score: 0.9127

  Classification Report:
              precision    recall  f1-score   support

    negative       0.68      0.52      0.59        65
     neutral       0.00      0.00      0.00        28
    positive       0.94      0.99      0.96       890

    accuracy                           0.93       983
   macro avg       0.54      0.50      0.52       983
weighted avg       0.90      0.93      0.91       983



In [17]:
# Persist the voting ensemble together with the scaler that produced its
# numeric input columns.
for _artifact, _target_file in (
    (ensemble, 'ensemble_sentiment_model.joblib'),
    (scaler, 'feature_scaler.joblib'),
):
    joblib.dump(_artifact, _target_file)

print("\nModels saved successfully!")


Models saved successfully!


In [18]:
# Model 2: Deep Learning with Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Section banner: a blank line, a 50-character rule, the title, and the rule again.
rule = "="*50
print("\n" + rule)
print("DEEP LEARNING MODEL TRAINING")
print(rule)


DEEP LEARNING MODEL TRAINING


In [19]:
# Split the raw cleaned text for the neural model. The call uses the same
# test_size, random_state and stratify arguments as the split of the
# combined feature matrix.
X_train_text, X_test_text, y_train_dl, y_test_dl = train_test_split(
    df['cleaned_text'],
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)


In [20]:
# Build the vocabulary from the training texts only (capped via num_words,
# with '<OOV>' standing in for unseen words), then convert both splits to
# integer sequences with that same tokenizer.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_text)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_text, X_test_text)
)


In [21]:
# Bring every sequence to a fixed length of `max_len` tokens; shorter
# sequences are padded at the end ('post').
max_len = 200
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)

In [22]:
# Split the scaled numeric features for the neural model. The arguments
# (test_size, random_state, stratify) repeat those of the text split above.
# NOTE(review): row alignment with the text split relies on both calls using
# identical arguments — splitting text and meta in one call would make that
# explicit.
meta_split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train_meta, X_test_meta = train_test_split(X_meta_scaled, **meta_split_options)

In [23]:
# Build hybrid deep learning model
def build_hybrid_model(vocab_size, max_len, meta_features_dim,
                       num_classes=3, embedding_dim=128):
    """Build the (uncompiled) two-input text + meta-feature classifier.

    The text branch embeds token ids, passes them through two Conv1D layers
    and two bidirectional LSTM layers; the meta branch is a small dense stack.
    Both branches are concatenated and fed through two dense layers into a
    softmax output.

    Args:
        vocab_size: Number of token ids produced by the tokenizer; the
            embedding table gets ``vocab_size + 1`` rows.
        max_len: Length of the padded token sequences (text input width).
        meta_features_dim: Number of numeric meta features per sample.
        num_classes: Width of the softmax output layer. Defaults to 3, the
            value that was previously hard-coded.
        embedding_dim: Size of each embedding vector. Defaults to 128, the
            value that was previously hard-coded.

    Returns:
        A Keras ``Model`` with inputs named ``text_input`` and ``meta_input``.
        The caller is responsible for compiling it.
    """
    # Text input branch
    text_input = Input(shape=(max_len,), name='text_input')
    # `input_length` is deliberately not passed: the argument is deprecated in
    # Keras 3 and the sequence length is already fixed by the Input layer.
    embedding = Embedding(input_dim=vocab_size+1, output_dim=embedding_dim)(text_input)

    # CNN layers for text
    conv1 = Conv1D(128, 5, activation='relu', padding='same')(embedding)
    conv1 = Dropout(0.3)(conv1)
    conv2 = Conv1D(64, 3, activation='relu', padding='same')(conv1)
    conv2 = Dropout(0.3)(conv2)

    # LSTM layers: the first returns the full sequence so the second can
    # consume it; the second reduces it to a single vector per sample.
    lstm1 = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3))(conv2)
    lstm2 = Bidirectional(LSTM(32, dropout=0.3))(lstm1)

    # Meta features branch
    meta_input = Input(shape=(meta_features_dim,), name='meta_input')
    meta_dense1 = Dense(32, activation='relu')(meta_input)
    meta_dense1 = Dropout(0.2)(meta_dense1)
    meta_dense2 = Dense(16, activation='relu')(meta_dense1)

    # Concatenate branches
    concatenated = Concatenate()([lstm2, meta_dense2])

    # Dense layers
    dense1 = Dense(64, activation='relu')(concatenated)
    dense1 = Dropout(0.4)(dense1)
    dense2 = Dense(32, activation='relu')(dense1)
    dense2 = Dropout(0.3)(dense2)

    # Output layer: one probability per class
    output = Dense(num_classes, activation='softmax')(dense2)

    # Create model
    model = Model(inputs=[text_input, meta_input], outputs=output)

    return model


In [24]:
# Build and compile model
model = build_hybrid_model(
    # Take the vocabulary cap from the tokenizer so the embedding table can
    # never drift out of sync with the tokenization settings.
    vocab_size=tokenizer.num_words,
    max_len=max_len,
    meta_features_dim=X_train_meta.shape[1]
)

# Sparse categorical cross-entropy pairs with the softmax output and takes
# integer class ids as targets.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()


None


In [25]:
# Build and compile model
# NOTE(review): this cell repeats the previous build-and-compile cell. Running
# both replaces the first model with a freshly initialised one — consider
# deleting one of the two.
model = build_hybrid_model(
    # Take the vocabulary cap from the tokenizer so the embedding table can
    # never drift out of sync with the tokenization settings.
    vocab_size=tokenizer.num_words,
    max_len=max_len,
    meta_features_dim=X_train_meta.shape[1]
)

# Sparse categorical cross-entropy pairs with the softmax output and takes
# integer class ids as targets.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

None


In [26]:
# Training callbacks, built one by one and then collected in the order Keras
# should receive them.
# NOTE(review): early stopping and the LR schedule watch val_loss, while the
# checkpoint keeps the epoch with the best val_accuracy — the two "best"
# models can differ.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-6,
)
best_checkpoint = ModelCheckpoint(
    'best_sentiment_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
)

callbacks = [early_stopping, lr_on_plateau, best_checkpoint]


In [27]:
# Fit the hybrid network on the padded text and meta features. A tenth of the
# training data is set aside for validation via validation_split; the
# callbacks defined earlier monitor it.
fit_options = dict(
    validation_split=0.1,
    epochs=20,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(
    [X_train_pad, X_train_meta],
    y_train_dl,
    **fit_options,
)

Epoch 1/20
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - accuracy: 0.7922 - loss: 0.6109 



[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 298ms/step - accuracy: 0.8770 - loss: 0.4466 - val_accuracy: 0.8985 - val_loss: 0.3156 - learning_rate: 0.0010
Epoch 2/20
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step - accuracy: 0.9158 - loss: 0.2805 



[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 292ms/step - accuracy: 0.9141 - loss: 0.2697 - val_accuracy: 0.9086 - val_loss: 0.3257 - learning_rate: 0.0010
Epoch 3/20
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step - accuracy: 0.9423 - loss: 0.2074 



[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 320ms/step - accuracy: 0.9401 - loss: 0.2085 - val_accuracy: 0.9112 - val_loss: 0.3481 - learning_rate: 0.0010
Epoch 4/20
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319ms/step - accuracy: 0.9475 - loss: 0.1733 



[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 329ms/step - accuracy: 0.9520 - loss: 0.1650 - val_accuracy: 0.9213 - val_loss: 0.3976 - learning_rate: 0.0010
Epoch 5/20
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 332ms/step - accuracy: 0.9624 - loss: 0.1271 - val_accuracy: 0.9137 - val_loss: 0.3815 - learning_rate: 2.0000e-04
Epoch 6/20
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 314ms/step - accuracy: 0.9652 - loss: 0.1126 - val_accuracy: 0.9188 - val_loss: 0.4020 - learning_rate: 2.0000e-04


In [None]:
# Score the trained network on the held-out test inputs.
# Note: this rebinds `accuracy`, which the model-comparison plot later reads
# as the deep-learning score.
test_scores = model.evaluate(
    [X_test_pad, X_test_meta],
    y_test_dl,
    verbose=0,
)
loss, accuracy = test_scores

print(f"\nDeep Learning Model Results:")
print(f"  Accuracy: {accuracy:.4f}")
print(f"  Loss: {loss:.4f}")

In [None]:
# Write the trained network to an HDF5 file and pickle the tokenizer next to
# it, so the same text-to-sequence mapping can be restored later.
model.save('sentiment_deep_model.h5')

tokenizer_bytes = pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL)
with open('tokenizer.pickle', 'wb') as handle:
    handle.write(tokenizer_bytes)

print("\nDeep learning model saved successfully!")

In [None]:
# Visualization of results
# The whole 2x3 figure is built in ONE cell on ONE explicit Figure object.
# With the default inline backend a figure is rendered and closed at the end
# of the cell that created it, so spreading plt.subplot(...) calls over
# several cells draws each panel on a new, default-sized figure instead of
# filling this grid.
fig = plt.figure(figsize=(15, 10))

# Panel 1: training vs. validation accuracy per epoch
ax_acc = fig.add_subplot(2, 3, 1)
ax_acc.plot(history.history['accuracy'], label='Training Accuracy')
ax_acc.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax_acc.set_title('Model Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Panel 2: training vs. validation loss per epoch
ax_loss = fig.add_subplot(2, 3, 2)
ax_loss.plot(history.history['loss'], label='Training Loss')
ax_loss.plot(history.history['val_loss'], label='Validation Loss')
ax_loss.set_title('Model Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

# Panel 3: test accuracy of every model. `accuracy` is the value bound by the
# deep-learning evaluation cell.
model_names = list(results.keys()) + ['Ensemble', 'Deep Learning']
accuracies = [results[m]['accuracy'] for m in results.keys()] + [accuracy_ensemble, accuracy]

ax_cmp = fig.add_subplot(2, 3, 3)
bars = ax_cmp.bar(model_names, accuracies)
ax_cmp.set_title('Model Comparison (Accuracy)')
ax_cmp.set_xlabel('Model')
ax_cmp.set_ylabel('Accuracy')
plt.setp(ax_cmp.get_xticklabels(), rotation=45, ha='right')
for bar, acc in zip(bars, accuracies):
    # Value label just above each bar
    ax_cmp.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{acc:.3f}', ha='center', va='bottom', fontsize=9)

# Panel 4: confusion matrix for ensemble
ax_cm = fig.add_subplot(2, 3, 4)
cm = confusion_matrix(y_test, y_pred_ensemble)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['negative', 'neutral', 'positive'],
            yticklabels=['negative', 'neutral', 'positive'],
            ax=ax_cm)
ax_cm.set_title('Ensemble Model Confusion Matrix')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')

# Panel 5: confusion matrix for deep learning (argmax over class probabilities)
ax_cm_dl = fig.add_subplot(2, 3, 5)
y_pred_dl = np.argmax(model.predict([X_test_pad, X_test_meta]), axis=1)
cm_dl = confusion_matrix(y_test_dl, y_pred_dl)
sns.heatmap(cm_dl, annot=True, fmt='d', cmap='Reds',
            xticklabels=['negative', 'neutral', 'positive'],
            yticklabels=['negative', 'neutral', 'positive'],
            ax=ax_cm_dl)
ax_cm_dl.set_title('Deep Learning Model Confusion Matrix')
ax_cm_dl.set_ylabel('True Label')
ax_cm_dl.set_xlabel('Predicted Label')

fig.tight_layout()
plt.show()

# Section banner for the feature-importance part: a blank line, a
# 50-character rule, the title, and the rule again.
rule = "="*50
print("\n" + rule)
print("FEATURE IMPORTANCE ANALYSIS")
print(rule)

In [None]:
# Look up the Random Forest held in `models` (the instance fitted in the
# per-model evaluation loop) and read its per-feature importance scores.
rf_model = models.__getitem__('Random Forest')
importances = rf_model.feature_importances_

In [None]:
# Column labels for the combined matrix: TF-IDF terms first, then the
# numeric meta columns — the same order used when the blocks were stacked.
feature_names = [*tfidf.get_feature_names_out(), *X_meta.columns]

In [None]:
# Column labels for the combined matrix: TF-IDF terms first, then the
# numeric meta columns.
# NOTE(review): this cell repeats the previous one and can be removed.
feature_names = [*tfidf.get_feature_names_out(), *X_meta.columns]

In [None]:
# Positions of the 20 largest importance scores, highest first.
descending_order = np.argsort(importances)[::-1]
indices = descending_order[:20]

print("\nTop 20 Most Important Features:")
print("-" * 50)
# One line per feature: left-aligned name padded to 30 characters, then the score.
for i in indices:
    print(f"{feature_names[i]:<30} : {importances[i]:.4f}")

In [None]:
# Bundle every object needed to reproduce predictions into one dictionary
# and write it to a single joblib file.
rule = "="*50
print("\n" + rule)
print("SAVING ALL MODEL COMPONENTS")
print(rule)

# NOTE(review): the bundle holds the live Keras model object; whether it
# pickles cleanly depends on the installed Keras version. The network is also
# saved separately to 'sentiment_deep_model.h5' — verify this file reloads.
model_components = dict(
    ensemble_model=ensemble,
    deep_learning_model=model,
    tfidf_vectorizer=tfidf,
    feature_scaler=scaler,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    max_sequence_length=max_len,
    meta_features=list(X_meta.columns),
)

joblib.dump(model_components, 'model_components.joblib')

print("\nAll model components saved to 'model_components.joblib'")