In [1]:
import kagglehub
import pandas as pd
import os


# The workbook is already mounted under Kaggle's read-only input directory, so
# it is read straight from disk. `kagglehub.dataset_download` expects an
# "owner/dataset" handle rather than a filesystem path, and an .xlsx workbook
# must be parsed with `read_excel` (needs openpyxl), not `read_csv`.
# NOTE(review): assumes the file really is an Excel workbook, as its extension says.
path = "/kaggle/input/dataset"
df = pd.read_excel(os.path.join(path, "movie_comments_with_sentiment_and_region.xlsx"))


In [2]:
import re

def clean_text(text):
    """Normalize one raw review string.

    Steps, in order: lowercase, replace HTML-like tags with a space, then drop
    every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The raw review. ``None`` and float NaN (pandas' missing-value
            marker) are treated as an empty review; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased string.
    """
    # Missing cells would otherwise crash on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Substitute a space (not "") so words separated only by a tag such as
    # "<br />" are not glued together ("good<br />movie" -> "good movie").
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text

# Clean every review in place (clean_text is idempotent, so re-running is safe).
df['review'] = df['review'].apply(clean_text)
# Encode the sentiment labels as integers. The identity entries (1 -> 1, 0 -> 0)
# make this cell safe to re-run: without them a second execution would map the
# already-encoded labels to NaN. Any other value still becomes NaN, as before.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0, 1: 1, 0: 0})


In [None]:
!pip install transformers -q

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm


# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# `.eval()` switches the module to inference mode and returns the module
# itself, so the call can be chained onto the load.
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT).eval()


def get_bert_embeddings(texts, batch_size=32, max_length=128):
    """Embed texts as the mean of BERT's last-layer token vectors.

    Texts are encoded in mini-batches instead of one forward pass per text.
    Because batching introduces padding, the mean is taken only over real
    tokens (attention mask == 1), which matches averaging each text on its own.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts per forward pass.
        max_length: Token limit per text; longer texts are truncated.

    Returns:
        A 2-D numpy array with one row per text and one column per hidden
        unit. Empty input yields an array of shape ``(0, hidden_size)``.
    """
    texts = list(texts)
    if not texts:
        # Keep the result 2-D so downstream code can rely on the shape.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                           padding=True, max_length=max_length)
        with torch.no_grad():
            outputs = bert_model(**inputs)

        hidden = outputs.last_hidden_state
        # Zero out padding positions before averaging over the token axis.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((token_sum / token_count).cpu().numpy())

    return np.concatenate(pooled_batches, axis=0)


In [None]:


from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np


In [None]:

# Work on a reproducible random subset of at most 5000 reviews. The cap avoids
# the ValueError `DataFrame.sample` raises when asked for more rows than exist.
sample_size = min(5000, len(df))
sample_df = df.sample(sample_size, random_state=42)
texts = sample_df['review'].tolist()
labels = sample_df['sentiment'].values

# One embedding row per sampled review.
features = get_bert_embeddings(texts)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost.callback import TrainingCallback
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd

# Hold out 20% of the embedded reviews for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build both baseline estimators first, then fit them on the training split.
lr_model = LogisticRegression(solver='liblinear')
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
)
for baseline in (lr_model, rf_model):
    baseline.fit(X_train, y_train)

# Log loss on the evaluation set after each boosting round; filled by
# LogLossRecorder during `xgb_model.fit` and reset whenever this cell runs.
log_losses = []

class LogLossRecorder(TrainingCallback):
    """XGBoost callback that appends the latest eval-set log loss to `log_losses`."""

    def after_iteration(self, model, epoch, evals_log):
        """Record the newest 'logloss' of 'validation_0', if present.

        Returns False so that training is never stopped early by this callback.
        """
        if 'validation_0' in evals_log and 'logloss' in evals_log['validation_0']:
            log_losses.append(evals_log['validation_0']['logloss'][-1])
        return False

# Callbacks are passed to the constructor: handing `callbacks=` to `fit()` has
# been deprecated since XGBoost 1.6 and is rejected by newer releases. The
# likewise deprecated `use_label_encoder` flag is dropped for the same reason.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    callbacks=[LogLossRecorder()],
)
# The test split is used here only to monitor log loss per round.
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Line chart of the recorded log loss, one point per boosting round (1-based).
fig, ax = plt.subplots(figsize=(8, 5))
rounds = range(1, len(log_losses) + 1)
ax.plot(rounds, log_losses, marker='o')
ax.set_title("XGBoost Log Loss vs Epochs")
ax.set_xlabel("Epoch (Round)")
ax.set_ylabel("Log Loss")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

def get_metrics(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        Dict with the keys "Accuracy", "Precision", "Recall" and "F1 Score",
        in that order, each mapped to the corresponding score.
    """
    predictions = model.predict(X_test)
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    return {name: scorer(y_test, predictions) for name, scorer in scorers}


# Evaluate every fitted model on the same held-out split.
fitted_models = {
    "Logistic Regression": lr_model,
    "Random Forest": rf_model,
    "XGBoost": xgb_model,
}
results = {
    model_name: get_metrics(estimator, X_test, y_test)
    for model_name, estimator in fitted_models.items()
}

# One row per model, one column per metric; shaded and shown to 4 decimals.
results_df = pd.DataFrame(results).transpose()
results_df.style.background_gradient(cmap='Blues').format("{:.4f}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Metric names go in `metric_names`, not `labels`: `labels` already holds the
# training targets, and overwriting it would break a re-run of the split cell.
metric_names = list(results["Logistic Regression"].keys())  # ['Accuracy', 'Precision', ...]
models = list(results.keys())  # ['Logistic Regression', 'Random Forest', 'XGBoost']

# One row per model, one column per metric.
data = np.array([list(results[model].values()) for model in models])

# One spoke per metric, evenly spaced around the circle.
angles = np.linspace(0, 2 * np.pi, len(metric_names), endpoint=False).tolist()
data = np.concatenate((data, data[:, [0]]), axis=1)  # repeat the first value to close each polygon
angles += angles[:1]  # repeat the first angle to match

plt.figure(figsize=(8, 6))
for model, scores in zip(models, data):
    plt.polar(angles, scores, label=model, linewidth=2)

# Label each spoke; the duplicated closing angle is skipped.
plt.xticks(angles[:-1], metric_names)
plt.title('Model Performance Radar Chart', size=16)
plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1.1))
plt.tight_layout()
plt.show()


# Confusion matrix of the XGBoost predictions on the held-out split.
y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap: rows are the actual classes, columns the predicted ones.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - XGBoost')
plt.show()
