In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# 加载数据集
df = pd.read_csv('ratings_comments_pairs.csv')
df = df.sample(frac=0.5)

# 数据清洗
df.dropna(inplace=True)

# 文本预处理函数
def preprocess_text(text):
    text = text.lower()  # 转换为小写
    text = re.sub(r'\W', ' ', text)  # 移除标点符号
    text = re.sub(r'\s+', ' ', text)  # 移除多余空格
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]  # 移除停用词
    text = ' '.join(words)
    return text

# 应用文本预处理
df['comment'] = df['comment'].apply(preprocess_text)

# 特征提取
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['comment']).toarray()
y = df['rating']

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 模型训练
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 模型评估
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qujianning/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.8412586425850148
Classification Report:
               precision    recall  f1-score   support

          -1       0.74      0.32      0.45       594
           0       0.54      0.10      0.16       742
           1       0.85      0.99      0.92      5751

    accuracy                           0.84      7087
   macro avg       0.71      0.47      0.51      7087
weighted avg       0.81      0.84      0.80      7087



### SVM

In [5]:
from sklearn.svm import SVC

# 使用SVM训练模型
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# 使用测试集评估SVM模型
svm_pred = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

SVM Accuracy: 0.8737124312120784
SVM Classification Report:
               precision    recall  f1-score   support

          -1       0.73      0.48      0.58       594
           0       0.60      0.35      0.44       742
           1       0.90      0.98      0.94      5751

    accuracy                           0.87      7087
   macro avg       0.74      0.61      0.66      7087
weighted avg       0.86      0.87      0.86      7087



### Regression

In [6]:
from sklearn.linear_model import LogisticRegression

# 使用逻辑回归训练模型
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)

# 使用测试集评估逻辑回归模型
logreg_pred = logreg_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, logreg_pred))

Logistic Regression Accuracy: 0.870467052349372
Logistic Regression Classification Report:
               precision    recall  f1-score   support

          -1       0.78      0.41      0.54       594
           0       0.62      0.35      0.45       742
           1       0.89      0.99      0.94      5751

    accuracy                           0.87      7087
   macro avg       0.76      0.58      0.64      7087
weighted avg       0.85      0.87      0.85      7087



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Xgboost

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# 请替换成您的文件路径
file_path = 'ratings_comments_pairs.csv'
df = pd.read_csv(file_path)
# df = df.sample(frac=0.5)
label_mapping = {-1: 0, 0: 1, 1: 2}
reverse_label_mapping = {0: -1, 1: 0, 2: 1}

# 数据清洗
df.dropna(inplace=True)

# 文本预处理函数
def preprocess_text(text):
    text = text.lower()  # 转换为小写
    text = re.sub(r'\W', ' ', text)  # 移除标点符号
    text = re.sub(r'\s+', ' ', text)  # 移除多余空格
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]  # 移除停用词
    text = ' '.join(words)
    return text

# 应用文本预处理
df['comment'] = df['comment'].apply(preprocess_text)

tfidf_converter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7)
X = tfidf_converter.fit_transform(df['comment']).toarray()
y_mapped = df['rating'].map(label_mapping)

X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, random_state=42)

model = xgb.XGBClassifier()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
predictions_original_labels = [reverse_label_mapping[label] for label in predictions]
# 评估模型
print("Accuracy:", accuracy_score(y_test.map(reverse_label_mapping), predictions_original_labels))
print("Classification Report:\n", classification_report(y_test.map(reverse_label_mapping), predictions_original_labels))


Accuracy: 0.8661633977705658
Classification Report:
               precision    recall  f1-score   support

          -1       0.75      0.44      0.56      1111
           0       0.60      0.26      0.36      1492
           1       0.89      0.98      0.93     11571

    accuracy                           0.87     14174
   macro avg       0.74      0.56      0.62     14174
weighted avg       0.84      0.87      0.84     14174

