In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from text_cleaner import TextCleaner
import warnings
# Suppress every Python warning for the rest of the session. This keeps cell
# output quiet, but it also hides deprecation warnings raised by the libraries
# imported above.
warnings.filterwarnings('ignore')

In [None]:
# Read the issue dataset, discard every row that contains a missing value and
# derive the two modelling columns in a single chained expression:
#   text  - title and description joined by one space
#   label - copy of the story-point column
df = (
    pd.read_csv('./data/mule.csv')
    .dropna()
    .assign(
        text=lambda frame: frame['title'] + ' ' + frame['description'],
        label=lambda frame: frame['storypoint'],
    )
)

# Optional text normalisation through TextCleaner (currently disabled):
#cleaner = TextCleaner()
#df['text'] = df['text'].apply(cleaner.text_normalizer)

In [None]:
# Target vector: the story-point labels.
y = df['label']

# Turn the raw text into TF-IDF features built from unigrams and bigrams.
# Case is preserved and no stop words are removed.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=False,
    stop_words=None,
)
X = vectorizer.fit_transform(df['text'])

In [None]:
# Sample variance of the labels (ddof=1 is the pandas default, spelled out here).
df['label'].var(ddof=1)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the labels.
df.loc[:, 'label'].describe()

In [None]:
# Ordered hold-out split (no shuffling): the first 80% of the rows are used
# for training and the last 20% for testing.
# The 60% mark is where a validation slice (rows 60%-80%) would begin.
# Validation-based tuning of the SVC `C` parameter is currently disabled, so
# that slice is folded into the training set. To re-enable it: train on rows
# [:train_val_split_point], compare candidate C values by MAE on rows
# [train_val_split_point:val_test_split_point], then refit with the best C.
train_val_split_point = int(len(df) * 0.6)
val_test_split_point = int(len(df) * 0.8)

# Slice by position (.iloc): after dropna() the index labels may have gaps, so
# positions are the unambiguous way to cut the data.
text_train = df['text'].iloc[:val_test_split_point]
text_test = df['text'].iloc[val_test_split_point:]
y_train = y.iloc[:val_test_split_point]
y_test = y.iloc[val_test_split_point:]

# Fit TF-IDF on the training rows only. Slicing a matrix that was fitted on
# the full corpus would let the vocabulary and IDF weights see the test
# documents (data leakage), which can bias the reported test error.
# The hyper-parameters are copied from the existing `vectorizer` so that both
# vectorisers stay configured identically.
split_vectorizer = TfidfVectorizer(**vectorizer.get_params())
X_train = split_vectorizer.fit_transform(text_train)
X_test = split_vectorizer.transform(text_test)

# Keep the 50 features with the highest ANOVA F-score, then classify with an
# SVC using its default hyper-parameters. Each distinct story-point value is
# treated as a class label.
transform = SelectKBest(score_func=f_classif, k=50)
svc = SVC()
clf = Pipeline([('feat_select', transform), ('classifier', svc)])
clf.fit(X_train, y_train)

# Evaluate on the held-out test rows.
y_test_pred = clf.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)
print(f'Test MAE: {test_mae}')

# Baseline: always predict the mean label of the training set.
mean_mae = mean_absolute_error(y_test, np.full(len(y_test), y_train.mean()))
print(f'Mean MAE: {mean_mae}')