In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, median_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
#from text_cleaner import TextCleaner
import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('./data/mes_all.csv')
df['description'] = df['description'].fillna('')

In [11]:
len(df)

4671

In [12]:
df = df[df['storypoint'] != -1]
df.dropna(inplace=True)
df['text'] = df['title']# + ' ' + df['description']
df['label'] = df['storypoint']

#cleaner = TextCleaner()
#df['text'] = df['text'].apply(cleaner.text_normalizer)

In [13]:
# 将文本数据进行 TF-IDF 向量化
vectorizer = TfidfVectorizer(stop_words=None, lowercase=False, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['text'])
y = df['label']

In [14]:
# 方差
df['label'].var()

4.7259131664166

In [15]:
df['label'].describe()

count    1197.000000
mean        2.655806
std         2.173917
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        13.000000
Name: label, dtype: float64

In [16]:
# 数据划分：60% 训练集，20% 验证集，20% 测试集
train_val_split_point = int(len(df) * 0.6)
val_test_split_point = int(len(df) * 0.8)
X_train = X[:train_val_split_point]
y_train = y[:train_val_split_point]
X_val = X[train_val_split_point:val_test_split_point]
y_val = y[train_val_split_point:val_test_split_point]
X_test = X[val_test_split_point:]
y_test = y[val_test_split_point:]

In [17]:
# 计算 mean 作为基准的 MAE
mean_mae = mean_absolute_error(y_test, [y_train.mean()] * len(y_test))
print(f'Mean MAE: {mean_mae}')

median_mae = median_absolute_error(y_test, [y_test.median()] * len(y_test))
print(f'MdAE : {median_mae}')

Mean MAE: 1.498769730733519
MdAE : 1.0


In [18]:
transform = SelectKBest(score_func=f_classif, k=50)
svc = SVC()
clf = Pipeline([('feat_select', transform), ('classifier', svc)])
clf.fit(X_train, y_train)

# 使用验证集进行调参，防止过拟合
y_val_pred = clf.predict(X_val)
val_mae = mean_absolute_error(y_val, y_val_pred)
print(f'Validation MAE: {val_mae}')

## 根据验证集的性能调整模型
best_val_mae = val_mae
best_C = svc.get_params()['C']
Cs = [0.1, 1, 10]
for C in Cs:
    clf.set_params(classifier__C=C)
    clf.fit(X_train, y_train)
    y_val_pred_temp = clf.predict(X_val)
    val_mae_temp = mean_absolute_error(y_val, y_val_pred_temp)
    if val_mae_temp < best_val_mae:
        best_val_mae = val_mae_temp
        best_C = C


# 使用最优参数重新训练模型
clf.set_params(classifier__C=best_C)
clf.fit(X_train, y_train)

# 预测测试集
y_test_pred = clf.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)
print(f'Test MAE: {test_mae}')

test_mdae = median_absolute_error(y_test, y_test_pred)
print(f'Test MdAE: {test_mdae}')

Validation MAE: 1.6945606694560669
Test MAE: 1.55
Test MdAE: 1.0
