In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import numpy as np
import joblib
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# from 新闻文本分类.models.lightgbm_model import MyLightGBM
%run ../models/lightgbm_model.py

In [2]:
# Read the training and test sets; both files are tab-separated despite the
# .csv extension.
train_df, test_df = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ('../datasets/train_set.csv', '../datasets/test_a.csv')
)

In [3]:
# Slow to run, so the pipeline is split into several steps
# ************1 step************

# tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
# tfidf.fit(np.concatenate((train_df['text'].values, test_df['text'].values), axis=0))

# Save the TfidfVectorizer output for the training set
# train_word_features = tfidf.transform(train_df['text'].values)
# joblib.dump(train_word_features, '../intermediate_save_data/train_word_features.pkl')

# Save the TfidfVectorizer output for the test set
# test_word_features = tfidf.transform(test_df['text'].values)
# joblib.dump(test_word_features, '../intermediate_save_data/test_word_features.pkl')

In [4]:
# ************2 step************

# Load the cached TF-IDF features for the training and test sets.
# If either cache file is missing (e.g. on a fresh checkout), rebuild both
# matrices with the step-1 recipe and write them to the cache, so the notebook
# still works after Restart Kernel -> Run All.
# NOTE(review): joblib.load unpickles arbitrary objects; only load cache files
# that were produced by this notebook.
try:
    train_word_features_load = joblib.load('../intermediate_save_data/train_word_features.pkl')
    test_word_features_load = joblib.load('../intermediate_save_data/test_word_features.pkl')
except FileNotFoundError:
    # Slow path: fit the vectorizer on the combined train + test text.
    tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
    tfidf.fit(np.concatenate((train_df['text'].values, test_df['text'].values), axis=0))

    # Transform and cache the training set.
    train_word_features_load = tfidf.transform(train_df['text'].values)
    joblib.dump(train_word_features_load, '../intermediate_save_data/train_word_features.pkl')

    # Transform and cache the test set.
    test_word_features_load = tfidf.transform(test_df['text'].values)
    joblib.dump(test_word_features_load, '../intermediate_save_data/test_word_features.pkl')

In [5]:
# ************3 step************

# Seeded, shuffled 5-fold stratified splitter; passed to MyLightGBM below.
kf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=666,
)
# Training targets taken from the 'label' column.
y_train_data = train_df['label'].values


def my_gbm_loss(y_pred, y_true):
    """Custom LightGBM evaluation function: micro-averaged F1 for a multiclass task.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Per-class scores supplied by LightGBM. Two layouts are accepted:
        a flat 1-D array grouped class by class (all rows of class 0, then all
        rows of class 1, ...), or a 2-D array of shape (n_samples, n_classes),
        which is what LightGBM 4.0 and later pass for multiclass tasks.
    y_true : object exposing ``get_label()``
        The evaluation dataset; despite the parameter name this is not a label
        array. Its ``get_label()`` result provides the ground-truth labels.

    Returns
    -------
    tuple
        ``('F1 score', value, True)``: metric name, metric value, and a flag
        telling LightGBM that higher values are better.
    """
    labels = y_true.get_label()
    scores = np.asarray(y_pred)
    if scores.ndim == 1:
        # Flat class-major layout: one row per class once reshaped, so the
        # transpose yields (n_samples, n_classes). The class count is inferred
        # from the number of labels instead of being hard-coded.
        scores = scores.reshape(-1, len(labels)).T
    # Predicted class = index of the highest score in each row.
    predicted_labels = np.argmax(scores, axis=1)
    f1 = f1_score(labels, predicted_labels, average='micro')
    return 'F1 score', f1, True


# Callbacks: log evaluation results every 50 rounds, and stop early once the
# validation scores have not improved for 200 rounds.
le_func = lgb.log_evaluation(period=50)
es_func = lgb.early_stopping(stopping_rounds=200)

# LightGBM parameters for the 14-class problem.
lgb_params = dict(
    objective="multiclass",
    num_class=14,
    max_depth=-1,
    num_boost_round=2000,
    bagging_fraction=0.8,
    feature_fraction=0.7,
    verbosity=-1,
    reg_alpha=6,
    reg_lambda=6,
    n_jobs=8,
    metric=("multi_logloss",),
)

# Cross-validated training; both cached feature matrices are densified with
# .toarray() before being handed to MyLightGBM.
test_predictions, model_list = MyLightGBM(
    X_train_data=train_word_features_load.toarray(),
    y_train_data=y_train_data,
    X_test_data=test_word_features_load.toarray(),
    kfold=kf,
    params=lgb_params,
    feval=my_gbm_loss,
    callbacks=[es_func, le_func],
)

Training fold 1
Training until validation scores don't improve for 200 rounds
[50]	training's multi_logloss: 0.151832	training's F1 score: 0.9558	valid_1's multi_logloss: 0.205957	valid_1's F1 score: 0.93805
[100]	training's multi_logloss: 0.0871202	training's F1 score: 0.977544	valid_1's multi_logloss: 0.175548	valid_1's F1 score: 0.9444
[150]	training's multi_logloss: 0.0576951	training's F1 score: 0.9891	valid_1's multi_logloss: 0.168269	valid_1's F1 score: 0.9461
[200]	training's multi_logloss: 0.0418199	training's F1 score: 0.994556	valid_1's multi_logloss: 0.165546	valid_1's F1 score: 0.946675
[250]	training's multi_logloss: 0.0328629	training's F1 score: 0.997094	valid_1's multi_logloss: 0.164093	valid_1's F1 score: 0.947375
[300]	training's multi_logloss: 0.0274135	training's F1 score: 0.998319	valid_1's multi_logloss: 0.16362	valid_1's F1 score: 0.947125
[350]	training's multi_logloss: 0.0237826	training's F1 score: 0.998875	valid_1's multi_logloss: 0.163641	valid_1's F1 score

In [6]:
# ************4 step************

# Persist the test-set predictions so later cells can reload them
joblib.dump(
    value=test_predictions,
    filename='../intermediate_save_data/test_predictions_tfidf_lgb.pkl',
)

['../intermediate_save_data/test_predictions_tfidf_lgb.pkl']

In [7]:
# Reload the saved predictions and take the arg-max along axis 1 as the
# predicted class for each test row.
test_predictions_load = joblib.load(
    '../intermediate_save_data/test_predictions_tfidf_lgb.pkl'
)
test_predictions_label = np.argmax(test_predictions_load, axis=1)

# Single-column frame named 'label'; the bare last expression displays it.
test_predictions_label_df = pd.DataFrame({'label': test_predictions_label})
test_predictions_label_df

Unnamed: 0,label
0,1
1,2
2,8
3,5
4,0
...,...
49995,0
49996,13
49997,1
49998,3


In [8]:
# score:
# Write the predicted labels to CSV, omitting the index column.
test_predictions_label_df.to_csv(
    path_or_buf='../intermediate_save_data/test_predictions_tfidf_lgb.csv',
    index=False,
)