In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from tqdm import tqdm

In [2]:
train = pd.read_csv(r'E:/TianChi/新闻文本分类/train_set.csv', sep='\t')
test = pd.read_csv(r'E:/TianChi/新闻文本分类/test_a.csv', sep='\t')

In [3]:
print(train.columns)
print(test.columns)

Index(['label', 'text'], dtype='object')
Index(['text'], dtype='object')


In [4]:
print(set(train['label']))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}


In [5]:
message_dict = {'科技': 0, '股票': 1, '体育': 2, '娱乐': 3, '时政': 4, '社会': 5, '教育': 6, '财经': 7, '家居': 8, '游戏': 9, '房产': 10, '时尚': 11, '彩票': 12, '星座': 13}

In [6]:
print(train.describe())
print("-"*100)
print(test.describe())

               label
count  200000.000000
mean        3.210950
std         3.084955
min         0.000000
25%         1.000000
50%         2.000000
75%         5.000000
max        13.000000
----------------------------------------------------------------------------------------------------
                                                     text
count                                               50000
unique                                              49995
top     2538 2506 1363 5466 3772 340 922 433 2397 5778...
freq                                                    4


In [7]:
train_text = train['text']
test_text = test['text']
all_text = pd.concat([train_text, test_text])

# TFIDF算法
### TF-IDF（Term Frequency-InversDocument Frequency）是一种常用于信息处理和数据挖掘的加权技术。该技术采用一种统计方法，根据字词的在文本中出现的次数和在整个语料中出现的文档频率来计算一个字词在整个语料中的重要程度。它的优点是能过滤掉一些常见的却无关紧要的词语，同时保留影响整个文本的重要字词。
* TF（Term Frequency）表示某个关键词在整篇文章中出现的频率。
* IDF（InversDocument Frequency）表示计算倒文本频率。文本频率是指某个关键词在整个语料所有文章中出现的次数。倒文档频率又称为逆文档频率，它是文档频率的倒数，主要用于降低所有文档中一些常见却对文档影响不大的词语的作用。

https://blog.csdn.net/u010417185/article/details/87905899

In [8]:
%%time
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)

all_word = word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

train_word_features

Wall time: 7min 12s


<200000x6977 sparse matrix of type '<class 'numpy.float64'>'
	with 56074040 stored elements in Compressed Sparse Row format>

In [9]:
print(all_word)
print("-"*100)
print(train_word_features.shape)
print(train_word_features)
print("-"*100)
print(test_word_features.shape)
print(test_word_features)

TfidfVectorizer(max_features=10000, stop_words='english',
                strip_accents='unicode', sublinear_tf=True,
                token_pattern='\\w{1,}')
----------------------------------------------------------------------------------------------------
(200000, 6977)
  (0, 6957)	0.06123077163392776
  (0, 6942)	0.028944854620259104
  (0, 6888)	0.031744071761905156
  (0, 6885)	0.020676338854265244
  (0, 6883)	0.05375987933625619
  (0, 6875)	0.05381962538912795
  (0, 6860)	0.02631647954528877
  (0, 6823)	0.052614288744070634
  (0, 6800)	0.0331354885749065
  (0, 6799)	0.04829120486123563
  (0, 6778)	0.06323910946402991
  (0, 6770)	0.04750366673846437
  (0, 6733)	0.027377099389752496
  (0, 6716)	0.042478700068770456
  (0, 6709)	0.07103975901129272
  (0, 6683)	0.05942895089274445
  (0, 6669)	0.026424670052076934
  (0, 6642)	0.025961663258648168
  (0, 6627)	0.05447419515324055
  (0, 6619)	0.02668769442656465
  (0, 6615)	0.08485414478405588
  (0, 6600)	0.0666090009621893
  (0, 6595)	0.0

In [10]:
X_train = train_word_features
y_train = train['label']

# 可以改变输入维度
x_train_, x_valid_, y_train_, y_valid_ = train_test_split(X_train, y_train, test_size=0.2)
X_test = test_word_features

In [11]:
print(x_train_)
print("-"*100)
print(x_valid_)
print("-"*100)
print(y_train_)
print("-"*100)
print(y_valid_)

  (0, 6975)	0.16319325252062827
  (0, 6933)	0.05793608590457798
  (0, 6885)	0.048890820586192284
  (0, 6875)	0.054601169028187034
  (0, 6860)	0.036752489773671605
  (0, 6848)	0.04083858817402098
  (0, 6778)	0.06464550352185065
  (0, 6733)	0.08023772712415099
  (0, 6716)	0.024860289219139388
  (0, 6684)	0.04393067635592298
  (0, 6669)	0.021795851249120552
  (0, 6659)	0.04161845269305206
  (0, 6619)	0.0372709128820683
  (0, 6570)	0.053093163674429636
  (0, 6549)	0.04656476491600003
  (0, 6537)	0.050079531400582744
  (0, 6474)	0.03839892665391913
  (0, 6460)	0.0610409832684235
  (0, 6375)	0.03785881743865962
  (0, 6356)	0.029577203342845436
  (0, 6335)	0.2863652752687756
  (0, 6284)	0.054906754080296154
  (0, 6221)	0.03267921928494529
  (0, 6207)	0.030451585881911696
  (0, 6201)	0.03552118521710065
  :	:
  (159999, 398)	0.03216214019583148
  (159999, 378)	0.043688186781067125
  (159999, 366)	0.0487429272926725
  (159999, 356)	0.042581860973073316
  (159999, 348)	0.032978386809525584
  (15

# 逻辑回归
### f1_score
https://blog.csdn.net/fengdu78/article/details/107739416


In [13]:
%%time
clf = LogisticRegression(C=4, n_jobs=16)
clf.fit(x_train_, y_train_)

y_pred = clf.predict(x_valid_)
train_scores = clf.score(x_train_, y_train_)
print(train_scores, f1_score(y_pred, y_valid_, average='macro'))

UnicodeEncodeError: 'ascii' codec can't encode characters in position 18-20: ordinal not in range(128)

# XGB

In [14]:
x_train_, x_valid_, y_train_, y_valid_ = train_test_split(X_train[:, :300], y_train, test_size=0.2, shuffle=True,random_state=42)
X_test = test_word_features[:,:300]

In [15]:
print(X_train[:, :300].shape)
print(X_train[:, :300])
print("-"*100)
print(x_train_)
print("-"*100)
print(x_valid_)
print("-"*100)
print(y_train_)
print("-"*100)
print(y_valid_)

(200000, 300)
  (0, 293)	0.06788293187676152
  (0, 280)	0.04408376561395052
  (0, 273)	0.047504015874521174
  (0, 243)	0.02873864199901826
  (0, 221)	0.09365562055916186
  (0, 217)	0.03486759271969195
  (0, 214)	0.05243303349303011
  (0, 143)	0.041116017387529444
  (0, 100)	0.08939933038358235
  (0, 33)	0.02601467000614618
  (1, 289)	0.14211398894422123
  (1, 278)	0.024839717563377835
  (1, 159)	0.0637338662894813
  (1, 155)	0.04878250423618895
  (1, 61)	0.12146474235410022
  (1, 58)	0.14213998097105388
  (2, 285)	0.02772347165843572
  (2, 280)	0.024038045219873218
  (2, 278)	0.019545799205470105
  (2, 273)	0.025903043122885
  (2, 268)	0.036387523626886915
  (2, 263)	0.0707844024387805
  (2, 255)	0.07159214985429842
  (2, 249)	0.0389912158088862
  (2, 247)	0.039072113392983714
  :	:
  (199997, 143)	0.06801290716004449
  (199997, 135)	0.027960170438285813
  (199997, 70)	0.01866405199499365
  (199997, 65)	0.025169598371429677
  (199997, 58)	0.04639375303759279
  (199997, 46)	0.0454590525

In [16]:
%%time
param = {'learning_rate': 0.05,         #  (xgb’s “eta”)
              'objective': 'multi:softmax', 
              'n_jobs': 16,
              'n_estimators': 300,           # 树的个数
              'max_depth': 10,               
              'gamma': 0.5,                  # 惩罚项中叶子结点个数前的参数，Increasing this value will make model more conservative.
              'reg_alpha': 0,               # L1 regularization term on weights.Increasing this value will make model more conservative.
              'reg_lambda': 2,              # L2 regularization term on weights.Increasing this value will make model more conservative.
              'min_child_weight' : 1,      # 叶子节点最小权重
              'subsample':0.8,             # 随机选择80%样本建立决策树
              'random_state':1           # 随机数
             }

model = XGBClassifier(param)
model.fit(x_train_, y_train_, eval_set=[(x_train_, y_train_)],
                       eval_metric=['mlogloss'],
                       early_stopping_rounds=10,  # 连续N次分值不再优化则提前停止
                       verbose=False
                      )
train_result, train_proba = model.predict(x_train_), model.predict_proba(x_train_)

Wall time: 7min 10s


In [17]:
print("train_result.shape : ")
print(train_result.shape)
print("train_result : ")
print(train_result)
print("-"*100)
print("y_train_.shape : ")
print(y_train_.shape)
print("-"*100)
print("train_proba.shape : ")
print(train_proba.shape)
print("train_proba : ")
print(train_proba)

train_result.shape : 
(160000,)
train_result : 
[ 3  2  1 ...  1 12  0]
----------------------------------------------------------------------------------------------------
y_train_.shape : 
(160000,)
----------------------------------------------------------------------------------------------------
train_proba.shape : 
(160000, 14)
train_proba : 
[[1.66867487e-02 4.18164488e-03 9.51950811e-03 ... 1.46730687e-03
  6.47020701e-04 1.20726263e-03]
 [3.19576100e-03 1.49288017e-03 9.78947937e-01 ... 2.00482100e-05
  7.25847110e-03 7.81182553e-06]
 [1.29233047e-01 3.02949131e-01 7.22355098e-02 ... 9.67446773e-04
  3.55880358e-04 7.05102633e-04]
 ...
 [2.44105067e-02 8.62139761e-01 7.76413223e-03 ... 6.08526578e-04
  5.30647638e-04 6.11736978e-05]
 [3.26759554e-02 1.12922043e-01 3.82110253e-02 ... 5.19639160e-03
  6.35578394e-01 2.09927885e-03]
 [6.71953142e-01 1.04622275e-01 3.77040468e-02 ... 8.85093305e-03
  1.97779294e-03 3.53217649e-04]]


In [18]:
train_acc = accuracy_score(y_train_, train_result)
train_auc = f1_score(y_train_, train_result, average='macro')
print(train_acc)
print(train_auc)

0.72426875
0.6975893853374654


 下面代码是封装之后的，，上述相同部分用来测试

In [19]:
def train(param):
    model = XGBClassifier(**param)
    model.fit(x_train_, y_train_, eval_set=[(x_train_, y_train_)],
                       eval_metric=['mlogloss'],
                       early_stopping_rounds=10,  # 连续N次分值不再优化则提前停止
                       verbose=False
                      )
    train_result, train_proba = model.predict(x_train_), model.predict_proba(x_train_)
    train_acc = accuracy_score(y_train_, train_result)
    train_auc = f1_score(y_train_, train_result, average='macro')

    print("Train acc: %.2f%% Train auc: %.2f" % (train_acc*100.0, train_auc))
    
    return train_result, train_proba, train_acc, train_auc

In [14]:
def test(X_test, y_test):
    model = XGBClassifier(**param)
    model.fit(x_train_, y_train_, eval_set=[(x_train_, y_train_)],
                       eval_metric=['mlogloss'],
                       early_stopping_rounds=10,  # 连续N次分值不再优化则提前停止
                       verbose=False
                      )
    result, proba = model.predict(X_test), model.predict_proba(X_test)
    acc = accuracy_score(y_test, result)
    f1 = f1_score(y_test, result, average='macro')
    
    print("acc: %.2f%% F1_score: %.2f%%" % (acc*100.0, f1))

    return result, proba, acc, f1

## 网格搜索

In [20]:
def grid(param_grid):
    xgb_model = XGBClassifier(nthread=20)
    clf = GridSearchCV(xgb_model, param_grid, scoring='f1_macro', cv=2, verbose=1)
    clf.fit(x_train_, y_train_)
    print("Best score: %f using parms: %s" % (clf.best_score_, clf.best_params_))
    return clf.best_params_, clf.best_score_

In [21]:
%%time
train_result, train_proba, train_acc, train_auc = train(param)




KeyboardInterrupt: 

In [None]:
%%time
result, proba, acc, f1 = test(x_valid_, y_valid_)


In [None]:
print("train运行结果：" + "-"*100)
print("train_result : ")
print(train_result)

print("train_proba : ")
print(train_proba)

print("train_acc : " + train_acc)
print("train_auc : " + train_auc)

print("-"*150)

print("test运行结果：" + "-"*100)
print("result : ")
print(result)

print("proba : ")
print(proba)

print("acc : " + acc)
print("f1 : " + f1)

## 结果文件保存

In [None]:
submission = pd.read_csv(r'./test_a_sample_submit.csv'))
submission['label'] = preds