In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pickle

In [2]:
train_set = pd.read_csv('./train_set.csv', sep='\t')
X_train, X_test, y_train, y_test = train_test_split(
    train_set['text'], 
    train_set['label'], 
    random_state=42
)

## Vector

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
TfidfVec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,2),#(1,3)
    min_df=3,  # 4  5
    max_df=0.9, # 0.95 1.0 
    use_idf=True,
    max_features=3000,
    smooth_idf=True, 
    sublinear_tf=True
).fit(train_set['text'])

# 保存
with open('saved/TfidfVec_00.pickle','wb') as f:
    pickle.dump(TfidfVec, f)

In [None]:
# 读取
with open('saved/TfidfVec_00.pickle','rb') as f:
    TfidfVec = pickle.load(f)

In [None]:
X_train_tfidf = TfidfVec.transform(X_train)
X_test_tfidf = TfidfVec.transform(X_test)

In [None]:
# 特征选择
from sklearn.feature_selection import SelectKBest,chi2
select_model = SelectKBest(chi2, k=1000).fit(X_train_tfidf, y_train)
X_train_tfidf_s1000 = select_model.transform(X_train_tfidf)
X_test_tfidf_s1000 = select_model.transform(X_test_tfidf)

### word2vec

word2vec 1000 1w

In [None]:
train = pd.read_csv('train_emb1000.csv', sep='\t', header=None)
X_train_emb1000 = train.iloc[:,:-1]
y_train_emb1000 = train.iloc[:,-1]
test = pd.read_csv('test_emb1000.csv', sep='\t', header=None)
X_test_emb1000 = test.iloc[:,:-1]
y_test_emb1000 = test.iloc[:,-1]

word2vec 256 20w

In [4]:
train = pd.read_csv('train_emb256.csv', sep='\t', header=None)
X_train_emb256 = train.iloc[:,:-1]
y_train_emb256 = train.iloc[:,-1]
test = pd.read_csv('test_emb256.csv', sep='\t', header=None)
X_test_emb256 = test.iloc[:,:-1]
y_test_emb256 = test.iloc[:,-1]

别人的训练的文件

In [None]:
# word2vec emb
embfile = './emb/word2vec.txt'
with open(embfile, encoding='utf-8') as f:
            lines = f.readlines()
            items = lines[0].split()
            word_count, embedding_dim = int(items[0]), int(items[1])
word2vec_emb = pd.DataFrame([line.strip().split(' ') for line in lines[1:]])
word2vec_emb = word2vec_emb.loc[:,1:].set_index(word2vec_emb[0].values)

In [None]:
def sentence_emb(emb, sentence):
    vec_emb = set(emb.index) & set(sentence.split(' '))
    mean_vec = pd.Series([0.]*emb.shape[1]).astype(float)
    for vec in vec_emb:
        mean_vec += emb.loc[vec].astype(float)
    return mean_vec/len(vec_emb)
X_train_emb = X_train.map(lambda x:sentence_emb(word2vec_emb, x))

## LogisticRegression

In [6]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

3000->3000

In [None]:
lr.fit(X_train_tfidf, y_train)

# 保存
with open('saved/lr_00.pickle','wb') as f:
    pickle.dump(lr, f)

In [None]:
# 读取
with open('saved/lr_00.pickle','rb') as f:
    lr = pickle.load(f)

y_pred = lr.predict(X_test_tfidf)
f1_score(y_test, y_pred, average='macro')

3000->1000

In [None]:
lr.fit(X_train_tfidf_s1000, y_train)
# 保存
with open('saved/lr_01.pickle','wb') as f:
    pickle.dump(lr, f)

In [None]:
# 读取
with open('saved/lr_01.pickle','rb') as f:
    lr = pickle.load(f)

y_pred = lr.predict(X_test_tfidf_s1000)
f1_score(y_test, y_pred, average='macro')

word2vec 1000

In [None]:
lr.fit(X_train_emb1000, y_train_emb1000)
# 保存
with open('saved/lr_02.pickle','wb') as f:
    pickle.dump(lr, f)

In [None]:
# 读取
with open('saved/lr_02.pickle','rb') as f:
    lr = pickle.load(f)

y_pred = lr.predict(X_test_emb1000)
f1_score(y_test_emb1000, y_pred, average='macro')

In [None]:
# 训练集
y_pred = lr.predict(X_train_emb1000)
f1_score(y_train_emb1000, y_pred, average='macro')

word2vec 256 20w

In [7]:
lr.fit(X_train_emb256, y_train_emb256)
# 保存
with open('saved/lr_03.pickle','wb') as f:
    pickle.dump(lr, f)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
# 读取
with open('saved/lr_03.pickle','rb') as f:
    lr = pickle.load(f)

y_pred = lr.predict(X_test_emb256)
f1_score(y_test_emb256, y_pred, average='macro')

0.8724104080617358

## NB

In [9]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

3000->3000

In [None]:
mnb.fit(X_train_tfidf, y_train)

# 保存
with open('saved/mnb_00.pickle','wb') as f:
    pickle.dump(mnb, f)

In [None]:
# 读取
with open('saved/mnb_00.pickle','rb') as f:
    mnb = pickle.load(f)

y_pred = mnb.predict(X_test_tfidf)
f1_score(y_test, y_pred, average='macro')

3000->1000

In [None]:
mnb.fit(X_train_tfidf_s1000, y_train)

# 保存
with open('saved/mnb_01.pickle','wb') as f:
    pickle.dump(mnb, f)

In [None]:
# 读取
with open('saved/mnb_01.pickle','rb') as f:
    mnb = pickle.load(f)

y_pred = mnb.predict(X_test_tfidf_s1000)
f1_score(y_test, y_pred, average='macro')

word2vec 1000

In [None]:
mnb.fit(X_train_emb1000 - X_train_emb1000.min(axis=0), y_train_emb1000)
# 保存
with open('saved/mnb_02.pickle','wb') as f:
    pickle.dump(mnb, f)

In [None]:
# 读取
with open('saved/mnb_02.pickle','rb') as f:
    mnb = pickle.load(f)

y_pred = mnb.predict(X_test_emb1000 - X_train_emb1000.min(axis=0))
f1_score(y_test_emb1000, y_pred, average='macro')

In [None]:
# 训练集
y_pred = mnb.predict(X_train_emb1000 - X_train_emb1000.min(axis=0))
f1_score(y_train_emb1000, y_pred, average='macro')

word2vec 256 20w

In [11]:
mnb.fit(X_train_emb256 - X_train_emb256.min(axis=0), y_train_emb256)
# 保存
with open('saved/mnb_03.pickle','wb') as f:
    pickle.dump(mnb, f)

In [12]:
# 读取
with open('saved/mnb_03.pickle','rb') as f:
    mnb = pickle.load(f)

y_pred = mnb.predict(X_test_emb256 - X_train_emb256.min(axis=0))
f1_score(y_test_emb256, y_pred, average='macro')

0.5408563187763433

## GBDT

In [13]:
import lightgbm as lgb
lgbc =lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=2**6,
    max_depth=100, 
    learning_rate=0.088,
    n_estimators=500,
    objective='multiclass',
    subsample=0.72,#
    colsample_bytree=0.599,#
    reg_alpha=0.001,
    reg_lambda=0.599,#l2
    n_jobs=-1, #
    num_class=14,#
    silent=False, 
    random_state=42,
#     class_weight=20,
    min_child_samples=21,
    metric='multi_logloss',
    device='cpu',
)

3000->3000

In [None]:
lgbc.fit(X_train_tfidf,
         y_train,
         eval_set=(X_test_tfidf, y_test),
         early_stopping_rounds=100)
# 保存
with open('saved/lgbc_00.pickle', 'wb') as f:
    pickle.dump(lgbc, f)

In [None]:
# 读取
with open('saved/lgbc_00.pickle','rb') as f:
    lgbc = pickle.load(f)
y_pred = lgbc.predict(X_test_tfidf)
f1_score(y_test, y_pred, average='macro')

3000->1000

In [None]:
lgbc.fit(X_train_tfidf_s1000,
         y_train,
         eval_set=(X_test_tfidf_s1000, y_test),
         early_stopping_rounds=100)
# 保存
with open('saved/lgbc_01.pickle', 'wb') as f:
    pickle.dump(lgbc, f)

In [None]:
# 读取
with open('saved/lgbc_01.pickle','rb') as f:
    lgbc = pickle.load(f)
y_pred = lgbc.predict(X_test_tfidf_s1000)
f1_score(y_test, y_pred, average='macro')

word2vec 1000 1w

In [None]:
lgbc.fit(X_train_emb1000,
         y_train_emb1000,
         eval_set=(X_test_emb1000, y_test_emb1000),
         early_stopping_rounds=100)
# 保存
with open('saved/lgbc_02.pickle', 'wb') as f:
    pickle.dump(lgbc, f)

In [None]:
# 读取
with open('saved/lgbc_02.pickle','rb') as f:
    lgbc = pickle.load(f)
y_pred = lgbc.predict(X_test_emb1000)
f1_score(y_test_emb1000, y_pred, average='macro')

word2vec 256 20w

In [14]:
lgbc.fit(X_train_emb256,
         y_train,
         eval_set=(X_test_emb256, y_test),
         early_stopping_rounds=100)
# 保存
with open('saved/lgbc_03.pickle', 'wb') as f:
    pickle.dump(lgbc, f)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65280
[LightGBM] [Info] Number of data points in the train set: 150000, number of used features: 256
[LightGBM] [Info] Start training from score -1.639176
[LightGBM] [Info] Start training from score -1.688265
[LightGBM] [Info] Start training from score -1.854657
[LightGBM] [Info] Start training from score -2.205277
[LightGBM] [Info] Start training from score -2.583976
[LightGBM] [Info] Start training from score -2.790671
[LightGBM] [Info] Start training from score -2.989088
[LightGBM] [Info] Start training from score -3.117373
[LightGBM] [Info] Start training from score -3.241803
[LightGBM] [Info] Start training from score -3.518855
[LightGBM] [Info] Start training from score -3.700412
[LightGBM] [Info] Start training from score -4.157071
[LightGBM] [Info] Start training from score -4.698749
[LightGBM] [Info] Start training from score -5.374479
[1]	valid_0's multi_logloss: 1.75487
Training until vali

[182]	valid_0's multi_logloss: 0.333151
[183]	valid_0's multi_logloss: 0.333243
[184]	valid_0's multi_logloss: 0.33331
[185]	valid_0's multi_logloss: 0.333338
[186]	valid_0's multi_logloss: 0.333432
[187]	valid_0's multi_logloss: 0.333493
[188]	valid_0's multi_logloss: 0.333541
[189]	valid_0's multi_logloss: 0.33358
[190]	valid_0's multi_logloss: 0.33367
[191]	valid_0's multi_logloss: 0.333766
[192]	valid_0's multi_logloss: 0.333876
[193]	valid_0's multi_logloss: 0.333995
[194]	valid_0's multi_logloss: 0.334086
[195]	valid_0's multi_logloss: 0.33414
[196]	valid_0's multi_logloss: 0.334322
[197]	valid_0's multi_logloss: 0.334327
[198]	valid_0's multi_logloss: 0.334495
[199]	valid_0's multi_logloss: 0.334613
[200]	valid_0's multi_logloss: 0.33476
[201]	valid_0's multi_logloss: 0.33481
[202]	valid_0's multi_logloss: 0.334804
[203]	valid_0's multi_logloss: 0.334897
[204]	valid_0's multi_logloss: 0.334922
[205]	valid_0's multi_logloss: 0.334953
[206]	valid_0's multi_logloss: 0.335034
[207]	

In [15]:
# 读取
with open('saved/lgbc_03.pickle','rb') as f:
    lgbc = pickle.load(f)
y_pred = lgbc.predict(X_test_emb256)
f1_score(y_test_emb256, y_pred, average='macro')

0.8735853530290284

## Voting

In [None]:
all_pred = pd.DataFrame([
    lr.predict(X_test_tfidf), 
    mnb.predict(X_test_tfidf), 
    lgbc.predict(X_test_tfidf)
])

In [None]:
value = all_pred.T.apply(lambda x: x.value_counts().index[0], axis=1)

In [None]:
f1_score(y_test, value.values, average='macro')