In [2]:
from tqdm import tqdm
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


def set_style():
    sns.set_style('white')
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['figure.figsize'] = [10, 10]

    pd.options.display.max_columns = 300
    pd.options.display.max_rows = 1000
    # plt.rcParams['figure.dpi'] = 200
    sns.set_palette('muted')  # 调色板颜色温和
    sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 1})

set_style()
import sys

In [11]:
train_data = open('data/track1_round1_train_20210222.csv','r').readlines()

In [16]:
train_data = [data.strip().split('|,|')  for data in train_data]

In [45]:
df_train = pd.DataFrame(train_data,columns=['report_id','description','label'])

In [46]:
df_train.head()

Unnamed: 0,report_id,description,label
0,0,623 328 538 382 399 400 478 842 698 137 492 26...,2.0
1,1,48 328 538 382 809 623 434 355 382 382 363 145...,
2,2,623 656 293 851 636 842 698 493 338 266 369 69...,15.0
3,3,48 328 380 259 439 107 380 265 172 470 290 693...,
4,4,623 328 399 698 493 338 266 14 177 415 511 647...,16.0


In [82]:
for i in range(17):
    df_train['label%d'%i] = [str(i) in label for label in  df_train['label']]

In [83]:
df_train.head()

Unnamed: 0,report_id,description,label,label0,label1,label2,label3,label4,label5,label6,label7,label8,label9,label10,label11,label12,label13,label14,label15,label16
0,0,623 328 538 382 399 400 478 842 698 137 492 26...,2.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,48 328 538 382 809 623 434 355 382 382 363 145...,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2,623 656 293 851 636 842 698 493 338 266 369 69...,15.0,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False
3,3,48 328 380 259 439 107 380 265 172 470 290 693...,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,4,623 328 399 698 493 338 266 14 177 415 511 647...,16.0,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True


In [None]:
df_trainin

In [40]:
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

In [91]:
labels = ['label%d'%i for i in range(17)]

In [92]:
df_train[labels]

Unnamed: 0,label0,label1,label2,label3,label4,label5,label6,label7,label8,label9,label10,label11,label12,label13,label14,label15,label16
0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,True,True,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False
9996,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
9997,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9998,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True


In [95]:
enc = OneHotEncoder()
train_y = enc.fit_transform(df_train[labels]).toarray()

train_y.shape

(10000, 34)

In [135]:
text = [i.split() for i in df_train.description]

In [117]:
df_train.description.apply(lambda x:len(x.split())).max()

104

In [120]:
text = [np.array(i.split()) for i in df_train.description]

In [126]:
words = []
for t in text:
    words.extend(t)

In [133]:
len(pd.DataFrame(words)[0].unique())

858

In [136]:
max_words = 1000
max_len = 120
tok = Tokenizer(num_words=max_words)  ## 使用的最大词语数为5000

In [137]:
tok.fit_on_texts(text)

In [138]:
for ii,iterm in enumerate(tok.word_index.items()):
    if ii < 10:
        print(iterm)
    else:
        break
print("===================")  
for ii,iterm in enumerate(tok.word_counts.items()):
    if ii < 10:
        print(iterm)
    else:
        break

('693', 1)
('328', 2)
('380', 3)
('698', 4)
('415', 5)
('177', 6)
('381', 7)
('809', 8)
('623', 9)
('266', 10)
('623', 5681)
('328', 14966)
('538', 2270)
('382', 4372)
('399', 4398)
('400', 822)
('478', 1418)
('842', 4328)
('698', 12959)
('137', 83)


In [139]:
train_seq = tok.texts_to_sequences(text)

In [140]:
train_seq_mat = sequence.pad_sequences(train_seq,maxlen=max_len)

In [141]:
## 定义LSTM模型
inputs = Input(name='inputs',shape=[max_len])
## Embedding(词汇表大小,batch大小,每个新闻的词长)
layer = Embedding(max_words+1,128,input_length=max_len)(inputs)
layer = LSTM(128)(layer)
layer = Dense(128,activation="relu",name="FC1")(layer)
layer = Dropout(0.5)(layer)
layer = Dense(34,activation="softmax",name="FC2")(layer)
model = Model(inputs=inputs,outputs=layer)
model.summary()
model.compile(loss="categorical_crossentropy",optimizer=RMSprop(),metrics=["accuracy"])

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 120)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 120, 128)          128128    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
FC1 (Dense)                  (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
FC2 (Dense)                  (None, 34)                4386      
Total params: 280,610
Trainable params: 280,610
Non-trainable params: 0
_____________________________________________________

In [142]:
## 模型训练
model_fit = model.fit(train_seq_mat,train_y,batch_size=128,epochs=10,
                      callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)] ## 当val-loss不再提升时停止训练
                     )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
