In [1]:
from tqdm import tqdm
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


def set_style():
    sns.set_style('white')
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['figure.figsize'] = [10, 10]

    pd.options.display.max_columns = 300
    pd.options.display.max_rows = 1000
    # plt.rcParams['figure.dpi'] = 200
    sns.set_palette('muted')  # 调色板颜色温和
    sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 1})

set_style()
import sys

In [None]:
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

import tensorflow as tf
print(tf.__version__)
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras import Sequential

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Conv1D,MaxPooling1D, Flatten, BatchNormalization
from keras.layers import concatenate

In [4]:
!ls data

track1_round1_testA_20210222.csv  track1_round1_train_20210222.csv


In [5]:
test_data = open('data/track1_round1_testA_20210222.csv').readlines()
test_data = [data.strip().split('|,|') for data in test_data]
test_dataset = pd.DataFrame(test_data, columns=['report_id', 'description'])
test_dataset['words'] = test_dataset.description.apply(lambda x:x.split())

test_dataset.head()

Unnamed: 0,report_id,description,words
0,0,852 328 697 538 142 355 582 800 728 4 647 169 ...,"[852, 328, 697, 538, 142, 355, 582, 800, 728, ..."
1,1,380 358 343 654 171 832 47 832 690 693 48 563 ...,"[380, 358, 343, 654, 171, 832, 47, 832, 690, 6..."
2,2,751 335 834 582 717 583 585 693 623 328 107 38...,"[751, 335, 834, 582, 717, 583, 585, 693, 623, ..."
3,3,623 328 649 582 488 12 578 623 538 382 382 265...,"[623, 328, 649, 582, 488, 12, 578, 623, 538, 3..."
4,4,83 293 398 797 382 363 145 424 693 698 800 691...,"[83, 293, 398, 797, 382, 363, 145, 424, 693, 6..."


In [6]:
train_data = open('data/track1_round1_train_20210222.csv','r').readlines()

train_data = [data.strip().split('|,|')  for data in train_data]

train_dataset = pd.DataFrame(train_data,columns=['report_id','description','label'])
train_dataset['words'] = train_dataset.description.apply(lambda x:x.split())

train_dataset.head()

Unnamed: 0,report_id,description,label,words
0,0,623 328 538 382 399 400 478 842 698 137 492 26...,2.0,"[623, 328, 538, 382, 399, 400, 478, 842, 698, ..."
1,1,48 328 538 382 809 623 434 355 382 382 363 145...,,"[48, 328, 538, 382, 809, 623, 434, 355, 382, 3..."
2,2,623 656 293 851 636 842 698 493 338 266 369 69...,15.0,"[623, 656, 293, 851, 636, 842, 698, 493, 338, ..."
3,3,48 328 380 259 439 107 380 265 172 470 290 693...,,"[48, 328, 380, 259, 439, 107, 380, 265, 172, 4..."
4,4,623 328 399 698 493 338 266 14 177 415 511 647...,16.0,"[623, 328, 399, 698, 493, 338, 266, 14, 177, 4..."


In [7]:
dataset = train_dataset.append(test_dataset)
dataset.head()

Unnamed: 0,report_id,description,label,words
0,0,623 328 538 382 399 400 478 842 698 137 492 26...,2.0,"[623, 328, 538, 382, 399, 400, 478, 842, 698, ..."
1,1,48 328 538 382 809 623 434 355 382 382 363 145...,,"[48, 328, 538, 382, 809, 623, 434, 355, 382, 3..."
2,2,623 656 293 851 636 842 698 493 338 266 369 69...,15.0,"[623, 656, 293, 851, 636, 842, 698, 493, 338, ..."
3,3,48 328 380 259 439 107 380 265 172 470 290 693...,,"[48, 328, 380, 259, 439, 107, 380, 265, 172, 4..."
4,4,623 328 399 698 493 338 266 14 177 415 511 647...,16.0,"[623, 328, 399, 698, 493, 338, 266, 14, 177, 4..."


In [8]:
mlb = MultiLabelBinarizer()

label = mlb.fit_transform(train_dataset.label.apply(lambda x:x.split()))

label.shape

(10000, 17)

In [9]:
tokenizer=Tokenizer()  #创建一个Tokenizer对象
#fit_on_texts函数可以将输入的文本中的每个词编号，编号是根据词频的，词频越大，编号越小
tokenizer.fit_on_texts(dataset['words'])
vocab=tokenizer.word_index #得到每个词的编号
# 将每个样本中的每个词转换为数字列表，使用每个词的编号进行编号

In [10]:
train_data =tokenizer.texts_to_sequences(train_dataset['words'])
#将超过固定值的部分截掉，不足的在最前面用0填充
train_data_padded_seqs = pad_sequences(train_data, maxlen=100, value = len(vocab))

test_data =tokenizer.texts_to_sequences(test_dataset['words'])
#将超过固定值的部分截掉，不足的在最前面用0填充
test_data_padded_seqs = pad_sequences(test_data, maxlen=100, value = len(vocab))

In [11]:
train_data_padded_seqs.shape

(10000, 100)

## CNN

In [12]:
model = Sequential()
model.add(Embedding(len(vocab) + 1, 300, input_length=100)) #使用Embeeding层将每个词编码转换为词向量
model.add(Conv1D(256, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(128, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(64, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())  # (批)规范化层
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(17, activation='softmax'))
model.summary()

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(train_data_padded_seqs, label, epochs=50, batch_size=128, validation_split=0.1)
# y_predict = model.predict_classes(x_test_padded_seqs)  # 预测的是类别，结果就是类别号
# y_predict = list(map(str, y_predict))
# print('准确率', metrics.accuracy_score(y_test, y_predict))
# print('平均f1-score:', metrics.f1_score(y_test, y_predict, average='weighted'))


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 300)          257700    
_________________________________________________________________
conv1d (Conv1D)              (None, 100, 256)          384256    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 34, 256)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 34, 128)           163968    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 12, 64)            24640     
_________________________________________________________________
flatten (Flatten)            (None, 768)               0

KeyboardInterrupt: 

## textCnn

In [None]:
main_input = Input(shape=(100,), dtype='float64')
# 词嵌入（使用预训练的词向量）
embedder = Embedding(len(vocab) + 1, 300, input_length=100, trainable=False)
embed = embedder(main_input)
# 词窗大小分别为3,4,5
cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
cnn1 = MaxPooling1D(pool_size=48)(cnn1)
cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
cnn2 = MaxPooling1D(pool_size=47)(cnn2)
cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
cnn3 = MaxPooling1D(pool_size=46)(cnn3)
# 合并三个模型的输出向量
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
main_output = Dense(17, activation='softmax')(drop)
model = Model(inputs=main_input, outputs=main_output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_data_padded_seqs, label, epochs=50, batch_size=200,validation_split=0.1)


## Bilstm

In [15]:
from keras import layers

In [17]:
inputs = Input(shape=(100,), dtype='float64')
embedder = Embedding(len(vocab) + 1, 300, input_length=100, trainable=False)
embed = embedder(inputs)
# Embed each integer in a 128-dimensional vector
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(embed)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = Dense(17, activation='softmax')(x)
model = Model(inputs, outputs)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 300)          257700    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          186880    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_3 (Dense)              (None, 17)                2193      
Total params: 545,589
Trainable params: 287,889
Non-trainable params: 257,700
_________________________________________________________________


In [18]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_data_padded_seqs, label, epochs=50, batch_size=200,validation_split=0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f0a0869e0b8>

## Transformer

In [19]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [20]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [None]:
embedder = Embedding(len(vocab) + 1, 300, input_length=100, trainable=False)


In [21]:
embed_dim = 300  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 300  # Hidden layer size in feed forward network inside transformer
maxlen=100
vocab_size = len(vocab) + 1

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(17, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

AttributeError: module 'keras.layers' has no attribute 'MultiHeadAttention'

## Prediction

In [194]:
test_pre = model.predict(test_data_padded_seqs)

In [209]:
result = ['|%s'%' '.join(['%.16f'%i for i in pre]) for pre in test_pre]

In [210]:
result[0]

'|0.0026591273490340 0.0033524620812386 0.0011169621720910 0.0001841006014729 0.0001974138431251 0.0000606569228694 0.0001457884209231 0.0008011281606741 0.0000141518821692 0.0007482352084480 0.0000825503811939 0.0000962369231274 0.0000298136310448 0.0000104138371171 0.4743624925613403 0.0229320097714663 0.4932065010070801'

In [213]:
test_dataset['report_ID'] = test_dataset['report_id'].apply(lambda x:'%s|'%x)
test_dataset['Prediction'] =result
test_dataset.head()

Unnamed: 0,report_id,description,words,Prediction,report_ID
0,0,852 328 697 538 142 355 582 800 728 4 647 169 ...,"[852, 328, 697, 538, 142, 355, 582, 800, 728, ...",|0.0026591273490340 0.0033524620812386 0.00111...,0|
1,1,380 358 343 654 171 832 47 832 690 693 48 563 ...,"[380, 358, 343, 654, 171, 832, 47, 832, 690, 6...",|0.0569489412009716 0.0414477922022343 0.04517...,1|
2,2,751 335 834 582 717 583 585 693 623 328 107 38...,"[751, 335, 834, 582, 717, 583, 585, 693, 623, ...",|0.0179503764957190 0.0121990703046322 0.00019...,2|
3,3,623 328 649 582 488 12 578 623 538 382 382 265...,"[623, 328, 649, 582, 488, 12, 578, 623, 538, 3...",|0.1098752319812775 0.0348254181444645 0.02064...,3|
4,4,83 293 398 797 382 363 145 424 693 698 800 691...,"[83, 293, 398, 797, 382, 363, 145, 424, 693, 6...",|0.0051004556007683 0.0197584703564644 0.02819...,4|


In [214]:
test_dataset[['report_ID','Prediction']].to_csv('submit.csv',index = False,header=None)