# GRU+FastText 文本分类

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.initializers import Constant

Using TensorFlow backend.


In [2]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
MAX_SEQUENCE_LENGTH = 100  # length every encoded comment is padded / cut to
EMBEDDING_DIM = 300        # width of one word vector (crawl-300d vectors)
MAX_NUM_WORDS = 30000      # vocabulary cap handed to the Tokenizer

## 1. 数据预处理

### 1.1 读取训练数据

In [3]:
# Load the labelled training comments and preview the first rows.
train = pd.read_csv('./data/train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Raw training comments as an array of strings; missing comments become ''.
X_train_texts = train['comment_text'].fillna('').values
X_train_texts[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [5]:
# Target matrix: one column per toxicity label, in the order listed here.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_train = train[label_columns].values
y_train[0]

array([0, 0, 0, 0, 0, 0], dtype=int64)

### 1.2 读取测试数据

In [6]:
# Load the unlabelled test comments and preview the first rows.
test = pd.read_csv('./data/test.csv')
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [7]:
# Raw test comments as an array of strings; missing comments become ''.
X_test_texts = test['comment_text'].fillna('').values
X_test_texts[0]

"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"

### 1.3 建立token

In [8]:
# Fit the tokenizer's vocabulary on the training and the test comments together.
token = text.Tokenizer(num_words=MAX_NUM_WORDS)
all_texts = list(X_train_texts) + list(X_test_texts)
token.fit_on_texts(all_texts)

### 1.3 将“评论文字”转换为“数字列表”

In [9]:
# Encode every comment as a list of the tokenizer's integer word indices.
X_train_seqs = token.texts_to_sequences(X_train_texts)
X_test_seqs  = token.texts_to_sequences(X_test_texts)

In [10]:
# Inspect the first encoded training comment.
X_train_seqs[0]

[733,
 78,
 1,
 140,
 131,
 182,
 30,
 712,
 4438,
 10284,
 1252,
 86,
 368,
 51,
 2230,
 14039,
 49,
 6744,
 15,
 60,
 2624,
 151,
 7,
 2832,
 33,
 115,
 1246,
 16129,
 2517,
 5,
 50,
 59,
 256,
 1,
 370,
 31,
 1,
 46,
 29,
 144,
 72,
 3931,
 89,
 4208,
 6368,
 2687,
 1183]

### 1.4 截长补短让所有“数字列表”的长度都为100

In [11]:
# Pad / truncate every index list to exactly MAX_SEQUENCE_LENGTH entries so
# the comments can be stacked into one 2-D array per data set.
X_train = sequence.pad_sequences(X_train_seqs, maxlen = MAX_SEQUENCE_LENGTH)
X_test  = sequence.pad_sequences(X_test_seqs, maxlen = MAX_SEQUENCE_LENGTH)

In [12]:
# Inspect the first padded training comment.
X_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,   733,
          78,     1,   140,   131,   182,    30,   712,  4438, 10284,
        1252,    86,   368,    51,  2230, 14039,    49,  6744,    15,
          60,  2624,   151,     7,  2832,    33,   115,  1246, 16129,
        2517,     5,    50,    59,   256,     1,   370,    31,     1,
          46,    29,   144,    72,  3931,    89,  4208,  6368,  2687,
        1183])

## 2 建立嵌入层

In [30]:
# word -> integer index mapping learned by the tokenizer.
word_index = token.word_index
print('Found {} unique tokens.'.format(len(word_index)))

Found 394787 unique tokens.


In [31]:
# Peek at the first five (word, index) pairs of the tokenizer vocabulary.
for shown, (k, v) in enumerate(word_index.items()):
    if shown == 5:
        break
    print(k, v)

the 1
to 2
of 3
a 4
and 5


### 2.1 单词到词向量的索引

In [41]:
import io

def load_vectors(fname):
    """Load a fastText-style text ``.vec`` file into a ``{word: vector}`` dict.

    Every data line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        fname: Path of the vector file. It is read as UTF-8 and undecodable
            bytes are ignored.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` numpy array. When a
        word appears on several lines, the last occurrence wins.

    Raises:
        ValueError: if a vector component cannot be converted to a float.
    """
    vocab_and_vectors = {}
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        for line_no, line in enumerate(handle):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                # fastText .vec files open with a "<vocab_size> <dim>" header.
                # It is metadata, not a word vector, so it must not be stored.
                continue
            # First field is the word, the remaining fields are its vector.
            vocab_and_vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vocab_and_vectors

In [42]:
# Parse the pre-trained vectors into a {word: vector} dict.
# NOTE(review): this path points at ../../data while the CSV files are read
# from ./data -- confirm the intended location.
embeddings_index = load_vectors('../../data/crawl-300d-2M.vec')        

Found 2000000 word vectors.


In [43]:
# Report how many entries were loaded from the vector file.
print('Found {} word vectors.'.format(len(embeddings_index)))

Found 2000000 word vectors.


In [44]:
# Spot-check the vector stored for the token 'word'.
embeddings_index['word']

array([ 1.300e-01,  1.817e-01, -2.394e-01, -3.343e-01, -1.782e-01,
        1.909e-01,  2.360e-01, -3.340e-02, -2.144e-01,  2.300e-03,
        2.884e-01,  1.728e-01,  9.580e-02, -1.631e-01, -1.694e-01,
        1.399e-01, -3.348e-01,  8.230e-02, -2.362e-01,  1.736e-01,
       -1.211e-01, -1.921e-01,  2.112e-01, -3.219e-01,  1.304e-01,
        6.290e-02,  2.469e-01,  2.399e-01,  3.390e-02, -2.660e-02,
       -2.490e-01,  1.621e-01, -2.325e-01, -6.460e-02, -5.870e-02,
        2.026e-01,  3.257e-01,  1.077e-01,  1.626e-01,  3.746e-01,
       -4.290e-02, -1.358e-01, -8.100e-02,  1.301e-01, -2.484e-01,
       -1.489e-01,  1.837e-01, -2.005e-01, -3.705e-01, -1.698e-01,
       -4.474e-01,  1.324e-01,  1.508e-01,  2.100e-03,  1.410e-02,
       -8.110e-02, -3.260e-02, -3.553e-01, -1.793e-01, -1.990e-02,
       -2.126e-01,  1.082e-01, -6.460e-02, -4.052e-01,  3.202e-01,
        1.000e-02, -9.220e-02, -2.408e-01, -4.420e-02, -4.280e-01,
        2.139e-01, -3.097e-01, -1.249e-01,  2.535e-01,  2.053e

### 2.2 单词索引到词向量的矩阵

In [45]:
# Assemble the lookup table for the Embedding layer: row i receives the
# pre-trained vector of the word whose tokenizer index is i.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Indices at or above the vocabulary cap are left out of the matrix.
    if i < MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        # Words without a pre-trained vector keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [48]:
# Spot-check row 18 of the embedding matrix.
embedding_matrix[18]

array([-1.06299996e-01,  6.69000000e-02,  6.37999997e-02,  3.28900009e-01,
        1.37999998e-02, -3.37000005e-02, -1.95899993e-01, -4.25999999e-01,
        7.24900007e-01, -5.04000001e-02, -6.88000023e-02,  7.53000006e-02,
       -7.86999986e-02, -4.23999988e-02,  3.37999985e-02, -4.00000019e-03,
        2.50900000e-01,  7.80000016e-02,  1.17500000e-01,  3.51000018e-02,
       -2.39700004e-01, -3.92999984e-02,  2.34999992e-02, -7.66000003e-02,
        2.00000009e-03, -5.13000004e-02, -1.24600001e-01,  9.61000025e-02,
       -6.45999983e-02, -1.26599997e-01, -1.28399998e-01,  2.61000004e-02,
       -2.44999994e-02, -1.48599997e-01,  1.29000004e-02, -1.02000004e-02,
        9.83000025e-02, -1.20800003e-01,  7.46000037e-02,  1.04099996e-01,
       -9.42000002e-02,  2.24199995e-01, -1.54300004e-01, -3.11999992e-02,
        7.51999989e-02,  2.85000000e-02, -4.08999994e-02,  4.36000004e-02,
        5.51999994e-02, -2.31000006e-01, -3.40000018e-02, -7.91100025e-01,
        8.62200022e-01,  

### 2.3 建立嵌入层

In [None]:
# Embedding layer initialised from the pre-trained matrix.
# trainable=False keeps those vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

## 3. 建立GRU模型

In [49]:
# Start an empty layer stack; the following cells append the layers one by one.
model = Sequential()

In [16]:
# Disabled alternative: an Embedding layer of size MAX_NUM_WORDS x EMBEDDING_DIM
# without the pre-trained weights. Kept for reference only; the model uses the
# embedding initialised from embedding_matrix instead.
#model.add(Embedding(input_dim = MAX_NUM_WORDS,
#                    output_dim = EMBEDDING_DIM,
#                    input_length=MAX_SEQUENCE_LENGTH))

Instructions for updating:
Colocations handled automatically by placer.


In [52]:
# Reuse the frozen, pre-trained `embedding_layer` built in section 2.3 instead
# of constructing a second, identical Embedding from the same matrix (that
# earlier layer was otherwise never used).
model.add(embedding_layer)

In [53]:
# Dropout with rate 0.2 on the embedded sequence.
model.add(Dropout(0.2))

In [54]:
# GRU with 300 units; its output per comment is a single 300-d vector
# (shape (None, 300) in the model summary).
model.add(GRU(units=300))

In [55]:
# Output layer: one sigmoid unit per label column (six labels in y_train).
model.add(Dense(units=6, activation='sigmoid'))

In [56]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          9000000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
gru_2 (GRU)                  (None, 300)               540900    
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 1806      
Total params: 9,542,706
Trainable params: 542,706
Non-trainable params: 9,000,000
_________________________________________________________________


## 4 训练模型

In [57]:
# The six sigmoid outputs are scored with binary cross-entropy; Adam is the
# optimizer and accuracy is reported alongside the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [58]:
# Train for two epochs, holding out 20% of the training rows for validation.
train_history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    verbose=2,
    validation_split=0.2,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/2
 - 1907s - loss: 0.0518 - acc: 0.9813 - val_loss: 0.0447 - val_acc: 0.9832
Epoch 2/2
 - 1882s - loss: 0.0428 - acc: 0.9836 - val_loss: 0.0431 - val_acc: 0.9835


## 5 进行预测

In [59]:
# Score the padded test comments in batches of 1024.
y_pred = model.predict(X_test, batch_size=1024)

In [60]:
# Write the model outputs into the six label columns of the sample submission
# and save the result as submission.csv.
# NOTE(review): assumes sample_submission.csv lists its rows in the same order
# as test.csv -- confirm.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submission = pd.read_csv('./data/sample_submission.csv')
submission[label_columns] = y_pred
submission.to_csv('submission.csv', index=False)