In [1]:
import pandas as pd
import numpy as np
import torch
import time
import random
import os

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.layers import Input, Dense, Conv1D, Dropout, MaxPooling1D, Flatten, Embedding, concatenate, LSTM, Activation
from keras.models import Model
from keras.models import Sequential

from keras.utils import np_utils

import torch
from torch.utils import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

Using TensorFlow backend.


In [2]:
train = pd.read_csv('data/train_one_label.csv')
test = pd.read_csv("data/test.csv")

In [3]:
def word_vectors(train, test):
    '''
    使用keras对文本数据做预处理
    '''
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(train['comment_text'])

    train_seq = tokenizer.texts_to_sequences(train['comment_text'])
    test_seq = tokenizer.texts_to_sequences(test['comment_text'])

    x_train = sequence.pad_sequences(train_seq, maxlen=200)  # shape  (* , 200)
    y_train = train['toxic']
    x_test = sequence.pad_sequences(test_seq, maxlen=200)  # shape (* , 200)
    return torch.tensor(x_train).long(), torch.tensor(y_train.values).long(), torch.tensor(x_test).long()

In [4]:
x_train, y_train, x_test = word_vectors(train, test)

In [5]:
train = data.TensorDataset(x_train, y_train)
test = data.TensorDataset(x_test)

In [6]:
train_iter = data.DataLoader(train, batch_size=8, shuffle=True)

In [7]:
class LSTM(nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()
        self.word_embeddings = nn.Embedding(1000, 300)  # embedding之后的shape: torch.Size([200, 8, 300])
        # 若使用预训练的词向量，需在此处指定预训练的权重
        # embedding.weight.data.copy_(weight_matrix)
        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=1)  # torch.Size([200, 8, 128])
        self.decoder = nn.Linear(128, 2)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out = self.lstm(embeds)[0]  # torch.Size([8, 200, 128])
        # 取最后一个时间步
        final = lstm_out[:, -1, :]  # 8*128
        y = self.decoder(final)  # 8*2 
        return y


In [8]:
model = LSTM()
model.train()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
loss_funtion = F.cross_entropy

In [9]:
for epoch, batch in enumerate(train_iter):
    optimizer.zero_grad()
    predicted = model(batch[0])
    print(batch)
    loss = loss_funtion(predicted, batch[1])
    loss.backward()
    optimizer.step()
    print(loss)

[tensor([[  0,   0,   0,  ..., 104,   4, 350],
        [  0,   0,   0,  ...,   2,  28,  78],
        [  0,   0,   0,  ..., 251, 252, 253],
        ...,
        [  0,   0,   0,  ..., 108,  49,  97],
        [  0,   0,   0,  ...,   1, 274, 140],
        [  0,   0,   0,  ...,   1, 475, 476]]), tensor([0, 0, 0, 0, 0, 0, 0, 0])]
tensor(0.7199, grad_fn=<NllLossBackward>)
[tensor([[  0,   0,   0,  ..., 538, 539, 540],
        [  0,   0,   0,  ...,  59,  14, 319],
        [  0,   0,   0,  ..., 413,   1, 414],
        ...,
        [  0,   0,   0,  ..., 201,  19, 622],
        [  0,   0,   0,  ..., 533, 534, 535],
        [ 34, 632,   8,  ...,   2,  14, 705]]), tensor([0, 0, 0, 0, 0, 0, 1, 0])]
tensor(0.6202, grad_fn=<NllLossBackward>)
[tensor([[  0,   0,   0,  ...,  24, 432, 197],
        [  0,   0,   0,  ...,  11,  42, 316],
        [  0,   0,   0,  ..., 574, 575, 576],
        ...,
        [  0,   0,   0,  ..., 578,  93, 579],
        [  0,   0,   0,  ...,  19, 151,  11],
        [  0,   0,  

# 补充：使用Keras搭建LSTM模型

In [142]:
max_features = 1000   # 词汇表大小
# cut texts after this number of words (among top max_features most common words)
# 裁剪文本为 maxlen 大小的长度（取最后部分，基于前 max_features 个常用词）
maxlen = 200 
batch_size = 8  # 批数据量大小

model = Sequential()
# 嵌入层，每个词维度为128
model.add(Embedding(max_features, 128))
# LSTM层，输出维度128，可以尝试着换成 GRU 试试
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  # try using a GRU instead, for fun
model.add(Dense(1))   # 单神经元全连接层
model.add(Activation('sigmoid'))   # sigmoid 激活函数层

In [143]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [144]:
model.fit(x_train.numpy(), y_train.numpy(), 
          validation_split=0.1, 
          batch_size=batch_size, 
          epochs=3)

Train on 22 samples, validate on 3 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x127685748>