In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Read the CSV named by `input_file` into a DataFrame; later cells use its Tweet and Label columns.
input_file = 'clean_data.csv'
file = pandas.read_csv(filepath_or_buffer=input_file)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Inputs are the tweet texts, targets are their labels.
x, y = file.Tweet, file.Label
seed = 334

# Split into train / validation / test sets at a (0.98, 0.01, 0.01) ratio:
# first hold out 2% of the rows, then cut that hold-out in half.
x_train, x_validation_test, y_train, y_validation_test = train_test_split(
    x, y,
    test_size=0.02,
    random_state=seed,
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_test, y_validation_test,
    test_size=0.5,
    random_state=seed,
)

# print(len(x))
# print(len(x_train), len(x_validation), len(x_test))

| <center><font size=3>Set | <center><font size=3>Size | <center><font size=3>Proportion |
|--|--|--|
| <center><font size=3>Train | <center><font size=3>1565182 | <center><font size=3>98.00% |
| <center><font size=3>Validation | <center><font size=3>15971 | <center><font size=3>1.00% |
| <center><font size=3>Test | <center><font size=3>15972 | <center><font size=3>1.00% |
| <center><font size=3>Total | <center><font size=3>1597125 | <center><font size=3>100.00% |

In [5]:
from gensim.models import KeyedVectors

Using TensorFlow backend.


In [6]:
# Load the pre-generated word2vec models (vector length 100), CBOW first and then SG.
model_cbow, model_sg = (
    KeyedVectors.load(path)
    for path in ('model_cbow_100.word2vec', 'model_sg_100.word2vec')
)

# len(model_cbow.wv.vocab)

| <center><font size=3>Vocabulary Size | <center><font size=3>Vector Size| <center><font size=3>Vector Type |
|--|--|--|
| <center><font size=3>268681 | <center><font size=3>100 | <center><font size=3>numpy.ndarray |

In [7]:
from keras.preprocessing.text import Tokenizer

In [8]:
# Create a tokenizer with a word-frequency cutoff and turn each sentence's word
# sequence into a sequence of integers.
# Other cutoffs (150000, 200000, 250000) were sketched the same way, each with its
# own tokenizer_<N> / sequences_<N> pair; only the 100000 variant is active.
num_words = 100000
tokenizer_100000 = Tokenizer(num_words=num_words)

# The vocabulary is fitted on the training set only.
tokenizer_100000.fit_on_texts(x_train)

# Serialize the training set, then reuse the training-set tokenizer to serialize
# the validation set.
sequences_train_100000, sequences_validation_100000 = (
    tokenizer_100000.texts_to_sequences(texts)
    for texts in (x_train, x_validation)
)

In [9]:
# Determine the largest number of whitespace-separated words in any training sentence.
# A generator with its own loop variable is used so that the notebook-level `x`
# (assigned from file.Tweet in the split cell) is not overwritten by this cell.
# `default=0` keeps the result at 0 when x_train is empty.
len_max = max((len(tweet.split()) for tweet in x_train), default=0)

# len_max

# The length of the tokenized sequences cannot be used to determine the maximum sentence
# length here, because the raw data contains garbled text (see the two print statements
# below) and the preprocessing did not account for it.
# print(sequences_train_100000[510758:510759])
# print(x_train[510758:510759])
# For x_train, a single number `num` selects the element whose index == num, whereas
# num1:num2 selects the rows from position num1 to position num2.

In [10]:
from keras.preprocessing.sequence import pad_sequences

In [11]:
# Pad the variable-length sentences (at the end, up to length 70) so that every
# input has the same dimensions.
x_train_pad, x_validation_pad = [
    pad_sequences(seqs, maxlen=70, padding='post')
    for seqs in (sequences_train_100000, sequences_validation_100000)
]

# x_train_pad[510758:510759]

In [12]:
# Build the embedding matrices, i.e. the lookup tables that turn a word index into
# a vector when the sequences are fed to the network. Row `rank` holds the vector
# of the word whose tokenizer index is `rank`.
embedding_matrix_cbow = np.zeros((num_words, 100))
embedding_matrix_sg = np.zeros((num_words, 100))
embedding_matrix_cbow_sg = np.zeros((num_words, 200))
for word, rank in tokenizer_100000.word_index.items():
    # Ignore indices that do not fit in the matrices. `continue` (instead of `break`)
    # keeps the result correct even if word_index is not iterated in increasing-rank order.
    if rank >= num_words:
        continue
    in_cbow = word in model_cbow
    in_sg = word in model_sg
    if in_cbow:
        embedding_matrix_cbow[rank] = model_cbow[word]
    if in_sg:
        embedding_matrix_sg[rank] = model_sg[word]
    # The combined row is filled only when both models contain the word;
    # otherwise it stays all-zero.
    if in_cbow and in_sg:
        embedding_matrix_cbow_sg[rank] = np.concatenate((model_cbow[word], model_sg[word]))

# Some words with no real meaning, such as 'quot', have no vector in the word2vec models
# even though they are extremely frequent; the current handling leaves them as zero vectors.

# print(x_train[1:2])
# sequences_train_100000[1:2]
# print(embedding_matrix_cbow[297])
# print(model_cbow['play'])

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding 
from keras.callbacks import TensorBoard

In [29]:
# CNN on the frozen CBOW embedding matrix:
# Embedding -> Conv1D -> global max pooling -> Dense(256) -> Dense(128) -> sigmoid output.
model_cnn_embedding_matrix_cbow = Sequential([
    Embedding(100000, 100, weights=[embedding_matrix_cbow], input_length=70, trainable=False),
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_embedding_matrix_cbow.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 5 epochs, logging to TensorBoard, then persist the trained model.
model_cnn_embedding_matrix_cbow.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tmp/log/model_cnn_embedding_matrix_cbow/')],
)
model_cnn_embedding_matrix_cbow.save('./models/model_cnn_embedding_matrix_cbow.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
# CNN on the frozen SG embedding matrix:
# Embedding -> Conv1D -> global max pooling -> Dense(256) -> Dense(128) -> sigmoid output.
model_cnn_embedding_matrix_sg = Sequential([
    Embedding(100000, 100, weights=[embedding_matrix_sg], input_length=70, trainable=False),
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_embedding_matrix_sg.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 5 epochs, logging to TensorBoard, then persist the trained model.
model_cnn_embedding_matrix_sg.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tmp/log/model_cnn_embedding_matrix_sg/')],
)
model_cnn_embedding_matrix_sg.save('./models/model_cnn_embedding_matrix_sg.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


| <center><font size=3>Embedding Matrix | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>CBOW | <center><font size=3>0.3858 | <center><font size=3>0.8257 | <center><font size=3>0.3906 | <center><font size=3>0.8250 | <center><font size=3>3460 |
| <center><font size=3>SG | <center><font size=3>0.3767 | <center><font size=3>0.8302 | <center><font size=3>0.3861 | <center><font size=3>0.8276 | <center><font size=3>3452 |
| <center><font size=3>CBOW+SG | <center><font size=3>0.3784 | <center><font size=3>0.8298 | <center><font size=3>0.3866 | <center><font size=3>0.8249 | <center><font size=3>4755 |
 <center> **Filter Num = 100, Filter Size = 2, Epochs = 5**

In [14]:
# CNN on the frozen combined CBOW+SG embedding matrix (200-dim) with 100 convolution filters:
# Embedding -> Conv1D -> global max pooling -> Dense(256) -> Dense(128) -> sigmoid output.
model_cnn_filter_num_100 = Sequential([
    Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False),
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_filter_num_100.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 5 epochs, logging to TensorBoard, then persist the trained model.
model_cnn_filter_num_100.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tmp/log/model_cnn_filter_num_100/')],
)
model_cnn_filter_num_100.save('./models/model_cnn_filter_num_100.h5')

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Same architecture on the frozen CBOW+SG embeddings, but with 50 convolution filters.
model_cnn_filter_num_50 = Sequential([
    Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False),
    Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_filter_num_50.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 5 epochs, logging to TensorBoard, then persist the trained model.
model_cnn_filter_num_50.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tmp/log/model_cnn_filter_num_50/')],
)
model_cnn_filter_num_50.save('./models/model_cnn_filter_num_50.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a91613ba8>

In [21]:
# Same architecture on the frozen CBOW+SG embeddings, but with 150 convolution filters.
model_cnn_filter_num_150 = Sequential([
    Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False),
    Conv1D(filters=150, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_filter_num_150.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 5 epochs, logging to TensorBoard, then persist the trained model.
model_cnn_filter_num_150.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tmp/log/model_cnn_filter_num_150/')],
)
model_cnn_filter_num_150.save('./models/model_cnn_filter_num_150.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a93e3a208>

| <center><font size=3>Filter Num | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>50 | <center><font size=3>0.3901 | <center><font size=3>0.8230 | <center><font size=3>0.3948 | <center><font size=3>0.8203 | <center><font size=3>3196 |
| <center><font size=3>100 | <center><font size=3>0.3784 | <center><font size=3>0.8298 | <center><font size=3>0.3866 | <center><font size=3>0.8249 | <center><font size=3>4755 |
| <center><font size=3>150 | <center><font size=3>0.3726 | <center><font size=3>0.8329 | <center><font size=3>0.3830 | <center><font size=3>0.8293 | <center><font size=3>6061 |
 <center> **Filter Size = 2, Epochs = 5, Embedding Matrix = CBOW+SG**

In [31]:
# CNN on the frozen CBOW+SG embeddings with 100 filters of kernel size 3.
model_cnn_filter_size_3 = Sequential([
    Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False),
    Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_filter_size_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 5 epochs, logging to TensorBoard, then persist the trained model.
model_cnn_filter_size_3.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tmp/log/model_cnn_filter_size_3')],
)
model_cnn_filter_size_3.save('./models/model_cnn_filter_size_3.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# CNN on the frozen CBOW+SG embeddings with 100 filters of kernel size 5.
model_cnn_filter_size_5 = Sequential([
    Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False),
    Conv1D(filters=100, kernel_size=5, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_filter_size_5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 5 epochs, logging to TensorBoard, then persist the trained model.
model_cnn_filter_size_5.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=5,
    batch_size=32,
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tmp/log/model_cnn_filter_size_5')],
)
model_cnn_filter_size_5.save('./models/model_cnn_filter_size_5.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


| <center><font size=3>Filter Size | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>2 | <center><font size=3>0.3784 | <center><font size=3>0.8298 | <center><font size=3>0.3866 | <center><font size=3>0.8249 | <center><font size=3>4755 |
| <center><font size=3>3 | <center><font size=3>0.3729 | <center><font size=3>0.8331 | <center><font size=3>0.3817 | <center><font size=3>0.8267 | <center><font size=3>4781 |
| <center><font size=3>5 | <center><font size=3>0.3692 | <center><font size=3>0.8348 | <center><font size=3>0.3926 | <center><font size=3>0.8272 | <center><font size=3>6941 |
 <center> **Filter Num = 100, Epochs = 5, Embedding Matrix = CBOW+SG**

In [14]:
import tensorflow as tf

In [21]:
class TB(TensorBoard):
    """TensorBoard callback that additionally writes batch-level scalar summaries.

    Every `log_every` batches, each entry of the batch `logs` (except the
    'batch' and 'size' bookkeeping keys) is written as a scalar summary whose
    step is the running batch counter.

    Args:
        log_every: Write summaries once every this many batches (default 1).
        **kwargs: Forwarded unchanged to `TensorBoard.__init__` (e.g. `log_dir`).
    """

    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        # Number of batches seen so far; never reset in this class, so it serves as the global step.
        self.counter = 0

    def on_batch_end(self, batch, logs=None):
        """Write the batch metrics as scalar summaries, then defer to the parent hook.

        Args:
            batch: Index of the batch that just finished.
            logs: Mapping of metric name to value for this batch; may be None.
        """
        # The signature allows logs=None, so iterate over an empty mapping in that case.
        batch_logs = logs or {}
        self.counter += 1
        if self.counter % self.log_every == 0:
            for name, value in batch_logs.items():
                if name in ('batch', 'size'):
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                # Values exposing .item() are unwrapped with it; plain Python
                # numbers (which have no .item()) are used as they are.
                summary_value.simple_value = value.item() if hasattr(value, 'item') else value
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()

        # Hand the original `logs` object to the parent implementation unchanged.
        super().on_batch_end(batch, logs)

In [None]:
# Usage sketch for the TB callback (kept as a comment because it is a template, not runnable code):
# the callback list must be passed through the `callbacks=` keyword of `fit`, e.g.
#     model.fit(x, y, ..., callbacks=[TB(log_dir='...')])

In [23]:
# Small trial model used to exercise the TB callback: frozen CBOW embeddings,
# 20 convolution filters, and a sigmoid output directly after the pooling layer.
# (An intermediate Dense(128, activation='relu') layer was tried and is disabled.)
model = Sequential([
    Embedding(100000, 100, weights=[embedding_matrix_cbow], input_length=70, trainable=False),
    Conv1D(filters=20, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for a single epoch with batch-level TensorBoard logging.
model.fit(
    x_train_pad,
    y_train,
    validation_data=(x_validation_pad, y_validation),
    epochs=1,
    batch_size=32,
    verbose=1,
    callbacks=[TB(log_dir='./tmp/log/try')],
)

Train on 1565182 samples, validate on 15971 samples
Epoch 1/1


<keras.callbacks.History at 0x1a94820a90>