In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
input_file = 'clean_data.csv'
file = pandas.read_csv(input_file)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
x = file.Tweet
y = file.Label
seed = 334
# 按（0.98, 0.01, 0.01）的比例划分训练、验证、测试集
x_train, x_validation_test, y_train, y_validation_test = train_test_split(x, y, test_size=0.02, random_state=seed)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_test, y_validation_test, test_size=0.5, random_state=seed)

# print(len(x))
# print(len(x_train), len(x_val), len(x_test))

| <center><font size=3>Set | <center><font size=3>Size | <center><font size=3>Proportion |
|--|--|--|--|--|--|
| <center><font size=3>Train | <center><font size=3>1565182 | <center><font size=3>98.00% |
| <center><font size=3>Validation | <center><font size=3>15971 | <center><font size=3>1.00% |
| <center><font size=3>Test | <center><font size=3>15982 | <center><font size=3>1.00% |
| <center><font size=3>Total | <center><font size=3>1597125 | <center><font size=3>100.00% |

In [5]:
from gensim.models import KeyedVectors

Using TensorFlow backend.


In [6]:
# 载入预生成的词向量长度为100的word2vec模型
model_cbow = KeyedVectors.load('./word2vec_models/model_cbow_100.word2vec')
model_sg = KeyedVectors.load('./word2vec_models/model_sg_100.word2vec')

# len(model_cbow.wv.vocab)

| <center><font size=3>Vocabulary Size | <center><font size=3>Vector Size| <center><font size=3>Vector Type |
|--|--|--|
| <center><font size=3>268681 | <center><font size=3>100 | <center><font size=3>numpy.ndarray |

In [7]:
from keras.preprocessing.text import Tokenizer

In [8]:
# 创建不同词频阈值的分词器，并将每个句子中的单词序列变为数字序列
num_words=100000

tokenizer_100000 = Tokenizer(num_words=num_words)
# tokenizer_150000 = Tokenizer(num_words=150000)
# tokenizer_200000 = Tokenizer(num_words=200000)
# tokenizer_250000 = Tokenizer(num_words=250000)

tokenizer_100000.fit_on_texts(x_train)
# tokenizer_150000.fit_on_texts(x_train)
# tokenizer_200000.fit_on_texts(x_train)
# tokenizer_250000.fit_on_texts(x_train)

sequences_train_100000 = tokenizer_100000.texts_to_sequences(x_train)
# sequences_train_150000 = tokenizer_150000.texts_to_sequences(x_train)
# sequences_train_200000 = tokenizer_200000.texts_to_sequences(x_train)
# sequences_200000 = tokenizer_250000.texts_to_sequences(x_train)

# 利用训练集上的分词器对验证集和测试集进行序列化
sequences_validation_100000 = tokenizer_100000.texts_to_sequences(x_validation)
sequences_test_100000 = tokenizer_100000.texts_to_sequences(x_test)
# sequences_validation_150000 = tokenizer_150000.texts_to_sequences(x_validation)
# sequences_test_150000 = tokenizer_150000.texts_to_sequences(x_test)
# sequences_validation_200000 = tokenizer_200000.texts_to_sequences(x_validation)
# sequences_test_200000 = tokenizer_200000.texts_to_sequences(x_test)

In [9]:
# 确定句子中最大的单词数
len_max = 0
for x in x_train:
    temp = len(x.split())
    if temp > len_max:
        len_max = temp

# len_max

# 这里不能用sequence的长度来确定句子的最大长度，因为原始数据中存在乱码，如下面的两个print语句所示，而预处理的时候没有考虑到这一点
# print(sequences_100000[510758:510759])
# print(x_train[510758:510759])
# x_train 输入num时，为index=num的元素，输入num1:num2时，为第num1行到第num2行的元素

In [10]:
from keras.preprocessing.sequence import pad_sequences

In [11]:
# 对于变长的句子进行padding使其成为维度相同的标准输入
x_train_pad = pad_sequences(sequences_train_100000, maxlen=70, padding='post')
x_validation_pad = pad_sequences(sequences_validation_100000, maxlen=70, padding='post')
x_test_pad = pad_sequences(sequences_test_100000, maxlen=70, padding='post')

# x_train_pad[510758:510759]

In [12]:
# 创建嵌入矩阵（即在输入神经网络时将标号转为向量的索引）
embedding_matrix_cbow = np.zeros((num_words, 100))
embedding_matrix_sg = np.zeros((num_words, 100))
embedding_matrix_cbow_sg = np.zeros((num_words, 200))
for word, rank in tokenizer_100000.word_index.items():
    if rank >= num_words:
        break
    if word in model_cbow:
        embedding_matrix_cbow[rank] = model_cbow[word]
    if word in model_sg:
        embedding_matrix_sg[rank] = model_sg[word]
    if (word in model_cbow) and (word in model_sg):
        embedding_matrix_cbow_sg[rank] = np.append(model_cbow[word],model_sg[word])

# 某些词如‘quot’等不具有实意的词在word2vec模型中没有对应的词向量，但词频极高，现在的处理会将其视为零向量

# print(x_train[1:2])
# sequences_train_100000[1:2]
# print(embedding_matrix_cbow[297])
# print(model_cbow['play'])

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding 
from keras.callbacks import TensorBoard
import tensorflow as tf

In [14]:
class TB(TensorBoard):
    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        self.counter = 0
    
    def on_batch_end(self, batch, logs=None):
        self.counter+=1
        if self.counter%self.log_every==0:
            for name, value in logs.items():
                if name in ['batch', 'size']:
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()
        
        super().on_batch_end(batch, logs)
        
# model.fit(params, ..., callbacks[TB(log_dir='...')])

In [17]:
model_cnn_embedding_matrix_cbow = Sequential()
model_cnn_embedding_matrix_cbow.add(Embedding(100000, 100, weights=[embedding_matrix_cbow], input_length=70, trainable=False))
model_cnn_embedding_matrix_cbow.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_embedding_matrix_cbow.add(GlobalMaxPooling1D())
model_cnn_embedding_matrix_cbow.add(Dense(256, activation='relu'))
model_cnn_embedding_matrix_cbow.add(Dense(128, activation='relu'))
model_cnn_embedding_matrix_cbow.add(Dense(1, activation='sigmoid'))
model_cnn_embedding_matrix_cbow.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_embedding_matrix_cbow.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_embedding_matrix_cbow/')])
model_cnn_embedding_matrix_cbow.save('./models/model_cnn_embedding_matrix_cbow.h5')

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
model_cnn_embedding_matrix_sg = Sequential()
model_cnn_embedding_matrix_sg.add(Embedding(100000, 100, weights=[embedding_matrix_sg], input_length=70, trainable=False))
model_cnn_embedding_matrix_sg.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_embedding_matrix_sg.add(GlobalMaxPooling1D())
model_cnn_embedding_matrix_sg.add(Dense(256, activation='relu'))
model_cnn_embedding_matrix_sg.add(Dense(128, activation='relu'))
model_cnn_embedding_matrix_sg.add(Dense(1, activation='sigmoid'))
model_cnn_embedding_matrix_sg.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_embedding_matrix_sg.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_embedding_matrix_sg/')])
model_cnn_embedding_matrix_sg.save('./models/model_cnn_embedding_matrix_sg.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


| <center><font size=3>Embedding Matrix | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|
| <center><font size=3>CBOW | <center><font size=3>0.3858 | <center><font size=3>0.8255 | <center><font size=3>0.3935 | <center><font size=3>0.8250 | <center><font size=3>2931 |
| <center><font size=3>SG | <center><font size=3>0.3769 | <center><font size=3>0.8302 | <center><font size=3>0.3918 | <center><font size=3>0.8259 | <center><font size=3>2626 |
| <center><font size=3>CBOW+SG | <center><font size=3>0.3787 | <center><font size=3>0.8297 | <center><font size=3>0.3891 | <center><font size=3>0.8266 | <center><font size=3>3427 |
 <center> **Filter Num = 100, Filter Size = 2, Strides = 1, Partition = 1, Epochs = 5

In [19]:
model_cnn_filter_num_100 = Sequential()
model_cnn_filter_num_100.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_filter_num_100.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_filter_num_100.add(GlobalMaxPooling1D())
model_cnn_filter_num_100.add(Dense(256, activation='relu'))
model_cnn_filter_num_100.add(Dense(128, activation='relu'))
model_cnn_filter_num_100.add(Dense(1, activation='sigmoid'))
model_cnn_filter_num_100.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_filter_num_100.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_filter_num_100/')])
model_cnn_filter_num_100.save('./models/model_cnn_filter_num_100.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
model_cnn_filter_num_50 = Sequential()
model_cnn_filter_num_50.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_filter_num_50.add(Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_filter_num_50.add(GlobalMaxPooling1D())
model_cnn_filter_num_50.add(Dense(256, activation='relu'))
model_cnn_filter_num_50.add(Dense(128, activation='relu'))
model_cnn_filter_num_50.add(Dense(1, activation='sigmoid'))
model_cnn_filter_num_50.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_filter_num_50.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_filter_num_50/')])
model_cnn_filter_num_50.save('./models/model_cnn_filter_num_50.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
model_cnn_filter_num_150 = Sequential()
model_cnn_filter_num_150.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_filter_num_150.add(Conv1D(filters=150, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_filter_num_150.add(GlobalMaxPooling1D())
model_cnn_filter_num_150.add(Dense(256, activation='relu'))
model_cnn_filter_num_150.add(Dense(128, activation='relu'))
model_cnn_filter_num_150.add(Dense(1, activation='sigmoid'))
model_cnn_filter_num_150.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_filter_num_150.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_filter_num_150/')])
model_cnn_filter_num_150.save('./models/model_cnn_filter_num_150.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


| <center><font size=3>Filter Num | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>50 | <center><font size=3>0.3905 | <center><font size=3>0.8230 | <center><font size=3>0.3949 | <center><font size=3>0.8201 | <center><font size=3>2350 |
| <center><font size=3>100 | <center><font size=3>0.3787 | <center><font size=3>0.8297 | <center><font size=3>0.3891 | <center><font size=3>0.8266 | <center><font size=3>3427 |
| <center><font size=3>150 | <center><font size=3>0.3725 | <center><font size=3>0.8329 | <center><font size=3>0.3828 | <center><font size=3>0.8261 | <center><font size=3>4680 |
 <center> **Filter Size = 2, Strides = 1, Partition = 1, Epochs = 5, Embedding Matrix = CBOW+SG

In [22]:
model_cnn_filter_size_3 = Sequential()
model_cnn_filter_size_3.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_filter_size_3.add(Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1))
model_cnn_filter_size_3.add(GlobalMaxPooling1D())
model_cnn_filter_size_3.add(Dense(256, activation='relu'))
model_cnn_filter_size_3.add(Dense(128, activation='relu'))
model_cnn_filter_size_3.add(Dense(1, activation='sigmoid'))
model_cnn_filter_size_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_filter_size_3.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_filter_size_3')])
model_cnn_filter_size_3.save('./models/model_cnn_filter_size_3.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
model_cnn_filter_size_5 = Sequential()
model_cnn_filter_size_5.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_filter_size_5.add(Conv1D(filters=100, kernel_size=5, padding='valid', activation='relu', strides=1))
model_cnn_filter_size_5.add(GlobalMaxPooling1D())
model_cnn_filter_size_5.add(Dense(256, activation='relu'))
model_cnn_filter_size_5.add(Dense(128, activation='relu'))
model_cnn_filter_size_5.add(Dense(1, activation='sigmoid'))
model_cnn_filter_size_5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_filter_size_5.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_filter_size_5')])
model_cnn_filter_size_5.save('./models/model_cnn_filter_size_5.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


| <center><font size=3>Filter Size | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>2 | <center><font size=3>0.3787 | <center><font size=3>0.8297 | <center><font size=3>0.3891 | <center><font size=3>0.8266 | <center><font size=3>3427 |
| <center><font size=3>3 | <center><font size=3>0.3730 | <center><font size=3>0.8329 | <center><font size=3>0.3827 | <center><font size=3>0.8269 | <center><font size=3>4105 |
| <center><font size=3>5 | <center><font size=3>0.3689 | <center><font size=3>0.8350 | <center><font size=3>0.3914 | <center><font size=3>0.8257 | <center><font size=3>5303 |
 <center> **Filter Num = 100, Strides = 1, Partition = 1, Epochs = 5, Embedding Matrix = CBOW+SG

In [19]:
model_cnn_stride_2 = Sequential()
model_cnn_stride_2.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_stride_2.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=2))
model_cnn_stride_2.add(GlobalMaxPooling1D())
model_cnn_stride_2.add(Dense(256, activation='relu'))
model_cnn_stride_2.add(Dense(128, activation='relu'))
model_cnn_stride_2.add(Dense(1, activation='sigmoid'))
model_cnn_stride_2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_stride_2.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_stride_2/')])
model_cnn_stride_2.save('./models/model_cnn_stride_2.h5')

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
model_cnn_stride_3 = Sequential()
model_cnn_stride_3.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_stride_3.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=3))
model_cnn_stride_3.add(GlobalMaxPooling1D())
model_cnn_stride_3.add(Dense(256, activation='relu'))
model_cnn_stride_3.add(Dense(128, activation='relu'))
model_cnn_stride_3.add(Dense(1, activation='sigmoid'))
model_cnn_stride_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_stride_3.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_stride_3/')])
model_cnn_stride_3.save('./models/model_cnn_stride_3.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


| <center><font size=3>Stride | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>1 | <center><font size=3>0.3787 | <center><font size=3>0.8297 | <center><font size=3>0.3891 | <center><font size=3>0.8266 | <center><font size=3>3427 |
| <center><font size=3>2 | <center><font size=3>0.3958 | <center><font size=3>0.8194 | <center><font size=3>0.4037 | <center><font size=3>0.8183 | <center><font size=3>2859 |
| <center><font size=3>3 | <center><font size=3>0.4506 | <center><font size=3>0.7862 | <center><font size=3>0.4565 | <center><font size=3>0.7819 | <center><font size=3>1823 |
 <center> **Filter Size = 2, Filter Num = 100, Partition = 1, Epochs = 5, Embedding Matrix = CBOW+SG

In [20]:
from keras.layers import MaxPooling1D, Flatten

In [21]:
model_cnn_2_partition_pooling = Sequential()
model_cnn_2_partition_pooling.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_2_partition_pooling.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_2_partition_pooling.add(MaxPooling1D(pool_size=35))
model_cnn_2_partition_pooling.add(Flatten())
model_cnn_2_partition_pooling.add(Dense(256, activation='relu'))
model_cnn_2_partition_pooling.add(Dense(128, activation='relu'))
model_cnn_2_partition_pooling.add(Dense(1, activation='sigmoid'))
model_cnn_2_partition_pooling.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_2_partition_pooling.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_2_partition_pooling')])
model_cnn_2_partition_pooling.save('./models/model_cnn_2_partition_pooling.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
model_cnn_3_partition_pooling = Sequential()
model_cnn_3_partition_pooling.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_3_partition_pooling.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_3_partition_pooling.add(MaxPooling1D(pool_size=24))
model_cnn_3_partition_pooling.add(Flatten())
model_cnn_3_partition_pooling.add(Dense(256, activation='relu'))
model_cnn_3_partition_pooling.add(Dense(128, activation='relu'))
model_cnn_3_partition_pooling.add(Dense(1, activation='sigmoid'))
model_cnn_3_partition_pooling.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_3_partition_pooling.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_3_partition_pooling')])
model_cnn_3_partition_pooling.save('./models/model_cnn_3_partition_pooling.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
model_cnn_4_partition_pooling = Sequential()
model_cnn_4_partition_pooling.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_4_partition_pooling.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_4_partition_pooling.add(MaxPooling1D(pool_size=18))
model_cnn_4_partition_pooling.add(Flatten())
model_cnn_4_partition_pooling.add(Dense(256, activation='relu'))
model_cnn_4_partition_pooling.add(Dense(128, activation='relu'))
model_cnn_4_partition_pooling.add(Dense(1, activation='sigmoid'))
model_cnn_4_partition_pooling.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_4_partition_pooling.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_4_partition_pooling')])
model_cnn_4_partition_pooling.save('./models/model_cnn_4_partition_pooling.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
model_cnn_5_partition_pooling = Sequential()
model_cnn_5_partition_pooling.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_5_partition_pooling.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_5_partition_pooling.add(MaxPooling1D(pool_size=14))
model_cnn_5_partition_pooling.add(Flatten())
model_cnn_5_partition_pooling.add(Dense(256, activation='relu'))
model_cnn_5_partition_pooling.add(Dense(128, activation='relu'))
model_cnn_5_partition_pooling.add(Dense(1, activation='sigmoid'))
model_cnn_5_partition_pooling.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_5_partition_pooling.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_5_partition_pooling')])
model_cnn_5_partition_pooling.save('./models/model_cnn_5_partition_pooling.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


| <center><font size=3>Partition | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>1 | <center><font size=3>0.3787 | <center><font size=3>0.8297 | <center><font size=3>0.3891 | <center><font size=3>0.8266 | <center><font size=3>3427 |
| <center><font size=3>2 | <center><font size=3>0.3758 | <center><font size=3>0.8296 | <center><font size=3>0.3840 | <center><font size=3>0.8271 | <center><font size=3>2936 |
| <center><font size=3>3 | <center><font size=3>0.3778 | <center><font size=3>0.8300 | <center><font size=3>0.3818 | <center><font size=3>0.8279 | <center><font size=3>2944 |
| <center><font size=3>4 | <center><font size=3>0.3782 | <center><font size=3>0.8299 | <center><font size=3>0.3825 | <center><font size=3>0.8291 | <center><font size=3>2516 |
| <center><font size=3>5 | <center><font size=3>0.3762 | <center><font size=3>0.8312 | <center><font size=3>0.3880 | <center><font size=3>0.8252 | <center><font size=3>2533 |
 <center> **Filter Size = 2, Filter Num = 100, Stride = 1, Epochs = 5, Embedding Matrix = CBOW+SG

In [None]:
from keras import backend as K 
from keras.layers import Lambda

In [None]:
def k_max(x):
    x.top_k
    
def k_max_output_shape(input_shape, k):
    shape = list(input_shape)
    assert len(shape) == 3
    shape[-1] = k
    return tuple(shape)

In [None]:
model_cnn_2_max_pooling = Sequential()
model_cnn_2_max_pooling.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_2_max_pooling.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_2_max_pooling.add(Lambda(k_max, output_shape=k_max_output_shape(2)))
model_cnn_2_max_pooling.add(Flatten())
model_cnn_2_max_pooling.add(Dense(256, activation='relu'))
model_cnn_2_max_pooling.add(Dense(128, activation='relu'))
model_cnn_2_max_pooling.add(Dense(1, activation='sigmoid'))
model_cnn_2_max_pooling.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_2_max_pooling.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_2_max_pooling')])
model_cnn_2_max_pooling.save('./models/model_cnn_2_max_pooling.h5')

In [None]:
model_cnn_trainable_true = Sequential()
model_cnn_trainable_true.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=True))
model_cnn_trainable_true.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn_trainable_true.add(GlobalMaxPooling1D())
model_cnn_trainable_true.add(Dense(256, activation='relu'))
model_cnn_trainable_true.add(Dense(128, activation='relu'))
model_cnn_trainable_true.add(Dense(1, activation='sigmoid'))
model_cnn_trainable_true.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_trainable_true.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_trainable_true/')])
model_cnn_trainable_true.save('./models/model_cnn_trainable_true.h5')

| <center><font size=3>Trainable | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>False | <center><font size=3>0.3787 | <center><font size=3>0.8297 | <center><font size=3>0.3891 | <center><font size=3>0.8266 | <center><font size=3>3427 |
| <center><font size=3>True | <center><font size=3>0. | <center><font size=3>0. | <center><font size=3>0. | <center><font size=3>0. | <center><font size=3> |
 <center> **Filter Size = 2, Filter Num = 100, Strides = 1, Partition = 1, Epochs = 5, Embedding Matrix = CBOW+SG

In [50]:
from keras.layers import Conv2D, MaxPooling2D, Reshape

In [None]:
model_cnn_conv2d = Sequential()
model_cnn_conv2d.add(Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False))
model_cnn_conv2d.add(Reshape((-1, 200, 1)))
model_cnn_conv2d.add(Conv2D(filters=70, kernel_size=(2, 2), padding='valid', activation='relu', strides=(1, 1)))
model_cnn_conv2d.add(MaxPooling2D(pool_size=(2, 2), padding='valid'))
model_cnn_conv2d.add(Conv2D(filters=50, kernel_size=(2, 2), padding='valid', activation='relu', strides=(1, 1)))
model_cnn_conv2d.add(MaxPooling2D(pool_size=(2, 2), padding='valid'))
model_cnn_conv2d.add(Conv2D(filters=30, kernel_size=(2, 2), padding='valid', activation='relu', strides=(1, 1)))
model_cnn_conv2d.add(MaxPooling2D(pool_size=(2, 2), padding='valid'))
model_cnn_conv2d.add(Flatten())
model_cnn_conv2d.add(Dense(256, activation='relu'))
model_cnn_conv2d.add(Dense(128, activation='relu'))
model_cnn_conv2d.add(Dense(1, activation='sigmoid'))
model_cnn_conv2d.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_cnn_conv2d.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=5, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_conv2d/')])
model_cnn_conv2d.save('./models/model_cnn_conv2d.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 70, 200)           20000000  
_________________________________________________________________
reshape_14 (Reshape)         (None, 70, 200, 1)        0         
_________________________________________________________________
conv2d_32 (Conv2D)           (None, 69, 199, 70)       350       
_________________________________________________________________
max_pooling2d_28 (MaxPooling (None, 34, 99, 70)        0         
_________________________________________________________________
conv2d_33 (Conv2D)           (None, 33, 98, 50)        14050     
_________________________________________________________________
max_pooling2d_29 (MaxPooling (None, 16, 49, 50)        0         
_________________________________________________________________
conv2d_34 (Conv2D)           (None, 15, 48, 30)        6030      
__________

| <center><font size=3>Network | <center><font size=3>Loss | <center><font size=3>Acc | <center><font size=3>Val Loss | <center><font size=3>Val Acc | <center><font size=3>Time(s) |
|--|--|--|--|--|--|
| <center><font size=3>Conv1D | <center><font size=3>0.3787 | <center><font size=3>0.8297 | <center><font size=3>0.3891 | <center><font size=3>0.8266 | <center><font size=3>3427 |
| <center><font size=3>Conv2D | <center><font size=3>0. | <center><font size=3>0. | <center><font size=3>0. | <center><font size=3>0. | <center><font size=3> |
 <center> **Epochs = 5, Embedding Matrix = CBOW+SG

In [15]:
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model

In [None]:
input_type = Input(shape=(70,), dtype='float64')
tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix_cbow_sg], input_length=70, trainable=False)(input_type)
model_2 = Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
model_2 = GlobalMaxPooling1D()(model_2)
model_3 = Conv1D(filters=50, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
model_3 = GlobalMaxPooling1D()(model_3)
model_5 = Conv1D(filters=50, kernel_size=5, padding='valid', activation='relu', strides=1)(tweet_encoder)
model_5 = GlobalMaxPooling1D()(model_5)
merged = concatenate([model_2, model_3, model_5], axis=1)
merged = Dense(256, activation='relu')(merged)
merged = Dense(128, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(merged)
model_cnn_mixed = Model(inputs=[input_type], outputs=[output])
model_cnn_mixed.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_mixed.fit(x_train_pad, y_train, validation_data=(x_validation_pad, y_validation), epochs=10, batch_size=32, verbose=1, callbacks=[TB(log_dir='./tmp/log/model_cnn_mixed/')])
model_cnn_mixed.save('./models/model_cnn_mixed.h5')

Train on 1565182 samples, validate on 15971 samples
Epoch 1/10
 345056/1565182 [=====>........................] - ETA: 23:14 - loss: 0.4370 - acc: 0.7967

In [None]:
model_mix.evaluate(x=x_test_pad, y=y_test)

In [None]:
from sklearn.metrics import roc_curve, auc
FPR, TPR, threshold = roc_curve(y_test, model_mix.predict(x_test_pad))
roc_auc = auc(FPR, TPR)
plt.plot(FPR, TPR)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.show()