# CNN with enhanced meta features

## Load data

In [1]:
# List the contents of the data/ directory via an IPython shell escape.
!ls data

auxiliary_features.bc	       journalist_accounts.txt	tweets_1.json
auxiliary_features_v2.bc       log_labels_v3.bc		tweets_2.json
celebrity_accounts.txt	       meta_features_v3.bc	tweets_3.json
classification_labels.bc       raw_labels_v3.bc		tweets_4.json
classification_labels_v2.bc    regression_labels.bc	tweets_5.json
embedding_matrix_100dim.bc     regression_labels_v2.bc	tweet_texts.txt
embedding_matrix_100dim_v2.bc  sentiment_data		tweet_texts_v2.txt
embedding_matrix_100dim_v3.bc  sequences_len32.bc	tweet_texts_v3.txt
embedding_matrix_200dim.bc     sequences_len32_v2.bc	user_ids.txt
embedding_matrix_200dim_v2.bc  sequences_len32_v3.bc	word_index.json
embedding_matrix_25dim.bc      sequences_len48.bc	word_index_v3.json
embedding_matrix_25dim_v2.bc   sequences_len48_v2.bc	word_labels.tsv
embedding_matrix_50dim.bc      sequences_len48_v3.bc	word_labels_v3.tsv
embedding_matrix_50dim_v2.bc   tech_accounts.txt


In [2]:
from tep.utils import load_array

# Load the regression targets, token sequences, auxiliary (meta) features and
# the embedding matrix from their on-disk arrays.
y = load_array("data/log_labels_v3.bc")
X_seq = load_array("data/sequences_len32_v3.bc")
X_meta = load_array("data/meta_features_v3.bc")
emb_mat = load_array("data/embedding_matrix_100dim_v3.bc")

# Report each array's shape, in the same order the arrays were loaded.
for loaded in (y, X_seq, X_meta, emb_mat):
    print(loaded.shape)

(1293005,)
(1293005, 32)
(1293005, 24)
(721696, 100)


In [3]:
import numpy as np
# Replace NaN/inf entries of X_meta with finite numbers, in place (copy=False).
# The call is the cell's last expression, so the returned array is displayed.
np.nan_to_num(X_meta, copy=False)

array([[0., 1., 1., ..., 0., 0., 1.],
       [0., 0., 2., ..., 0., 0., 1.],
       [0., 0., 2., ..., 0., 0., 1.],
       ...,
       [1., 3., 0., ..., 0., 0., 0.],
       [1., 3., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 5., 0., 0.]])

## Build model

In [4]:
from tep.deepConvModel import regression_model
from tep.trainUtils import get_callbacks, print_regression_metrics, r2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Input dimensions are read off the loaded arrays rather than hard-coded.
# NOTE(review): presumably the positional arguments are (number of aux features,
# embedding matrix, sequence length) - confirm against tep.deepConvModel.
n_meta_features = X_meta.shape[1]
max_seq_len = X_seq.shape[1]

model = regression_model(
    n_meta_features,
    emb_mat,
    max_seq_len,
    conv_layers=2,
    filters=64,
    dropout=0.5,
    fc_layers=2,
    fc_units=128,
    metrics=[r2],
)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [6]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_input (InputLayer)         (None, 32)           0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 32, 100)      72169600    text_input[0][0]                 
__________________________________________________________________________________________________
pad_1 (ZeroPadding1D)           (None, 34, 100)      0           word_embedding[0][0]             
__________________________________________________________________________________________________
conv_1 (Conv1D)                 (None, 32, 64)       19264       pad_1[0][0]                      
__________________________________________________________________________________________________
pool_1 (Ma

In [7]:
# Where this experiment's artifacts live: a directory per model family and a
# run name that is reused for every file belonging to this run.
model_path = 'models/enhanced_meta'
model_name = 'baseline'

# Per-run logging directory: <model_path>/<model_name>
logging_path = '{}{}{}'.format(model_path, '/', model_name)

In [8]:
import os

# Create the logging directory (and any missing parents) from Python instead of
# shelling out: the unquoted `$logging_path` interpolation would be word-split by
# the shell if the path ever contained spaces, and `!mkdir` needs a POSIX shell.
# exist_ok=True keeps the cell re-runnable, matching `mkdir -p`.
os.makedirs(logging_path, exist_ok=True)

In [9]:
import shutil

# Copy the word-label TSV into the logging directory from Python rather than via
# `!cp`: a failed shell command only prints an error and lets the notebook carry
# on, whereas shutil.copy raises, and no shell interpolation of the path is needed.
# The trailing semicolon suppresses the returned destination path, so the cell
# stays output-free like the original.
shutil.copy('data/word_labels_v3.tsv', logging_path);

In [10]:
from tep.modelUtils import save_architecture

In [11]:
# Store the architecture as <model_path>/<model_name>.json.
architecture_file = '/'.join([model_path, model_name]) + '.json'
save_architecture(model, architecture_file)

## Prepare model training

In [12]:
from keras.callbacks import ModelCheckpoint, History, EarlyStopping, ReduceLROnPlateau

In [13]:
# Checkpoint file for this run: <model_path>/<model_name>.hdf5
cp_file = '/'.join((model_path, model_name + '.hdf5'))

In [14]:
# Training callbacks. Each monitored quantity is left at the library default.

# Write weights only, and only when the monitored quantity improves.
cp = ModelCheckpoint(
    cp_file,
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

# Collects per-epoch metrics; read back after training.
hist = History()

# Stop after 5 epochs without improvement.
es = EarlyStopping(verbose=1, patience=5)

# Lower the learning rate after 3 epochs without improvement.
rlr = ReduceLROnPlateau(verbose=1, patience=3)

# Order matters: a later cell reads the History callback back by position.
cbs = [cp, hist, es, rlr]

In [15]:
# Hold out the last `valid_size` rows for validation; everything before is training data.
# NOTE(review): this is a positional split - it assumes the rows are already in
# random order. TODO confirm upstream shuffling.
valid_size = 10000
train_size = X_seq.shape[0] - valid_size
batch_size = 1024

# All three arrays are sliced with the same indices, so they must be row-aligned,
# and the hold-out must leave at least one training row.
assert X_seq.shape[0] == X_meta.shape[0] == y.shape[0], "inputs and labels are not row-aligned"
assert 0 <= valid_size < X_seq.shape[0], "valid_size must be in [0, number of samples)"

print(valid_size)
print(train_size)
print(batch_size)

X_train = {'text_input': X_seq[:train_size], 'aux_input': X_meta[:train_size]}
y_train = {'output': y[:train_size]}

# Slice the hold-out from `train_size` onward rather than with `[-valid_size:]`:
# `arr[-0:]` is the *whole* array, so valid_size == 0 would silently validate on
# the training data. Slicing from train_size makes train and valid complementary
# by construction.
valid = (
    {'text_input': X_seq[train_size:], 'aux_input': X_meta[train_size:]},
    {'output': y[train_size:]},
)

10000
1283005
1024


## Train model

In [16]:
# Compile with Adam on a mean-squared-error objective, tracking r2 alongside the loss.
model.compile(loss='mean_squared_error', optimizer='Adam', metrics=[r2])

# Per-epoch progress output is switched off (verbose=0); the callbacks in `cbs`
# were created with verbose=1 and do the reporting. The call is left as the
# cell's last expression so that its return value is displayed.
model.fit(X_train,
          y_train,
          batch_size=batch_size,
          epochs=100,
          verbose=0,
          callbacks=cbs,
          validation_data=valid,
          shuffle=True)


Epoch 00001: val_loss improved from inf to 0.89532, saving model to models/enhanced_meta/baseline.hdf5

Epoch 00002: val_loss improved from 0.89532 to 0.80877, saving model to models/enhanced_meta/baseline.hdf5

Epoch 00003: val_loss improved from 0.80877 to 0.80638, saving model to models/enhanced_meta/baseline.hdf5

Epoch 00004: val_loss improved from 0.80638 to 0.78542, saving model to models/enhanced_meta/baseline.hdf5

Epoch 00005: val_loss did not improve

Epoch 00006: val_loss did not improve

Epoch 00007: val_loss did not improve

Epoch 00008: val_loss did not improve

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 00009: val_loss did not improve
Epoch 00009: early stopping


<keras.callbacks.History at 0x7f179045da58>

In [17]:
# Refer to the History callback by name instead of by its position in `cbs`
# (previously `cbs[1]`), so reordering or extending the callback list cannot
# silently hand a different callback to the metrics printer.
history = hist
print_regression_metrics(history)

loss: 0.6713629767480636, r2: 0.8528450812839237, val_loss: 0.7854210567474366, val_r2: 0.8265550698280334


In [18]:
# List the artifacts written for this model family (IPython shell escape).
!ls models/enhanced_meta/

baseline  baseline.hdf5  baseline.json


## Analyze neuron activations

In [19]:
# Restore the weights saved by the checkpoint callback (created with
# save_best_only=True). Reuse `cp_file` - the exact path given to ModelCheckpoint -
# instead of repeating the literal, so the two cannot drift apart when
# model_path or model_name change.
model.load_weights(cp_file)

In [20]:
import json

# Read the word index JSON. The `with` block guarantees the file handle is
# closed (the previous version opened the file and never closed it), and
# json.load parses straight from the handle without an intermediate string.
with open('data/word_index_v3.json') as word_index_file:
    word_index = json.load(word_index_file)

# Sanity check: displayed as the cell output.
type(word_index)

dict

In [23]:
from tep.featureVisualization import ConvLayerVisualizer
# Build the visualizer from the model (weights reloaded above) and the word_index dict.
clv = ConvLayerVisualizer(model, word_index)

In [24]:
# Display the model's input tensors (names, shapes, dtypes).
model.input

[<tf.Tensor 'text_input:0' shape=(?, 32) dtype=int32>,
 <tf.Tensor 'aux_input:0' shape=(?, 24) dtype=float32>]