# SMM4H Task 2: Adverse Events

In [1]:
import pandas as pd
import numpy as np

### Load Training & Validation Data

In [2]:
# Task 2 training split: a tab-separated file read into a DataFrame.
training_data = pd.read_csv(
    "task2_en_training.tsv",
    sep="\t",
)
print(training_data.shape)  # (n_rows, n_columns)
training_data.head()  # preview of the first rows

(20544, 4)


Unnamed: 0,tweet_id,user_id,class,tweet
0,344266386467606528,809439366,0,"depression hurts, cymbalta can help"
1,349220537903489025,323112996,0,"@jessicama20045 right, but cipro can make thin..."
2,351421773079781378,713100330,0,@fibby1123 are you on paxil .. i need help
3,326594278472171520,543113070,0,@redicine the lamotrigine and sjs just made ch...
4,345567138376994816,138795534,0,have decided to skip my #humira shot today. my...


In [3]:
# Task 2 validation split: a tab-separated file read into a DataFrame.
validation_data = pd.read_csv(
    "task2_en_validation.tsv",
    sep="\t",
)
print(validation_data.shape)  # (n_rows, n_columns)
validation_data.head()  # preview of the first rows

(5134, 4)


Unnamed: 0,tweet_id,user_id,class,tweet
0,343909778008973312,464336224,0,i don't fucking need humira
1,352823276889837570,590337731,0,"my retake is next friday, if i bloody fail aga..."
2,339867818843594756,246979971,0,"@doctorchristian scared to start fluoxetine, w..."
3,349294537367236611,149749939,0,"@intuitivegal1 ok, if you stopped taking the l..."
4,354256195432882177,54516759,0,novartis announces secukinumab (ain457) demons...


### Word Embeddings

Word Embeddings Option 1: Train Word2Vec Embeddings on the Task 1 and Task 2 Tweets

In [4]:
# Tweet text from the Task 1 training and validation files
task1_train = pd.read_csv("task1_training.tsv", sep='\t')["tweet"]
task1_val = pd.read_csv("task1_validation.tsv", sep='\t')["tweet"]

# Tweet text from the Task 2 training data loaded earlier
task2_train = pd.Series(training_data["tweet"])

# Single corpus for embedding training: Task 1 (train + validation) followed by
# Task 2 (train), re-indexed from 0
tweets_combined = pd.concat(
    [task1_train, task1_val, task2_train],
    ignore_index=True,
)

# Split every tweet into a list of tokens
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()

tweets = [tweet_tokenizer.tokenize(tweet) for tweet in tweets_combined]

In [5]:
#source: https://radimrehurek.com/gensim/models/word2vec.html

from gensim.models import Word2Vec

# train model on the tokenized tweets; min_count=1 keeps every token,
# including those that occur only once
model = Word2Vec(tweets, min_count=1) # consider adjusting min_count
# summarize the trained model
print(model)

# summarize vocabulary
# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`
vocab = model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab
words = list(vocab)
# show a sample only: printing the complete vocabulary floods the notebook output
print(len(words), "words in vocabulary; first 20:", words[:20])

# save model in ASCII (word2vec) format
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

Word2Vec(vocab=93140, size=100, alpha=0.025)


In [188]:
#source: https://realpython.com/python-keras-text-classification/

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix whose rows line up with a tokenizer's word index.

    The file is read as UTF-8 text in which every line holds a word followed by
    its whitespace-separated vector components. Vectors of words that appear in
    ``word_index`` are copied into the row given by their index.

    Parameters
    ----------
    filepath : str
        Path to the text embedding file. A leading word2vec-style header line
        (two integers: vocabulary size and vector size) and blank lines are
        skipped.
    word_index : dict
        Maps each word to its integer row index.
    embedding_dim : int
        Number of columns of the returned matrix. Longer vectors in the file
        are truncated to this length.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of words
        that are not found in the file stay all-zero.

    Raises
    ------
    ValueError
        If a word from ``word_index`` has fewer than ``embedding_dim``
        components in the file, or a component is not numeric.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit encoding: tweet vocabularies contain non-ASCII tokens and the
    # platform default encoding is not guaranteed to decode them.
    with open(filepath, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            fields = line.split()
            if not fields:
                continue  # blank line (e.g. trailing newline at end of file)
            if (line_number == 0 and len(fields) == 2
                    and all(field.isdigit() for field in fields)):
                # word2vec text header "<vocab_size> <vector_size>", not a word vector
                continue

            word, vector = fields[0], fields[1:]
            if word in word_index:
                values = np.asarray(vector[:embedding_dim], dtype=np.float32)
                if values.shape[0] != embedding_dim:
                    # Fail loudly instead of letting numpy broadcast a short vector
                    raise ValueError(
                        "embedding for {!r} has {} components, expected at least {}".format(
                            word, values.shape[0], embedding_dim))
                embedding_matrix[word_index[word]] = values

    return embedding_matrix

### Feature Extraction

In [4]:
# data preprocessing from Tasha

from nltk.tokenize import RegexpTokenizer
import re
import string
import emoji

# http(s) links: scheme, one or more "//" or "\" separators, then URL characters.
# Raw string so the regex escapes are not treated as (invalid) string escapes.
_LINK_REGEX = re.compile(
    r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)

def strip_links(text):
    """Replace every http(s) link in ``text`` with ``', '``.

    Each match is substituted where it occurs, so a link that is a prefix of
    another link (``http://a`` and ``http://ab``) no longer leaves part of the
    longer link behind.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with each link replaced by a comma followed by a space.
    """
    return _LINK_REGEX.sub(', ', text)

def normalize_mentions(text):
    """Blank out punctuation and collapse every @-mention to ``@username``.

    Every character of ``string.punctuation`` except ``@`` is turned into a
    space, the text is split on whitespace, and each token that starts with
    ``@`` is replaced by the literal ``@username``. The tokens are joined back
    together with single spaces.
    """
    mention_marker = '@'
    punctuation_to_space = str.maketrans({
        symbol: ' ' for symbol in string.punctuation if symbol != mention_marker
    })
    tokens = text.translate(punctuation_to_space).split()
    return ' '.join(
        '@username' if token.startswith(mention_marker) else token
        for token in tokens
    )

# Regex tokenizer that extracts runs of word characters (\w+).
# NOTE(review): the name `tokenizer` is rebound to a Keras Tokenizer further
# down in the notebook, so this object is only reachable under this name until then.
tokenizer = RegexpTokenizer(r'\w+')

In [5]:
# apply preprocessing functions and tokenize tweets

from keras.preprocessing.text import Tokenizer

# Dedicated word-level tokenizer for the cleaning step. The name `tokenizer` is
# bound to the Keras Tokenizer below, so relying on it here would make this
# cell fail when it is run a second time.
word_tokenizer = RegexpTokenizer(r'\w+')

def preprocess_tweets(raw_tweets):
    """Clean raw tweet strings and return them as a pandas Series.

    Each tweet goes through, in order: link removal (``strip_links``),
    punctuation/mention normalisation (``normalize_mentions``), conversion of
    emoji to their text names without delimiters (``emoji.demojize``), and
    re-joining of the ``\\w+`` tokens with single spaces.
    """
    def clean(tweet):
        tweet = strip_links(tweet)
        tweet = normalize_mentions(tweet)
        tweet = emoji.demojize(tweet, delimiters=('',''))
        return ' '.join(word_tokenizer.tokenize(tweet))

    return pd.Series(raw_tweets).apply(clean)

tweets_train = preprocess_tweets(training_data["tweet"].values)
y_train = training_data["class"].values

tweets_test = preprocess_tweets(validation_data["tweet"].values)
y_test = validation_data["class"].values

# The vocabulary is fitted on the training tweets only
tokenizer = Tokenizer(num_words=20000) # consider num_words argument to limit vocab size
tokenizer.fit_on_texts(tweets_train)

X_train = tokenizer.texts_to_sequences(tweets_train)
X_test = tokenizer.texts_to_sequences(tweets_test)

# NOTE(review): this counts every entry of word_index, which appears not to be
# capped by num_words (the model summary implies more than 20000 embedding rows) - confirm.
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

Using TensorFlow backend.


In [6]:
# Bring every token-id sequence to the same length by appending zeros
# source: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
from keras.preprocessing.sequence import pad_sequences

maxlen = 25

# Same padding settings for both splits
X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

Word Embeddings Option 2: Pre-Trained Word2Vec Twitter Embeddings

In [16]:
# source: https://github.com/FredericGodin/TwitterEmbeddings

import gensim

model = gensim.models.KeyedVectors.load_word2vec_format('word2vec_twitter_tokens.bin', 
                                                        binary=True, unicode_errors='ignore')


# source: https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-11-cnn-word2vec-41f5e28eda74

# Only words from the tokenizer's vocabulary are looked up. Building a doubled
# vector for every word of the pretrained model costs memory and time for
# entries that are never read. `word in model` / `model[word]` are used instead
# of the deprecated `model.wv` / `.vocab` accessors on KeyedVectors.
num_words = vocab_size
# 800 columns: each pretrained vector is appended to itself below
# (presumably 400-dimensional vectors - TODO confirm against the model file).
embedding_matrix = np.zeros((num_words, 800))
embeddings_index = {}
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    if word in model:
        vector = model[word]
        embeddings_index[word] = np.append(vector, vector)
        # words missing from the pretrained model keep an all-zero row
        embedding_matrix[i] = embeddings_index[word]

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


### Model Building

In [17]:
# source: https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric

# define custom loss function

import keras.backend as K
import tensorflow as tf

def f1_loss(y_true, y_pred):
    """Loss defined as ``1 - mean(F1)``, computed from products of the inputs.

    True positives, false positives and false negatives are obtained by
    multiplying ``y_true`` / ``1 - y_true`` with ``y_pred`` / ``1 - y_pred``
    and summing over axis 0, so no thresholding of the predictions is involved.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels (assumed to be 0/1 - TODO confirm).
    y_pred : tensor
        Model outputs (assumed to lie in [0, 1], e.g. sigmoid - TODO confirm).

    Returns
    -------
    tensor
        ``1 - K.mean(f1)``.
    """
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)
    # true negatives are not part of precision/recall/F1, so they are not computed

    # epsilon keeps the denominators away from zero
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    # any remaining NaN is treated as an F1 of zero
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [27]:
from keras.models import Sequential
from keras import layers

# build the model

# To use the manually created Word2Vec embeddings instead of the matrix built
# earlier, un-comment the next line (100 = vector size of that model):
#embedding_matrix = create_embedding_matrix('embedding_word2vec.txt', tokenizer.word_index, 100)

# Take the embedding width from the matrix itself so the Embedding layer can
# never disagree with the weights it is initialised from.
embedding_dim = embedding_matrix.shape[1]

model = Sequential()
# Embedding layer initialised from the embedding matrix and fine-tuned during training
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=True))

# 128 convolution filters over windows of 5 tokens, max-pooled over the sequence
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())

#model.add(layers.Bidirectional(layers.LSTM(64))) -- experimented with adding LSTM layer here

model.add(layers.Dropout(0.3))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dropout(0.2))
# single sigmoid unit for the binary output
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss=f1_loss,
              metrics=['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 25, 800)           21651200  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 21, 128)           512128    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                1290      
_________________________________________________________________
dropout_6 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                

### Train Model

In [28]:
import keras


# source: https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
# Callbacks are taken from the public `keras.callbacks` namespace; the nested
# `keras.callbacks.callbacks` module is an internal path that is not available
# in every Keras release.
# Stop once val_loss has not improved for 10 epochs, and keep the weights with
# the lowest val_loss on disk.
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)

# NOTE(review): X_test / y_test come from the validation file and drive both
# early stopping and checkpoint selection here, so metrics reported on them
# afterwards are optimistic.
history = model.fit(X_train, y_train,
                    epochs=1000,
                    validation_data=(X_test, y_test),
                    batch_size=128, callbacks = [es,mc])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20544 samples, validate on 5134 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 00016: early stopping


### Make Predictions on Validation Data

In [29]:
# load the saved model (the custom loss must be supplied to deserialize it)
from keras.models import load_model
saved_model = load_model('best_model.h5', custom_objects={'f1_loss':f1_loss})
# evaluate the model
loss, accuracy = saved_model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
print("")
loss, accuracy = saved_model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

# calculate the F1-Score
from sklearn import metrics
# Threshold the sigmoid output at 0.5. This is what `predict_classes` did for a
# single-output model; that method is no longer available in current Keras.
preds = (saved_model.predict(X_test) > 0.5).astype("int32").ravel()
print("F1 Score:", metrics.f1_score(y_test, preds))
print("Precision:", metrics.precision_score(y_test, preds))
print("Recall:", metrics.recall_score(y_test, preds))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Training Accuracy: 0.9657

Testing Accuracy:  0.9108
F1 Score: 0.5278350515463918
Precision: 0.5161290322580645
Recall: 0.540084388185654


In [216]:
# Metrics recorded by hand from 5 separate runs, one entry per run
test_accuracy = [0.9114, 0.9153, 0.9032, 0.9143, 0.9129]
f1_scores = [0.5215562565720294, 0.5307443365695793, 0.5216554379210779, 0.5319148936170213, 0.5348595213319459]
precisions = [0.519916142557652, 0.543046357615894, 0.479646017699115, 0.5364806866952789, 0.5277207392197125]
recalls = [0.5232067510548524, 0.5189873417721519, 0.5717299578059072, 0.5274261603375527, 0.5421940928270043]

# Arithmetic mean of every metric, one line per metric
print(
    *(
        f"{label} {sum(scores) / len(scores)}"
        for label, scores in (
            ("Average Test Accuracy: ", test_accuracy),
            ("Average F1-Score: ", f1_scores),
            ("Average Precision: ", precisions),
            ("Average Recall: ", recalls),
        )
    ),
    sep="\n",
)

Average Test Accuracy:  0.91142
Average F1-Score:  0.5281460892023307
Average Precision:  0.5213619887575305
Average Recall:  0.5367088607594936
