In [1]:
import pandas as pd
import logging
import sys
import re
import spacy
# Show up to 200 characters per column so long tweets are not cut off in displays.
pd.options.display.max_colwidth = 200

# Every record is rendered as "<logger name> - <level> - <message>".
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')

# One handler that writes the formatted records to STDERR.
handler = logging.StreamHandler(stream=sys.stderr)
handler.setFormatter(formatter)

# Root logger: let everything from DEBUG upwards through and make the STDERR
# handler the only attached handler.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.handlers = [handler]

In [2]:
# Read both splits from the local data directory.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

# Log the (rows, columns) shape of each split, then preview the training rows.
train_shape_message = 'Train set shape:{}'.format(train.shape)
test_shape_message = 'Test set shape:{}'.format(test.shape)
logger.info(train_shape_message)
logger.info(test_shape_message)
train.head()

root - INFO - Train set shape:(7613, 5)
root - INFO - Test set shape:(3263, 4)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [3]:
# Share of each label in the training set (normalize=True yields fractions, not counts).
train['target'].value_counts(normalize = True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [4]:
# Look at the raw text of the first tweet.
train['text'].iloc[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

#### Function to find hashtags

In [5]:
def find_hashtags(tweet):
    """Collect the hashtags of a tweet as one comma-separated string.

    Args:
        tweet: Text to scan.

    Returns:
        The hashtag words (leading ``#`` removed) joined by ``", "`` in order
        of appearance, or ``None`` when the tweet contains no hashtag.
    """
    tags = [tag[1:] for tag in re.findall(r"#\w+", tweet)]
    if not tags:
        return None
    return ", ".join(tags)

# Try the helper on the tweet at position 5 of the training set.
find_hashtags(train['text'].iloc[5])

'RockyFire, CAfire, wildfires'

In [6]:
# Extract the hashtags of every tweet; tweets without any get the placeholder "no".
# The filled Series is assigned back instead of calling fillna(inplace=True) on
# `train['hashtag']`: an in-place fill on a column selection may act on a
# temporary object and leave `train` itself unfilled (pandas copy-on-write).
train['hashtag'] = train['text'].apply(find_hashtags).fillna("no")
train.head()

Unnamed: 0,id,keyword,location,text,target,hashtag
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,earthquake
1,4,,,Forest fire near La Ronge Sask. Canada,1,no
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,no
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,wildfires
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,"Alaska, wildfires"


#### Remove URLs

In [7]:
# Strip URLs from both splits: every "http" plus the non-whitespace run that
# follows it is removed, and the result is stored in a new `clean_tweet` column.
for frame in (train, test):
    frame['clean_tweet'] = frame['text'].apply(lambda tweet: re.sub(r'http\S+', '', tweet))
train.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,clean_tweet
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,earthquake,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,no,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,no,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,wildfires,"13,000 people receive #wildfires evacuation orders in California"
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,"Alaska, wildfires",Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school


In [8]:
# Punctuation characters stripped from the tweets. Apostrophe, comma and full
# stop are not part of this list, so they stay in the cleaned text.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
# Build the lookup set once instead of once per character of every tweet.
punctuation_chars = set(punctuation)

for frame in (train, test):
    frame['clean_tweet'] = (
        frame['clean_tweet']
        # remove punctuation marks
        .apply(lambda tweet: ''.join(ch for ch in tweet if ch not in punctuation_chars))
        # convert text to lowercase
        .str.lower()
        # replace every digit with a space; regex=True is spelled out because
        # pandas >= 2.0 treats the pattern as a literal string by default,
        # which would leave the digits in place
        .str.replace("[0-9]", " ", regex=True)
        # collapse runs of whitespace into single spaces and trim both ends
        .apply(lambda tweet: ' '.join(tweet.split()))
    )
train.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,clean_tweet
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,earthquake,our deeds are the reason of this earthquake may allah forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,no,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,no,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,wildfires,", people receive wildfires evacuation orders in california"
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,"Alaska, wildfires",just got sent this photo from ruby alaska as smoke from wildfires pours into a school


In [9]:
# import spaCy's language model
# !python -m spacy download en

# Load the English pipeline with the 'parser' and 'ner' components disabled.
# NOTE(review): the 'en' shortcut name presumably only resolves on spaCy 2.x
# after the download command above has been run — confirm against the
# installed spaCy version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    """Lemmatize every text with the module-level spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of strings (the notebook passes a pandas Series of
            cleaned tweets).

    Returns:
        list of str: one entry per input text, in input order, holding the
        lemmas of its tokens joined by single spaces.
    """
    # nlp.pipe processes the texts in batches, which is faster than calling
    # nlp() once per text, and yields the documents in input order.
    return [' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]

In [10]:
# Replace the cleaned tweets of both splits with their lemmatized form.
for frame in (train, test):
    frame['clean_tweet'] = lemmatization(frame['clean_tweet'])
train.head()

Unnamed: 0,id,keyword,location,text,target,hashtag,clean_tweet
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,earthquake,-PRON- deed be the reason of this earthquake may allah forgive -PRON- all
1,4,,,Forest fire near La Ronge Sask. Canada,1,no,forest fire near la ronge sask . canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,no,all resident ask to ' shelter in place ' be be notify by officer . no other evacuation or shelter in place order be expect
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,wildfires,", people receive wildfire evacuation order in california"
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,"Alaska, wildfires",just get send this photo from ruby alaska as smoke from wildfire pour into a school


#### Preparing ELMo vectors

In [11]:
# Environment setup commands, kept for reference (not executed by this cell):
# !pip install "tensorflow>=1.7.0"
# !pip install tensorflow-hub
# !pip uninstall tensorflow tensorflow_hub tensorflowjs
# !pip install tensorflow==2.0.0a0 tensorflow_hub==0.5.0 tensorflowjs==1.2.6
import tensorflow_hub as hub
# NOTE(review): the next two imports bind the same name; after both have run,
# `tf` refers to tensorflow.compat.v1, so the first binding is never used.
import tensorflow as tf
import tensorflow.compat.v1 as tf
# Turn eager execution off; the cells below build ops and run them in tf.Session.
tf.disable_eager_execution()

import ssl
# SECURITY NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. certificate verification is switched off for every
# client in this process that relies on the default context. Presumably a
# workaround for certificate errors while downloading the hub module —
# confirm it is still needed and prefer fixing the local certificate store.
ssl._create_default_https_context = ssl._create_unverified_context

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [12]:
# Load the ELMo v2 module from TensorFlow Hub (fetched from the URL below).
# NOTE(review): trainable=True looks unnecessary if the module is only used to
# extract features, as in the cells that follow — confirm before changing.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

absl - INFO - Using /var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules to cache modules.
tensorflow - DEBUG - Initialize variable module/aggregation/scaling:0 from checkpoint b'/var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules/9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d/variables/variables' with aggregation/scaling
tensorflow - DEBUG - Initialize variable module/aggregation/weights:0 from checkpoint b'/var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules/9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d/variables/variables' with aggregation/weights
tensorflow - DEBUG - Initialize variable module/bilm/CNN/W_cnn_0:0 from checkpoint b'/var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules/9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d/variables/variables' with bilm/CNN/W_cnn_0
tensorflow - DEBUG - Initialize variable module/bilm/CNN/W_cnn_1:0 from checkpoint b'/var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules/9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d/va

tensorflow - DEBUG - Initialize variable module/bilm/RNN_1/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/kernel:0 from checkpoint b'/var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules/9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d/variables/variables' with bilm/RNN_1/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/kernel
tensorflow - DEBUG - Initialize variable module/bilm/RNN_1/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/projection/kernel:0 from checkpoint b'/var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules/9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d/variables/variables' with bilm/RNN_1/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/projection/kernel
tensorflow - DEBUG - Initialize variable module/bilm/RNN_1/RNN/MultiRNNCell/Cell1/rnn/lstm_cell/bias:0 from checkpoint b'/var/folders/fj/_dh4b8qn3mq_ydqmf0f_jlgdt59p0j/T/tfhub_modules/9bb74bc86f9caffc8c47dd7b33ec4bb354d9602d/variables/variables' with bilm/RNN_1/RNN/MultiRNNCell/Cell1/rnn/lstm_cell/bias
tensorflow - DEBUG - Initialize variable module/bilm/RNN_1

In [28]:
# Single example sentence used to inspect the module's output.
x = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features: apply the module's "default" signature and take the
# "elmo" entry of the returned dict.
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

# Display the symbolic tensor; no session is run here, so no values are computed.
embeddings

tensorflow - INFO - Saver not created because there are no variables in the graph to restore


<tf.Tensor 'module_apply_default_111/aggregation/mul_3:0' shape=(1, 8, 1024) dtype=float32>

In [29]:
# Plain-text representation of the same symbolic tensor.
print(embeddings)

Tensor("module_apply_default_111/aggregation/mul_3:0", shape=(1, 8, 1024), dtype=float32)


In [19]:
def elmo_vectors(x):
    """Return one averaged ELMo vector per text in ``x``.

    The input placeholder, the averaging op and the initializer op are created
    on the first call and cached on the function object. Calling ``elmo(...)``,
    ``tf.reduce_mean`` and the initializers on every batch would add a fresh
    set of ops to the default graph each time, so the graph would keep growing
    with the number of batches processed.

    Args:
        x: Object with a ``tolist()`` method returning a list of strings
            (the notebook passes a pandas Series of cleaned tweets).

    Returns:
        The evaluated result of ``tf.reduce_mean`` over axis 1 of the module's
        "elmo" output for the given texts.
    """
    graph_ops = getattr(elmo_vectors, "_graph_ops", None)
    if graph_ops is None:
        # Build the graph once; the texts are fed through the placeholder.
        sentences = tf.placeholder(tf.string, shape=[None])
        embeddings = elmo(sentences, signature="default", as_dict=True)["elmo"]
        # NOTE(review): the mean runs over the whole of axis 1. If that axis is
        # padded to the longest text of the batch, shorter texts are averaged
        # over their padding positions as well — TODO confirm.
        graph_ops = (
            sentences,
            tf.reduce_mean(embeddings, 1),
            tf.group(tf.global_variables_initializer(), tf.tables_initializer()),
        )
        elmo_vectors._graph_ops = graph_ops
    sentences, mean_embeddings, init_op = graph_ops

    # A fresh session per call, closed by the context manager.
    with tf.Session() as sess:
        sess.run(init_op)
        # return average of ELMo features
        return sess.run(mean_embeddings, feed_dict={sentences: x.tolist()})

In [20]:
# Cut each frame into consecutive chunks of at most `batch_size` rows.
batch_size = 100
list_train = [train[start:start + batch_size] for start in range(0, len(train), batch_size)]
list_test = [test[start:start + batch_size] for start in range(0, len(test), batch_size)]

In [22]:
# Extract ELMo embeddings, one array per batch of cleaned tweets.

elmo_train = []
for batch in list_train:
    elmo_train.append(elmo_vectors(batch['clean_tweet']))

elmo_test = []
for batch in list_test:
    elmo_test.append(elmo_vectors(batch['clean_tweet']))

tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables i

tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables in the graph to restore
tensorflow - INFO - Saver not created because there are no variables i

In [24]:
# Stitch the per-batch arrays back together along axis 0, one array per split.
import numpy as np
elmo_train_new, elmo_test_new = (
    np.concatenate(batches, axis=0) for batches in (elmo_train, elmo_test)
)

In [37]:
# Inspect the first row of the concatenated training embeddings.
elmo_train_new[0]

array([-0.12047485, -0.08234133,  0.06085269, ..., -0.08301005,
        0.07772747, -0.06713372], dtype=float32)

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['target'],
    test_size=0.2,
    random_state=42,
)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Fit a logistic-regression classifier with default hyper-parameters on the
# training portion of the ELMo features; the fitted estimator is the cell's
# displayed value.
lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# Predict labels for the held-out validation rows.
preds_valid = lreg.predict(xvalid)


In [34]:
# F1 score of the validation predictions against the true labels.
f1_score(yvalid, preds_valid)


0.7821138211382114