# Disaster Tweets

The initial plan is to use this notebook for baseline modeling of the preprocessed data.

In [1]:
# imports

# standard library
import warnings

# data
import numpy as np
import pandas as pd

# modeling
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# provides FullTokenizer for the BERT section
import tokenization

# NOTE(review): `stopwords` (used by the stop-word-removal vectorizers below) is not
# imported anywhere in this notebook; presumably `from nltk.corpus import stopwords` --
# confirm and add it here.

# Suppress warnings 
warnings.filterwarnings('ignore')


In [None]:
# read training and test data
train_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/cleaned_train.csv')

In [None]:
train_df.head()

In [None]:
stemmed_train_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/stemmed_train.csv')

In [None]:
stemmed_train_df.head()

In [None]:
lemmatized_train_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/lemmatized_train.csv')

In [None]:
lemmatized_train_df.head()

In [None]:
test_df = pd.read_csv('/Users/davidwalkup/ds-course/projects/Mod4/disaster_tweet_prediction/data/cleaned_test.csv')

In [None]:
test_df.head()

#### How good does my model have to be to outperform the naive approach (i.e., no tweet is about a disaster)?

In [None]:
# Relative frequency of each target class in the training data. Always
# predicting class 0 is correct with probability p_classes[0]; note this is an
# accuracy-style baseline, while the models below are scored with F1.
p_classes = dict(
    train_df['target'].value_counts(normalize=True)
)
naive_approach = p_classes[0]
print(
    'Class probabilities: ', p_classes,
    '\nChance tweet is not about a real disaster: ', np.round(naive_approach, 4),
)

### Bag-of-words features using sklearn CountVectorizer

First set of experiments will include stop words.

In [None]:
count_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                   ngram_range = (1, 2),
                                   binary = True)

In [None]:
train_vector_df = count_vectorizer.fit_transform(cleaned_train_df['text'])

In [None]:
train_vector_df.shape

In [None]:
train_vector_stemmed_df = count_vectorizer.fit_transform(stemmed_train_df['text'])

In [None]:
train_vector_stemmed_df.shape

In [None]:
train_vector_lemma_df = count_vectorizer.fit_transform(lemmatized_train_df['text'])

In [None]:
train_vector_lemma_df.shape

Second set of experiments will remove stop words, to see if that improves performance.

In [None]:
count_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                   stop_words = stopwords.words('english'),
                                   ngram_range = (1, 2),
                                   binary = True)

In [None]:
train_vector_no_stops_df = count_vectorizer.fit_transform(cleaned_train_df['text'])

In [None]:
train_vector_no_stops_df.shape

In [None]:
train_vector_stemmed_no_stops_df = count_vectorizer.fit_transform(stemmed_train_df['text'])

In [None]:
train_vector_stemmed_no_stops_df.shape

In [None]:
train_vector_lemma_no_stops_df = count_vectorizer.fit_transform(lemmatized_train_df['text'])

In [None]:
train_vector_lemma_no_stops_df.shape

Logistic Regression on CountVectorizer treated training data

In [None]:
# Logistic regression (balanced class weights) on the count-vectorized cleaned
# tweets, stop words kept; prints mean and std of the cv=5 F1 scores.
clf_logreg = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf_logreg,
    X=train_vector_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)

print(scores.mean(), '+/-', scores.std())

In [None]:
# LogisticRegressionCV variant (balanced class weights) on the count-vectorized
# cleaned tweets; prints mean and std of the outer cv=5 F1 scores.
clf_logreg_cv = LogisticRegressionCV(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf_logreg_cv,
    X=train_vector_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)

print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on count vectors of the cleaned tweets with stop words
# removed; prints mean and std of the cv=5 F1 scores.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on count vectors of the stemmed tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_stemmed_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on count vectors of the stemmed tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_stemmed_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on count vectors of the lemmatized tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_lemma_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on count vectors of the lemmatized tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_lemma_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

Multinomial Naive Bayes on CountVectorizer-transformed training data

In [None]:
# Multinomial Naive Bayes (default settings) on the count-vectorized cleaned
# tweets, stop words kept; prints mean and std of the cv=5 F1 scores.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on count vectors of the cleaned tweets, stop words removed.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on count vectors of the stemmed tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_stemmed_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on count vectors of the stemmed tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_stemmed_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on count vectors of the lemmatized tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_lemma_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on count vectors of the lemmatized tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vector_lemma_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

Bag-of-words with term-frequency weighting using TF-IDF vectorization

First set will retain stop words.

In [None]:
tf_idf = TfidfVectorizer(ngram_range=(1, 1),
                         max_df=0.5,
                         min_df=2)

In [None]:
train_tfidf_df = tf_idf.fit_transform(cleaned_train_df['text'])

In [None]:
train_tfidf_df.shape

In [None]:
train_tfidf_stemmed_df = tf_idf.fit_transform(stemmed_train_df['text'])

In [None]:
train_tfidf_stemmed_df.shape

In [None]:
train_tfidf_lemmatized_df = tf_idf.fit_transform(lemmatized_train_df['text'])

In [None]:
train_tfidf_lemmatized_df.shape

Second set will remove stop words.

In [None]:
tf_idf = TfidfVectorizer(stop_words = stopwords.words('english'),
                         ngram_range=(1, 1),
                         max_df=0.5,
                         min_df=2)

In [None]:
train_tfidf_no_stops_df = tf_idf.fit_transform(cleaned_train_df['text'])

In [None]:
train_tfidf_no_stops_df.shape

In [None]:
train_tfidf_stemmed_no_stops_df = tf_idf.fit_transform(stemmed_train_df['text'])

In [None]:
train_tfidf_stemmed_no_stops_df.shape

In [None]:
train_tfidf_lemmatized_no_stops_df = tf_idf.fit_transform(lemmatized_train_df['text'])

In [None]:
train_tfidf_lemmatized_no_stops_df.shape

Logistic Regression on TF-IDF treated training data

In [None]:
# Logistic regression (balanced class weights) on TF-IDF features of the
# cleaned tweets, stop words kept; prints mean and std of the cv=5 F1 scores.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on TF-IDF features of the stemmed tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_stemmed_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on TF-IDF features of the lemmatized tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_lemmatized_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on TF-IDF features of the cleaned tweets, stop words removed.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on TF-IDF features of the stemmed tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_stemmed_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Logistic regression on TF-IDF features of the lemmatized tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = LogisticRegression(class_weight='balanced')
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_lemmatized_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

Multinomial Naive Bayes on TF-IDF-transformed training data

In [None]:
# Multinomial Naive Bayes (default settings) on TF-IDF features of the cleaned
# tweets, stop words kept; prints mean and std of the cv=5 F1 scores.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on TF-IDF features of the stemmed tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_stemmed_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on TF-IDF features of the lemmatized tweets, stop words kept.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_lemmatized_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on TF-IDF features of the cleaned tweets, stop words removed.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on TF-IDF features of the stemmed tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_stemmed_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

In [None]:
# Multinomial Naive Bayes on TF-IDF features of the lemmatized tweets, stop words removed.
# NOTE(review): labels come from train_df; assumes identical row order -- confirm.
clf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_tfidf_lemmatized_no_stops_df,
    y=train_df["target"],
    scoring="f1",
    cv=5,
)
print(scores.mean(), '+/-', scores.std())

Attempting to use TensorFlow & BERT

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with zeros.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every output row.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of numpy arrays, one row per
        text. ``masks`` is 1 over real tokens and 0 over padding;
        ``segment_ids`` is all zeros.
    """
    token_budget = max_len - 2  # leave room for [CLS] and [SEP]
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:token_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = [0] * (max_len - used)

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += padding

        id_rows.append(ids)
        mask_rows.append([1] * used + padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def build_model(bert_layer, max_len=512, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Three int32 inputs of length ``max_len`` (word ids, input mask, segment
    ids) are fed to ``bert_layer``; the vector at sequence position 0 is
    passed through a single sigmoid unit.

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, of which the second is indexed as
            ``[batch, position, feature]`` (presumably a TF-Hub BERT layer --
            confirm against the module actually loaded).
        max_len: Fixed sequence length of each input.
        learning_rate: Adam learning rate; the default keeps the previously
            hard-coded value of 2e-6.

    Returns:
        A compiled ``Model`` with binary cross-entropy loss and an accuracy
        metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of every sequence is used as the classification feature vector.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
%%time
# Load the BERT module from TensorFlow Hub as a Keras layer; trainable=True is
# passed so the layer's weights can be updated when the model is fit.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Build the tokenizer from the vocabulary file path and lower-casing flag
# exposed by the loaded hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode the train and test tweet text into (token ids, masks, segment ids)
# arrays with 160 positions per tweet, and pull out the training labels.
train_input = bert_encode(
    texts=train_df["text"].values,
    tokenizer=tokenizer,
    max_len=160,
)
test_input = bert_encode(
    texts=test_df["text"].values,
    tokenizer=tokenizer,
    max_len=160,
)
train_labels = train_df["target"].values

In [None]:
# Assemble the classifier with the same sequence length used for encoding (160)
# and show its layer summary.
model = build_model(
    bert_layer=bert_layer,
    max_len=160,
)
model.summary()

In [None]:
# Train for 5 epochs, holding out 20% of the training data for validation.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Model outputs for the encoded test tweets (sigmoid outputs, per build_model).
test_pred = model.predict(test_input)

In [None]:
# NOTE(review): disabled submission step -- `submission` is not defined anywhere
# in the visible notebook; load the submission template before re-enabling.
# submission['target'] = test_pred.round().astype(int)
# submission.to_csv('disaster_tweet_submission.csv', index=False)