In [23]:
import re
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy
import string
import io
import tqdm
import os
import shutil
from spellchecker import SpellChecker

from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [24]:
# Shared English stop-word set; later cells extend it and test tokens against it.
from spacy.lang.en.stop_words import STOP_WORDS

# Small English spaCy pipeline, used further down to lemmatize the tweets.
nlp = spacy.load('en_core_web_sm')

# Raw training data, read from the current working directory.
train_df = pd.read_csv('train.csv')

In [25]:
# Spell checker instance; also bound as the default of correct_spellings() below.
spell = SpellChecker()
# Add the one-letter tokens "m" and "s" to the stop words (presumably the
# fragments left by contractions such as "I'm" / "it's" once punctuation is
# stripped). They are passed as one explicit collection: set.update() iterates
# over each argument, so bare strings only work for single characters.
STOP_WORDS.update({'m', 's'})
def correct_spellings(x, spell=spell):
    """Replace the misspelled words of a tweet with the checker's best guess.

    Args:
        x: Tweet text; it is split on whitespace.
        spell: Object exposing ``unknown(words)`` and ``correction(word)``.
            Defaults to the module-level ``spell`` instance.

    Returns:
        The words of ``x`` joined by single spaces. A word is replaced only
        when it is reported as unknown and a non-``None`` correction exists.
    """
    words = x.split()
    misspelled = spell.unknown(words)

    # Look up each distinct misspelled word once: the original called
    # spell.correction() twice per occurrence, and it is the expensive step.
    corrections = {}
    for word in set(words):
        if word in misspelled:
            suggestion = spell.correction(word)
            if suggestion is not None:
                corrections[word] = suggestion

    return " ".join(corrections.get(word, word) for word in words)

# Compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def tweets_cleaning(x, correct_spelling=False, remove_emojis=True, remove_stop_words_lemmatization=True):
    """Normalize one tweet into a lower-case, space-separated string.

    Steps, in order: lower-case; remove URLs; keep only alphanumeric
    characters and spaces; optionally drop non-ASCII characters; collapse
    whitespace; optionally spell-correct; optionally lemmatize and remove
    stop words.

    Args:
        x: Raw tweet text.
        correct_spelling: If True, run ``correct_spellings`` on the text.
        remove_emojis: If True, drop every non-ASCII character. This removes
            emojis but also accented letters.
        remove_stop_words_lemmatization: If True, replace tokens by their
            spaCy lemma and drop those found in ``STOP_WORDS``.

    Returns:
        The cleaned tweet.
    """
    # Lower-case and turn every whitespace run (including newlines and tabs)
    # into one space. Without this the alphanumeric filter below deletes
    # newlines and glues the neighbouring words together.
    x = _WHITESPACE_PATTERN.sub(' ', x.lower())

    # Remove links. The former second "html" pattern was another URL regex
    # that could never match after this one, so it has been dropped.
    x = _URL_PATTERN.sub('', x)

    # Remove everything that is neither alphanumeric nor a space.
    x = ''.join(ch for ch in x if ch.isalnum() or ch == ' ')

    if remove_emojis:
        x = x.encode('ascii', 'ignore').decode('ascii')

    # Removing URLs and symbols leaves gaps; collapse them with a real regex.
    # The original used str.replace(" +", " "), which looks for the literal
    # two-character string " +" and therefore never collapsed anything.
    x = _WHITESPACE_PATTERN.sub(' ', x).strip()

    if correct_spelling:
        x = correct_spellings(x)
    if remove_stop_words_lemmatization:
        x = " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) and (token.text not in STOP_WORDS)])
    return x

## Clean every training tweet and store the result in a new column
## (only train_df is processed in this cell).
train_df['text_clean'] = train_df['text'].map(tweets_cleaning)

In [2]:
# Matches http:// and https:// links. Written as a raw string so the regex
# escapes "\(" and "\)" reach the `re` module unchanged instead of being
# reported as invalid string escape sequences by the Python compiler.
pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def remove_html(text):
    """Return ``text`` with every substring matched by ``pattern`` removed.

    Despite its name, this removes http/https URLs, not HTML markup.
    Surrounding whitespace is left untouched.
    """
    return pattern.sub('', text)

# Remove http(s) links from the raw tweets.
# NOTE(review): this overwrites the 'text' column in place, and the cell as a
# whole recomputes 'text_clean'; its execution count suggests it comes from an
# earlier session - confirm whether it is still needed.
train_df['text'] = train_df['text'].apply(remove_html)

# Keep only alphanumeric characters and spaces.
train_df["text_clean"] = train_df["text"].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch == " "))
# Collapse runs of spaces, lower-case and trim. A regex is required here:
# str.replace(" +", " ") only matches the literal two-character string " +".
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: re.sub(r" +", " ", x).lower().strip())
# Lemmatize with spaCy and drop tokens whose text or lemma is a stop word.
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) and (token.text not in STOP_WORDS)]))

# Word-frequency table for a column of texts
def counter_word(text):
    """Count how often each whitespace-separated word occurs.

    Args:
        text: Object with a ``.values`` attribute that yields strings
            (the notebook passes a pandas Series).

    Returns:
        A dict mapping each word to its number of occurrences, in order of
        first appearance.
    """
    frequencies = {}
    for document in text.values:
        for token in document.split():
            frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

# Word frequencies over all cleaned tweets.
counter = counter_word(text=train_df['text_clean'])

In [26]:
# Word-level tokenizer limited by num_words=1000. Per the Keras documentation
# only the num_words-1 most frequent words are kept, and token ids start at 1.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
# Build the word -> id mapping from the cleaned tweets.
tokenizer.fit_on_texts(train_df["text_clean"])
# Encode each tweet as a list of token ids (see the "text_encoded" column).
train_df["text_encoded"] = tokenizer.texts_to_sequences(train_df["text_clean"])

In [27]:
# Seed passed to generate_training_data() below.
SEED=123
# Vocabulary size for the sampling table and the embedding layers; this is the
# tokenizer's num_words setting (1000).
vocab_size = tokenizer.num_words

In [28]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Build word2vec training examples from int-encoded sentences.

  For every positive (target, context) skip-gram pair, ``num_ns`` negative
  context ids are drawn, giving one example with ``num_ns + 1`` candidates.

  Args:
    sequences: Iterable of int-encoded sentences.
    window_size: Skip-gram window size.
    num_ns: Number of negative samples per positive pair.
    vocab_size: Vocabulary size, used for the sampling table and as the
      upper bound of the negative sampler.
    seed: Seed handed to the negative sampler.

  Returns:
    Three equally long lists ``(targets, contexts, labels)``:
    targets are tensors of shape (1, 1); contexts are int64 tensors of shape
    (num_ns + 1, 1) with the true context first; labels are int64 tensors
    ``[1, 0, ..., 0]`` of length num_ns + 1.
  """
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in dataset and produce a progress bar.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table, # probability distribution to draw from
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      # create a tensor with shape (1, 1)
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      # Draw the negative context ids. The caller's `seed` is used here; the
      # previous code read the global SEED and silently ignored the argument.
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1) # shape (num_ns, 1)

      # shape of context (num_ns+1, 1): true context first, then the negatives
      context = tf.concat([context_class, negative_sampling_candidates], 0)
      # create the label vector: 1 for the true context, 0 for each negative
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # shape of target word (1,1)
      target_word = tf.constant(target_word,shape=(1,1))

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [29]:
# Build the skip-gram training set from the encoded tweets:
# window of 2 words and 6 negative samples per positive pair.
targets, contexts, labels = generate_training_data(
    sequences=train_df["text_encoded"],
    window_size=2,
    num_ns=6,
    vocab_size=vocab_size,
    seed=SEED)
# The three lists must have the same length (one entry per training example).
print(len(targets), len(contexts), len(labels))

100%|██████████| 7613/7613 [00:02<00:00, 2695.50it/s]

12684 12684 12684





In [30]:
BATCH_SIZE = 1024
# Shuffle buffer as large as the dataset, i.e. a full shuffle.
BUFFER_SIZE = len(targets)
# Each element is ((target, context), label), matching the model's two inputs.
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# drop_remainder=True discards the final, incomplete batch so that every
# batch has exactly BATCH_SIZE examples.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024, 1, 1), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 7, 1), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 7), dtype=tf.int64, name=None))>


In [31]:
# Cache the batched dataset and prefetch batches, letting tf.data pick the
# prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024, 1, 1), dtype=tf.int32, name=None), TensorSpec(shape=(1024, 7, 1), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 7), dtype=tf.int64, name=None))>


In [32]:
# Size of each word vector.
embedding_dim = 128

# Embedding of the target word. This is the layer whose weights are exported
# and reused for the tweet embeddings later on. mask_zero=True makes id 0 a
# masked (padding) value.
target_embedding = Embedding(vocab_size,
                            embedding_dim,
                            input_length=1,
                            name="w2v_embedding",
                             mask_zero=True)
# Embedding of the context candidates. input_length=6+1 is one positive context
# plus 6 negatives; it has to match the num_ns=6 used to generate the data.
context_embedding = Embedding(vocab_size,
                              embedding_dim,
                              input_length=6+1,
                              name="context_embedding")
# Dot product along axis 3, which is the embedding axis of the 4-D embedded
# inputs (see the model summary).
dots = Dot(axes=3,name="dot_product")
flatten = Flatten()

In [33]:
# the target word is tensor of shape (batch_size, 1, 1)
# it's a sequence of 1 word with one representation channel
target = tf.keras.Input(shape=[1,1], name="target")

# the context has shape (batch_size, 7, 1)
# it contains 7 words (1 positive context + 6 negative samples)
# with one representation channel
context = tf.keras.Input(shape=[7,1], name="context")

# the target gets embedded
word_emb = target_embedding(target)

# the context gets embedded
context_emb = context_embedding(context)

# we calculate the dot product for the two embeddings
dots_result = dots([context_emb, word_emb])

# flatten the result to one score per context candidate
flat = flatten(dots_result)

# apply the sigmoid to compare with the binary label
output = tf.keras.activations.sigmoid(flat)

# form the model using the inputs and outputs
word2vec = tf.keras.Model(inputs=[target,context], outputs=output)

In [34]:
# Print the layers, output shapes and parameter counts of the model.
word2vec.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 context (InputLayer)           [(None, 7, 1)]       0           []                               
                                                                                                  
 target (InputLayer)            [(None, 1, 1)]       0           []                               
                                                                                                  
 context_embedding (Embedding)  (None, 7, 1, 128)    128000      ['context[0][0]']                
                                                                                                  
 w2v_embedding (Embedding)      (None, 1, 1, 128)    128000      ['target[0][0]']                 
                                                                                            

In [35]:
# Compile the model with the Adam optimizer and binary cross-entropy: each of
# the 7 sigmoid outputs is compared with its 0/1 label.
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.BinaryCrossentropy(),
                 metrics=['accuracy'])

In [36]:
# Write training logs to ./logs so they can be viewed in TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [37]:
# Train the skip-gram model for 60 epochs, logging to TensorBoard.
word2vec.fit(dataset, epochs=60, callbacks=[tensorboard_callback])

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x204d742e530>

In [17]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard

In [38]:
# Inspect the trained target-embedding matrix, fetched by layer name.
word2vec.get_layer('w2v_embedding').get_weights()[0]


array([[ 0.02243414,  0.00511794,  0.00461416, ..., -0.00255553,
         0.03808076,  0.01105988],
       [ 0.03939544,  0.02381322, -0.04401875, ..., -0.00959129,
        -0.03842111, -0.00823138],
       [-0.03562939, -0.03180138,  0.017786  , ...,  0.0480735 ,
         0.00694228, -0.01101474],
       ...,
       [ 0.08900376,  0.05028961,  0.47553134, ...,  0.05058875,
        -0.295977  ,  0.12752183],
       [ 0.19535683, -0.18541452,  0.16551405, ..., -0.08908189,
         0.08490787,  0.29271874],
       [-0.3260565 , -0.06430529,  0.17915635, ..., -0.36229157,
        -0.08289687,  0.3101487 ]], dtype=float32)

In [60]:
# Same matrix read through the layer object: target_embedding is the layer
# named "w2v_embedding" inside the model.
target_embedding.get_weights()[0]

array([[ 0.02243414,  0.00511794,  0.00461416, ..., -0.00255553,
         0.03808076,  0.01105988],
       [ 0.03939544,  0.02381322, -0.04401875, ..., -0.00959129,
        -0.03842111, -0.00823138],
       [-0.03562939, -0.03180138,  0.017786  , ...,  0.0480735 ,
         0.00694228, -0.01101474],
       ...,
       [ 0.08900376,  0.05028961,  0.47553134, ...,  0.05058875,
        -0.295977  ,  0.12752183],
       [ 0.19535683, -0.18541452,  0.16551405, ..., -0.08908189,
         0.08490787,  0.29271874],
       [-0.3260565 , -0.06430529,  0.17915635, ..., -0.36229157,
        -0.08289687,  0.3101487 ]], dtype=float32)

In [39]:
# Trained target-word embedding matrix; row i is the vector of token id i.
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
# Vocabulary as a list indexed by token id, so that vocab[i] labels weights[i].
# Tokenizer ids start at 1, so position 0 holds an empty placeholder for the
# padding id. Taking list(index_word.values())[0:1000] directly would put the
# word with id 1 at position 0 and shift every label by one row.
vocab = [""] + [
    tokenizer.index_word[token_id]
    for token_id in range(1, min(len(weights), len(tokenizer.index_word) + 1))
]

In [40]:
# Number of rows in the embedding matrix (one per token id, including id 0).
len(weights)

1000

In [41]:
# Set up a logs directory, so Tensorboard knows where to look for files.
log_dir='./logs/projector/'
os.makedirs(log_dir, exist_ok=True)

# Export one embedding vector per line (vectors.tsv) and the matching word per
# line (metadata.tsv). The `with` block closes both files even if a write fails.
# NOTE(review): assumes vocab[index] is the word whose token id is `index`, so
# that it labels weights[index] - verify how `vocab` is built.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

# The projector config refers to 'metadata.tsv' by a relative path, which
# TensorBoard resolves against the log directory, so a copy is placed there too.
shutil.copyfile('metadata.tsv', os.path.join(log_dir, 'metadata.tsv'))

In [42]:
from tensorboard.plugins import projector

# Save the weights we want to analyze as a variable. Row 0 belongs to the
# padding id, which has no line in metadata.tsv, so it is dropped here to keep
# the tensor rows and the metadata lines in one-to-one correspondence.
# A separate name is used so that `weights` stays intact and the cell can be
# re-run without dropping one more row each time.
projector_weights = tf.Variable(weights[1:])
# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=projector_weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Relative path; TensorBoard looks for this file inside log_dir.
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
# Now run TensorBoard against the log data we just saved.
%tensorboard --logdir ./logs/projector/

## Clustering


In [43]:
# Encode the cleaned tweets as token-id lists (same call as the one that
# filled train_df["text_encoded"]).
text_encoded = tokenizer.texts_to_sequences(train_df['text_clean'])
# Pad with trailing zeros so that every row has the same length.
text_encoded_padded = tf.keras.preprocessing.sequence.pad_sequences(text_encoded,
                                                                    padding="post")
text_encoded_padded

array([[408, 171,   0, ...,   0,   0,   0],
       [119,   2, 159, ...,   0,   0,   0],
       [439, 323, 288, ...,   0,   0,   0],
       ...,
       [436, 389,   0, ...,   0,   0,   0],
       [ 21, 738,  89, ...,   0,   0,   0],
       [107,  24, 340, ...,   0,   0,   0]])

In [44]:
# Layer that averages the word vectors of a tweet over the sequence axis.
avg_pooling = tf.keras.layers.GlobalAveragePooling1D()

In [45]:
# Look up the trained word vector of every token id in the padded tweets.
emebedded_text = target_embedding(text_encoded_padded)

In [46]:
# Expected layout: (number of tweets, padded length, embedding_dim).
emebedded_text.shape

TensorShape([7613, 18, 128])

In [47]:
# One vector per tweet: the average of its word vectors.
# NOTE(review): target_embedding was built with mask_zero=True, so padding is
# presumably excluded from the average; tweets without any in-vocabulary token
# would then average over zero items and yield NaN rows - confirm.
avg_embedded_text = avg_pooling(emebedded_text).numpy()

In [48]:
# Expected layout: (number of tweets, embedding_dim).
avg_embedded_text.shape

(7613, 128)

In [50]:
# Tweet embeddings as a DataFrame: one row per tweet, one column per dimension.
# The default RangeIndex lines up with the row positions of train_df.
df = pd.DataFrame(avg_embedded_text)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.145224,0.296751,-0.057229,0.304147,0.185177,0.145016,0.248525,-0.094235,-0.191815,-0.197466,...,-0.137379,0.058359,-0.054496,0.055996,0.231555,-0.126948,0.090376,-0.101733,-0.030216,0.222704
1,-0.035157,-0.135679,0.165873,-0.031736,0.12542,0.008071,0.072617,-0.07485,0.04936,-0.143887,...,-0.093917,0.09341,-0.101558,0.242465,0.138486,0.014551,0.115147,-0.069421,-0.020198,0.042849
2,-0.07895,-0.166807,0.074829,0.086263,0.137971,0.218548,0.133346,-0.102556,0.178981,-0.148829,...,-0.02739,0.072285,-0.131755,-0.285132,0.105809,-0.227209,-0.037434,-0.056259,-0.197392,0.201401
3,-0.149976,-0.021635,0.024069,0.025348,0.131577,0.087046,0.11175,-0.091035,0.101343,-0.284371,...,-0.196501,0.067714,-0.192414,-0.026927,0.179924,-0.201999,0.071228,-0.032025,-0.23878,0.225671
4,-0.13725,-0.28567,0.139986,0.079331,0.095626,0.034978,0.072431,-0.185988,0.145994,-0.174375,...,-0.196483,0.162876,-0.186406,0.117287,0.130711,-0.135128,0.13982,-0.045019,-0.120225,0.169081


In [51]:
# Count the missing values per embedding dimension.
df.isnull().sum()

0      68
1      68
2      68
3      68
4      68
       ..
123    68
124    68
125    68
126    68
127    68
Length: 128, dtype: int64

In [52]:
# Drop tweets whose embedding contains NaN. The original index labels are kept,
# which the later join with train_df relies on.
df = df.dropna()

In [53]:
# Using the Elbow method to find the optimal number K of clusters:
# fit K-Means for K = 1..10 and record the within-cluster sum of squares
# (inertia) of each fit.
from sklearn.cluster import KMeans
wcss =  []
for i in range (1,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(df)
    wcss.append(kmeans.inertia_)

In [54]:
# Create a DataFrame that will be fed to plotly (a single column of inertias)
wcss_frame = pd.DataFrame(wcss)

# Using Plotly to visualize elbow 
import plotly.express as px 

# Creating a line plot; index+1 turns the 0-based row index into K = 1..10
fig = px.line(wcss_frame, x=wcss_frame.index+1, y=wcss_frame.iloc[:, -1])

# Set the title and axis labels
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

# Render in notebook
fig.show()

In [55]:
# Import silhouette score
from sklearn.metrics import silhouette_score

# Compute the mean silhouette score for each K
sil = []

## Careful, you need to start at i=2 as silhouette score cannot accept less than 2 labels 
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(df)
    # Score the assignment of the same points the model was fitted on
    sil.append(silhouette_score(df, kmeans.predict(df)))
    print("Silhouette score for K={} is {}".format(i, sil[-1]))

Silhouette score for K=2 is 0.03969899192452431
Silhouette score for K=3 is 0.03397141396999359
Silhouette score for K=4 is 0.028314219787716866
Silhouette score for K=5 is 0.028913697227835655
Silhouette score for K=6 is -0.0030982790049165487
Silhouette score for K=7 is 0.01722247712314129
Silhouette score for K=8 is 0.01410001516342163
Silhouette score for K=9 is 0.024919455870985985
Silhouette score for K=10 is 0.01066309493035078


In [56]:
# Create a data frame of silhouette scores indexed by K = 2..10
cluster_scores=pd.DataFrame(sil, index=range(2,11))

# Create figure: one bar per K
fig = px.bar(data_frame=cluster_scores,  
             x=cluster_scores.index, 
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

# Render
fig.show()
# Alternative when the default renderer is unavailable (e.g. in a workspace):
# fig.show(renderer="iframe")

In [57]:
# Final model with K=2, the K with the highest silhouette score above.
kmeans = KMeans(n_clusters= 2, init = "k-means++", random_state = 0)

# Fit on the embedding columns only. If this cell is re-run, df already holds
# a "cluster" column; without this guard the previous labels would be fed back
# in as a feature.
cluster_features = df.drop(columns="cluster", errors="ignore")
kmeans.fit(cluster_features)

df["cluster"]=kmeans.predict(cluster_features)

In [58]:
# Attach the cluster label to the tweets by index. The inner join keeps only
# the rows that still exist in df, i.e. it drops the tweets removed by dropna().
clusterised = train_df.join(df["cluster"],how="inner")
clusterised.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,text_encoded,cluster
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,"[408, 171]",1
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[119, 2, 159, 488]",1
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...,"[439, 323, 288, 186, 323, 276, 355]",0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...,"[5, 67, 186, 276, 33]",0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got send photo ruby alaska smoke wildfire pour...,"[264, 179, 123, 172, 67, 111]",0


In [61]:
# Split the tweets by assigned cluster.
cluster_0 = clusterised[clusterised["cluster"]==0]
cluster_1 = clusterised[clusterised["cluster"]==1]

In [62]:
# Distribution of the 'target' label inside cluster 0.
cluster_0['target'].value_counts()

1    1682
0    1497
Name: target, dtype: int64

In [63]:
# Distribution of the 'target' label inside cluster 1.
cluster_1['target'].value_counts()

0    2792
1    1574
Name: target, dtype: int64

In [64]:
# Distribution of the 'target' label over all clustered tweets (baseline).
clusterised['target'].value_counts()

0    4289
1    3256
Name: target, dtype: int64

In [66]:
# Share of each 'target' value in the full training set.
train_df['target'].value_counts()/train_df.shape[0]

0    0.57034
1    0.42966
Name: target, dtype: float64