In [None]:
import numpy as np
import re
import pandas as pd
from time import time
from collections import defaultdict
import en_core_web_sm

import logging
logging.basicConfig(format = "%(levelname)s - %(asctime)s: %(message)s", datefmt = '%H:%M:%S', level = logging.INFO)

from pymongo import MongoClient

In [None]:
# Training set: tweets collected between 2020-04-16 and 2020-04-25.
# NOTE(review): the filter only sets an upper bound on timestamp_CAD; the
# 2020-04-16 lower bound is presumably implied by when collection started — confirm.
# NOTE(review): the bound is a plain string literal, so whether tweets stamped
# during 2020-04-25 itself match depends on the stored timestamp format — confirm.
client = MongoClient("mongodb:...")
db = client.CPCS340
training_filter = {"timestamp_CAD": {'$lte': "2020-04-25"}}
tweets = db.covid.find(training_filter)
print(tweets.count())

In [None]:
# Extracting the date and content of the first n tweets
n = tweets.count()
print(n)
counter = 0
tweet_date = []
tweet_content = []

for doc in tweets:
    # Stop once n documents have been examined (a `> n` check after the
    # increment would let n + 1 documents through).
    if counter >= n:
        break
    counter += 1

    # Read both fields before appending anything: a document that lacks either
    # one is skipped as a whole, so the two lists always stay the same length.
    try:
        date, content = doc['timestamp_CAD'], doc['text']
    except KeyError:
        continue

    tweet_date.append(date)
    tweet_content.append(content)

In [None]:
# One row per tweet: first column is the date, second column is the text.
data = np.column_stack((tweet_date, tweet_content))
df = pd.DataFrame(data, columns=['Date', 'text'])

#### Cleaning content

We lemmatize each tweet and remove stopwords and non-alphabetic characters.

In [None]:
# Eliminating URL links from the tweets.
# The pattern is compiled once and reused for every row.
URL_PATTERN = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')

# Iterate over the column values directly instead of doing one label lookup
# per row driven by the length of a different list. Each URL becomes a space.
text = [URL_PATTERN.sub(" ", tweet) for tweet in df['text']]

# From here on df holds only the URL-free text (the Date column is not carried over).
df = pd.DataFrame(text, columns = ['text'])
print(df.shape)

In [None]:
def cleaning(doc):
    """Lemmatize a parsed tweet and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens (a spaCy Doc in this notebook); each token must
        expose ``is_stop`` and ``lemma_``.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or None when two or
        fewer lemmas remain. Word2Vec learns a word from its context words, so
        such short sentences contribute very little to training.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Lazily normalise each tweet: every run of characters other than ASCII letters
# and apostrophes becomes a single space, then the text is lowercased.
# This is a generator, so it can be consumed only once.
non_letters = re.compile("[^A-Za-z']+")
brief_cleaning = (
    non_letters.sub(' ', str(tweet)).lower()
    for tweet in df['text']
)

In [None]:
# Only lemmas and stop-word flags are read from the parsed tweets, so the
# dependency parser and named-entity recogniser are switched off for speed.
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

In [None]:
# Run every normalised tweet through the spaCy pipeline and clean it,
# reporting the elapsed wall-clock time in minutes.
t = time()
parsed_docs = nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)
txt = list(map(cleaning, parsed_docs))
elapsed_mins = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

In [None]:
# Discard tweets that cleaning() rejected (None) and exact duplicates.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

In [None]:
# Preview the first ten cleaned tweets
df_clean.head(10)

#### Bigrams

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Phrases() expects a list of token lists, so split each cleaned tweet on whitespace.
sent = list(map(str.split, df_clean['clean']))

In [None]:
# Build the phrase (bigram) model from the tokenised tweets.
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [None]:
# Phraser() cuts down the memory consumption of Phrases() by discarding
# model state that is not strictly needed for the bigram detection task.
# Indexing it with the corpus transforms the corpus based on the detected bigrams.
bigram = Phraser(phrases)
sentences = bigram[sent]

#### Most Frequent Words:
This is mainly a sanity check that the lemmatization, stopword removal, and bigram detection worked as intended.

In [None]:
# Count how often each token (bigrams included) occurs in the transformed corpus.
# The loop variables deliberately avoid the name `sent`: rebinding it here would
# overwrite the token-list corpus built earlier, so the Phrases()/Phraser cells
# could no longer be re-run correctly.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
len(word_freq)

In [None]:
# The 30 most frequent tokens, most frequent first
sorted(word_freq, key=word_freq.get, reverse=True)[:30]

#### Training the model

In [None]:
import multiprocessing
from gensim.models import Word2Vec

In [None]:
# Number of CPU cores on this machine; used below to size the Word2Vec worker pool
cores = multiprocessing.cpu_count()
cores

In [None]:
# Configure the model only; the vocabulary is built and the model trained in later cells.
# One core is left free for the rest of the system, but max(1, ...) guarantees at
# least one worker: on a single-core machine cores-1 would be 0 worker threads.
w2v_model = Word2Vec(min_count=20,
                     window=3,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))

#### Building the Vocabulary Table
Word2Vec requires us to build the vocabulary table first: it goes through all the words, keeps the unique ones, and records some basic counts for them.

In [None]:
# Build the vocabulary from the bigram-transformed corpus and report the elapsed minutes.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
vocab_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_mins))

In [None]:
# The built vocabulary.
# NOTE(review): this displays every key, which can be a very long output — consider showing a sample.
w2v_model.wv.vocab.keys()

In [None]:
# Train for 10 epochs over the corpus and report the elapsed minutes.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=10,
    report_delay=1,
)
train_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(train_mins))

In [None]:
# We do not plan to train the model any further, so we call init_sims(),
# which makes the model much more memory-efficient.
# replace=True is only appropriate because training is finished.
w2v_model.init_sims(replace=True)

#### Exploring the model
##### Most similar to:

In [None]:
# covid: the 35 most similar words
w2v_model.wv.most_similar(positive=["covid"],topn=35)
# We chose: illness, suspect, complication, severe, immune

In [None]:
# sick: the 35 most similar words
w2v_model.wv.most_similar(positive=["sick"], topn = 35)
# We chose: wear_mask, ill, stay_home, feel_like, distancing

In [None]:
# fever: the 35 most similar words
w2v_model.wv.most_similar(positive=["fever"],topn = 35)
# We chose: breath, infect, contagious, cause_death

In [None]:
# tiredness (queried through the token "tired"): the 35 most similar words
w2v_model.wv.most_similar(positive=["tired"], topn = 35)
# We chose: irresponsible

In [None]:
# cough: the 35 most similar words
w2v_model.wv.most_similar(positive=["cough"],topn=35)
# We chose: immune, sneeze, spread, disease, respiratory, antibody, severe, expose

In [None]:
# pain: the 35 most similar words (restrict_vocab=None is passed explicitly)
w2v_model.wv.most_similar(positive=["pain"],topn=35, restrict_vocab=None)
# We chose: ambulance

In [None]:
# nose: the 25 most similar words
w2v_model.wv.most_similar(positive=["nose"],topn=25)
# We did not choose any word here; "smell" is already used as a keyword

In [None]:
# headache: the 25 most similar words.
# "headache" is not included in the vocabulary, and most_similar() raises
# KeyError for unknown words, which would stop a Restart & Run All here.
# Check membership first and display a short note instead.
headache_neighbours = (
    w2v_model.wv.most_similar(positive=["headache"], topn=25)
    if "headache" in w2v_model.wv
    else "'headache' is not in the vocabulary"
)
headache_neighbours

In [None]:
# diagnose: the 35 most similar words
w2v_model.wv.most_similar(positive=["diagnose"],topn=35)
# We chose: infected, surveillance

##### Similarities

In [None]:
# Similarity score between the tokens "covid" and 'cause_death'
w2v_model.wv.similarity("covid", 'cause_death')

##### Odd-One-Out:
Here, we ask our model to give us the word that does not belong to the list!

In [None]:
# Ask the model which word of the list does not belong with the others.
# NOTE(review): the corpus is lemmatized before training, so the plural 'cases'
# may not be in the vocabulary — confirm, or query 'case' instead.
w2v_model.wv.doesnt_match(['covid', 'death', 'health','cases'])