# Word2Vec model precomputing

In [9]:
from gensim.models import Word2Vec
import pandas as pd
import string
PUNCT_TO_REMOVE = string.punctuation + '\n\t' + '…'
print(PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
from nltk.corpus import stopwords
print(", ".join(stopwords.words('english')))
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_urls(text):
    return re.sub(r"http\S+", "", text)


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
	…
i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn'

In [10]:
tweets = pd.read_csv("tweets/covid19_tweets.csv")
tweets = tweets[['text']]
tweets.text = tweets.text.apply(str)
tweets.text = tweets.text.str.lower()
tweets.text = tweets.text.apply(lambda tweet: remove_urls(tweet))
tweets.text = tweets.text.apply(lambda tweet: remove_punctuation(tweet))
tweets.text = tweets.text.apply(lambda tweet: remove_numbers(tweet))
tweets.text = tweets.text.apply(lambda tweet: lemmatize_words(tweet))
tweets.text = tweets.text.apply(lambda tweet: remove_stopwords(tweet))
tweets.head()

Unnamed: 0,text
0,smelled scent hand sanitizers today someone pa...
1,hey yankee yankeespr mlb wouldnt made sense pl...
2,diane wdunlap realdonaldtrump trump never clai...
3,brookbanktv one gift covid ha give appreciatio...
4,july medium bulletin novel coronavirusupdates ...


In [11]:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
data_tok = []
for string in tweets.text:
    data_tok.append([el.lower() for el in tokenizer.tokenize(string)])
data_tok[:2]

[['smelled',
  'scent',
  'hand',
  'sanitizers',
  'today',
  'someone',
  'past',
  'would',
  'think',
  'intoxicated'],
 ['hey',
  'yankee',
  'yankeespr',
  'mlb',
  'wouldnt',
  'made',
  'sense',
  'player',
  'pay',
  'respect']]

In [14]:
model = Word2Vec(sentences=data_tok, size=2, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [16]:
model.wv['covid']

array([-2.2529435,  5.559278 ], dtype=float32)

In [18]:
model.wv['coronavirus']

array([-4.2661886,  5.5407896], dtype=float32)

# Common appearances matrix precomputing

In [27]:
import numpy as np
from tqdm import tqdm
common_len = 1000

# get 1000 most frequent words with the length 4 or more
from collections import Counter
cnt = Counter()
for tweet in tweets.text.values:
    for word in tweet.split():
        if len(word) >= 4 and ('’' not in list(word)):
            cnt[word] += 1
        
most_common_words = cnt.most_common(common_len)

inds = np.arange(common_len)
mesh = np.array(np.meshgrid(inds, inds))
combinations = mesh.T.reshape(-1, 2)
words = []
freqs = []
for word in most_common_words:
    words.append(word[0])
    freqs.append(word[1])
word_dict = {'word': words, 'word_freq': freqs}  
word_df = pd.DataFrame(word_dict) 
# saving the dataframe 
word_df.to_csv('word_df.csv') 

In [35]:
COMMONWORDS = words 
def leave_only_common_words(text):
    st = set([word for word in str(text).split() if word in COMMONWORDS])
    return st
common_sets = tweets.text.apply(lambda tweet: leave_only_common_words(tweet)).values

In [43]:
word_dict = dict(zip(words, range(common_len)))

In [66]:
from tqdm import tqdm
appearance_matrix = np.zeros((common_len, common_len))
tweets_list = []
for common_set in tqdm(common_sets, position=0):
    common_inds = [word_dict[word] for word in common_set]
    tweets_list.append(common_inds)
    mesh = np.array(np.meshgrid(common_inds, common_inds))
    common_combinations = mesh.T.reshape(-1, 2)
    for combination in common_combinations:
        appearance_matrix[combination[0], combination[1]] += 1

100%|████████████████████████████████████████████████████████████████████████| 179108/179108 [00:21<00:00, 8226.00it/s]


In [69]:
tweets['words'] = tweets_list
original_tweets = pd.read_csv("tweets/covid19_tweets.csv")['text']
tweets.text = original_tweets
tweets.to_csv("tweets_words.csv")
tweets.head()

Unnamed: 0,text,words
0,If I smelled the scent of hand sanitizers toda...,"[303, 246, 165, 69, 10, 56]"
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,"[759, 598, 144]"
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,"[417, 23, 153, 18, 365, 741, 0]"
3,@brookbanktv The one gift #COVID19 has give me...,"[145, 700, 90, 0, 109, 280]"
4,25 July : Media Bulletin on Novel #CoronaVirus...,"[86, 614, 0, 544, 163]"


In [65]:
appearance_matrix = appearance_matrix.astype(int)
np.savetxt("appearance_matrix.csv", appearance_matrix, delimiter=",", fmt='%d')

In [62]:
most_common_words

[('covid', 108323),
 ('case', 20194),
 ('coronavirus', 14184),
 ('death', 9849),
 ('people', 9285),
 ('pandemic', 8071),
 ('mask', 6488),
 ('health', 5227),
 ('need', 4963),
 ('positive', 4875),
 ('today', 4718),
 ('time', 4700),
 ('total', 4455),
 ('test', 4401),
 ('india', 4358),
 ('like', 4147),
 ('help', 4110),
 ('vaccine', 3933),
 ('trump', 3785),
 ('school', 3768),
 ('state', 3725),
 ('number', 3716),
 ('last', 3576),
 ('realdonaldtrump', 3560),
 ('world', 3446),
 ('country', 3376),
 ('update', 3361),
 ('news', 3358),
 ('many', 3261),
 ('know', 3255),
 ('report', 3226),
 ('august', 2961),
 ('first', 2953),
 ('reported', 2910),
 ('spread', 2870),
 ('virus', 2821),
 ('lockdown', 2771),
 ('take', 2769),
 ('even', 2745),
 ('still', 2704),
 ('year', 2703),
 ('home', 2695),
 ('life', 2679),
 ('work', 2648),
 ('week', 2634),
 ('government', 2620),
 ('make', 2608),
 ('back', 2596),
 ('tested', 2559),
 ('testing', 2515),
 ('patient', 2482),
 ('good', 2481),
 ('business', 2454),
 ('america