# Information Retrieval and Web Analytics Project
## Ranking

#### Packages

We first import all the packages that we need for text processing:

In [1]:
import time
import string
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import json
import nltk
import demoji
import re
nltk.download('stopwords');
from deep_translator import GoogleTranslator
from gensim.models.word2vec import Word2Vec
from numpy import linalg as LA

from collections import defaultdict
import math
import numpy as np
import collections
from numpy import linalg as la
import matplotlib.cm as cm
import random
import pandas as pd
import matplotlib.pyplot as plt
import dateutil.parser
import datetime

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data 

**Read the file with the tweets:**

In [2]:
docs_path = 'dataset_tweets_WHO.txt'

# Read the JSON file in a unique string
with open(docs_path) as fp:
    corpus = fp.readlines()[0]

# Load the JSON file as a dictionary
corpus = json.loads(corpus)

In [3]:
print("There are \033[1m%i tweets\033[0m in the dataset" %len(corpus))

There are [1m2399 tweets[0m in the dataset


## Text Processing

### Functions

In [4]:
def italics_to_plaintext(text):
    #difference between an italic lowecase character and its corresponding plaintext lowercase character
    diff_lower = ord('𝘢') - ord('a')
    #difference between an italic uppercase character and its corresponding plaintext uppercase character
    diff_upper = ord('𝘈') - ord('A')
    
    plaintext = ""
    for c in text:
        # if the character is italic lowercase, get the corresponding plaintext lowercase character
        if ord(c) >= ord('𝘢') and ord(c) <= ord('𝘻'):
            plaintext += chr(ord(c) - diff_lower)
        # else if the character is italic uppercase, get the corresponding plaintext uppercase character
        elif ord(c) >= ord('𝘈') and  ord(c) <= ord('𝘡'):
            plaintext += chr(ord(c) - diff_upper)
        else:
            plaintext += c
    
    return plaintext

def bold_to_plaintext(text):
    #difference between a bold lowecase character and its corresponding plaintext lowercase character
    diff_lower = ord('𝐚') - ord('a')
    #difference between a bold uppercase character and its corresponding plaintext uppercase character
    diff_upper = ord('𝐀') - ord('A')
    
    plaintext = ""
    for c in text:
        # if the character is bold lowercase, get the corresponding plaintext lowercase character
        if ord(c) >= ord('𝐚') and ord(c) <= ord('𝐳'):
            plaintext += chr(ord(c) - diff_lower)
        # else if the character is bold uppercase, get the corresponding plaintext uppercase character
        elif ord(c) >= ord('𝐀') and  ord(c) <= ord('𝐙'):
            plaintext += chr(ord(c) - diff_upper)
        else:
            plaintext += c
    
    return plaintext

def getTerms(text, stemming, stops):
    # Text to lowercase
    text = text.lower()
    # Text delete italic letter type if needed
    text = italics_to_plaintext(text)
    # Text delete bold letter type if needed
    text = bold_to_plaintext(text)
    # Delete all urls
    text = re.sub(r'http\S+', ' ', text) 
    # Delete all non-alphanumerical characters (it includes emojis) except '#' and '@'
    text = re.sub(r'[^A-Za-z0-9#@]+', ' ', text)
    # Text tokenization
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stops]
    # Get the stem of each word
    words = [stemming.stem(word) for word in words]
    
    return words

### Process tweets

In [5]:
stemming = PorterStemmer()
# Delete also "amp" (&) and "rt"
stops = set(stopwords.words("english")).union(set({'amp', 'rt'}))

# Dictionary where we'll save all the processed tweets
data = {}
for tweet in corpus:
    
    #In case that the tweet is not in english, we traduce it
    lang = corpus[tweet]['lang']
    if lang != 'en':
        text_tweet = GoogleTranslator(target='en').translate(corpus[tweet]['full_text'])
    else:
        text_tweet = corpus[tweet]['full_text']
    
    # Get the text tokenized and cleaned 
    text_tweet_processed = getTerms(text_tweet, stemming, stops)
    
    if text_tweet_processed != []: #In case that the text is not null                       
        data[tweet] = {}
        data[tweet]['org_text'] = text_tweet
        data[tweet]['text'] = text_tweet_processed
        
        if 'retweeted_status' in corpus[tweet]:
            # Save the original tweet's user
            data[tweet]['user'] = corpus[tweet]['retweeted_status']['user']['name']
            # Save the original tweet's url
            try:
                data[tweet]['url'] = corpus[tweet]['retweeted_status']['entities']['media'][0]['url']
            except: 
                data[tweet]['url'] = ''
        else:
            data[tweet]['user'] = corpus[tweet]['user']['name']
            try:
                data[tweet]['url'] = corpus[tweet]['entities']['media'][0]['url']
            except: 
                data[tweet]['url'] = ''

        # Save all emojis used with its meaning
        data[tweet]['emojis'] = demoji.findall(corpus[tweet]['full_text'])

        # Save creation data
        data[tweet]['date'] = corpus[tweet]['created_at']

        # Save the number of retweets of this tweet
        data[tweet]['retweets'] = corpus[tweet]['retweet_count']

        # Save the number of 'favorites' of this tweet
        data[tweet]['favorites'] = corpus[tweet]['favorite_count']
        
        # Save the hashtags of this tweet
        hashtags = []
        for i in corpus[tweet]['entities']['hashtags']:
            hashtags.append(i['text'])
            data[tweet]['text'].append(i['text'].lower())
    
        data[tweet]['hashtags'] = hashtags

        # Save the full name of all the users mentioned
        data[tweet]['user_mentions'] = list()
        ## List of dictionaries, each with information of a user mentioned
        users_data = corpus[tweet]['entities']['user_mentions']
        for user in users_data:
            data[tweet]['user_mentions'].append(user['name'])        

## Indexing

### Functions

In [6]:
def create_index_tfidf(data, num_d):
    """
    Implement the inverted index and compute tf, df and idf
    
    Input:
    * lines: collection of Wikipedia articles
    * num_d: total number of documents
    
    Returns:
    * index: the inverted index (implemented through a python dictionary) containing terms as keys and the corresponding 
    * list of document these keys appears in (and the positions) as values
    * tf: normalized term frequency for each term in each document
    * df: number of documents each term appear in
    * idf: inverse document frequency of each term
    """
        
    index = defaultdict(list)
    tf = defaultdict(list) #term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int) #document frequencies of terms in the corpus
    idf = defaultdict(float)
    
    for tweet_id in data:   
        terms = data[tweet_id]['text']      

        termdictTweet = {}

        for position, term in enumerate(terms): # Tweet's terms
            try:
                # If the term is already in the dict append the position to the corrisponding list
                termdictTweet[term][1].append(position) 
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                termdictTweet[term] = [tweet_id, array('I',[position])] 
        
        # Normalize term frequencies
        norm = 0
        for term, posting in termdictTweet.items(): 
            norm += len(posting[1])**2
        norm = math.sqrt(norm)


        # Calculate the tf and df weights
        for term, posting in termdictTweet.items():     
            # Append the tf
            tf[term].append(np.round(len(posting[1])/norm,4))
            # Increment the document frequency of current term
            df[term] += len(posting)
        
        # Merge the current tweet index with the main index
        for termtweet, postingtweet in termdictTweet.items():
            index[termtweet].append(postingtweet)
            
        # Compute idf
        for term in df:
            idf[term] = np.round(np.log(float(num_d/df[term])),4)
            
    return index, tf, df, idf

### Create the index

In [7]:
start_time = time.time()
num_d = len(data)
index, tf, df, idf = create_index_tfidf(data, num_d)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time,2)))

Total time to create the index: 128.04 seconds


### Selected queries

In [8]:
q = ["#COVID19",
     "covid vaccine",
     "global pandemic",
     "#mentalhealth",
     "death risk"]

## Ranking

### TF-IDF + Cosine Similarity

#### Implementation

In [9]:
def rankTweets(terms, tweets, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    * terms: list of query terms
    * docs: list of documents, to rank, matching the query
    * index: inverted index data structure
    * idf: inverted document frequencies
    * tf: term frequencies
    * titleIndex: mapping between page id and page title
    
    Returns:
    Print the list of ranked documents
    """
        
    tweetVectors = defaultdict(lambda: [0]*len(terms))
    queryVector = [0]*len(terms)    

    # Compute the norm for the query tf
    query_terms_count = collections.Counter(terms) # get the frequency of each term in the query 
    query_norm = la.norm(list(query_terms_count.values()))
    
    for termIndex, term in enumerate(terms): #termIndex is the index of the term in the query
        if term not in index:
            continue
                    
        ## Compute tf*idf normalized
        queryVector[termIndex] = (query_terms_count[term] / query_norm) * idf[term]

        # Generate tweetVectors for matching docs
        for tweetIndex, (tweet, postings) in enumerate(index[term]):
            if tweet in tweets:
                tweetVectors[tweet][termIndex] = tf[term][tweetIndex]

    # Score of each tweet
    tweetScores = [ [np.dot(curTweetVec, queryVector), tweet] for tweet, curTweetVec in tweetVectors.items() ]
    tweetScores.sort(reverse=True)
    scores = [x[0] for x in tweetScores]
    resultTweets = [x[1] for x in tweetScores]
    #print document titles instead if document id's
    #resultDocs=[ titleIndex[x] for x in resultDocs ]
    if len(resultTweets) == 0:
        print("No results found, try again")
        query = input()
        tweets = search_tf_idf(query, index)    
    #print ('\n'.join(resultDocs), '\n')
    return resultTweets, scores

In [10]:
def search_tf_idf(query, index):
    stemming = PorterStemmer()
    stops = set(stopwords.words("english")).union(set({'amp', 'rt'}))
    
    query = getTerms(query, stemming, stops)
    tweets = set()

    for i, term in enumerate(query):
        try:
            # store in termDocs the ids of the docs that contain "term"                        
            termDocs = [posting[0] for posting in index[term]]
            
            # if this is the first word of the query, save all the tweets with this word
            if i == 0:
                tweets = tweets.union(termDocs)
            # for the next words, only keey those that contain that word and all the past words of the query
            else:
                tweets = tweets.intersection(termDocs)
        except:
            #term is not in index
            pass
    
    tweets = list(tweets)
    ranked_tweets, scores = rankTweets(query, tweets, index, idf, tf)   
    return ranked_tweets, scores

#### Testing the retrieval system

In [11]:
print("\033[1mInsert a query:\033[0m\n")
query = input()
ranked_tweets, scores = search_tf_idf(query, index)    
top = 5

print("\n======================\n\033[1mTop {} results out of {} for the searched query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top]:
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m

covid vaccine

[1mTop 5 results out of 15 for the searched query:[0m


[1mTweet id = 1959[0m
   - Tweet_text = 💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉                 💉💉💉💉
💉💉💉💉                 💉💉💉💉

COVID-19 vaccines     COVID-19 vaccines
in 10 countries             in the rest of the 🌍

#VaccinEquity is 🗝️ to ending the pandemic, together!

#WorldEmojiDay 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Sat Jul 17 16:24:23 +0000 2021 
   - Tweet_hashtags = ['VaccinEquity', 'WorldEmojiDay'] 
   - Tweet_likes = 3486 
   - Tweet_retweets = 1517 
   - Tweet_url =  

[1mTweet id = 2257[0m
   - Tweet_text = Q&amp;A #AskWHO on COVID-19 vaccines effectiveness https://t.co/FEdfOREhjn 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Wed Jun 30 16:12:43 +0000 2021 
   - Tweet_hashtags = ['AskWHO'] 
   - Tweet_likes = 219 
   - Tweet_retweets = 85 
   - Tweet_url =  

[1mTweet id = 904[0m
   - Tweet_text = #COVID19 variants &amp; vaccine

## Your score + cosine similarity

Our score, SBDM, conists on taking into account the metadata from each teweet to rank them given a query. 

#### Implementation

In [12]:
def tweets_population(data, df):
    
    fav_mean = defaultdict()
    rtw_mean = defaultdict()
    
    for tw_id in data:
        for word in data[tw_id]['text']:
            if word not in fav_mean: 
                fav_mean[word] = data[tw_id]['favorites']
                rtw_mean[word] = data[tw_id]['retweets']
            else:
                fav_mean[word] += data[tw_id]['favorites']
                rtw_mean[word] += data[tw_id]['retweets']
                
    for word in fav_mean:
        fav_mean[word] = fav_mean[word] / df[word]
        rtw_mean[word] = rtw_mean[word] / df[word]
    
    return fav_mean, rtw_mean

In [13]:
def compute_diff_date(date):
    return (datetime.datetime.now().replace(tzinfo=None) - date.replace(tzinfo=None)).days

In [14]:
def rankTweets_sbdm(terms, tweets, index, idf, tf, fav_mean, rtw_mean, fav_rate, rtw_rate, date_rate):
    
    #Initialize the dictionaries with the tweets' vectors
    tweetVectors_terms = defaultdict(lambda: [0]*len(terms))
    tweetVectors_fav = defaultdict(lambda: [0]*len(terms))
    tweetVectors_rtw = defaultdict(lambda: [0]*len(terms))
    tweetVectors = defaultdict(lambda: [0]*len(terms))
    
    #Initialize the query vector
    queryVector_terms, queryVector_fav, queryVector_rtw = [0]*len(terms), [0]*len(terms), [0]*len(terms)

    # Compute the norm for the query TF
    query_terms_count = collections.Counter(terms) # get the frequency of each term in the query 
    query_norm = la.norm(list(query_terms_count.values()))
    
    #Get TF-IDF vectors
    for termIndex, term in enumerate(terms): #termIndex is the index of the term in the query
        if term not in index:
            continue
                    
        ## Compute tf*idf normalized
        queryVector_terms[termIndex] = (query_terms_count[term] / query_norm) * idf[term]

        # Generate tweetVectors for matching docs
        for tweetIndex, (tweet, postings) in enumerate(index[term]):
            if tweet in tweets:
                tweetVectors_terms[tweet][termIndex] = tf[term][tweetIndex]
    
    #Get population vectors
    for termIndex, term in enumerate(terms): 
        if term not in index:
            continue
                    
        queryVector_fav[termIndex] = fav_mean[term]
        queryVector_rtw[termIndex] = rtw_mean[term]

        for tweetIndex, (tweet, postings) in enumerate(index[term]):
            if tweet in tweets:
                tweetVectors_fav[tweet][termIndex] = fav_mean[term]
                tweetVectors_rtw[tweet][termIndex] = rtw_mean[term]
    
    
    #We normalize the population vectors
    queryVector_fav = queryVector_fav / la.norm(queryVector_fav)
    queryVector_rtw = queryVector_rtw / la.norm(queryVector_rtw)
    
    for tweet in tweetVectors_fav:
        tweetVectors_fav[tweet] = tweetVectors_fav[tweet] / la.norm(tweetVectors_fav[tweet])
        tweetVectors_rtw[tweet] = tweetVectors_rtw[tweet] / la.norm(tweetVectors_rtw[tweet])    
    
    #Final tweets and query vectors
    for tweet, curTweetVec in tweetVectors_terms.items():
        tweetVectors[tweet] = curTweetVec + rtw_rate*tweetVectors_rtw[tweet] + fav_rate*tweetVectors_fav[tweet]
        tweet_date = dateutil.parser.parse(data[tweet]['date'])
        date_diff = compute_diff_date(tweet_date)
        tweetVectors[tweet] = np.append(tweetVectors[tweet], 1/(date_diff*date_rate))
    
    queryVector = queryVector_terms + rtw_rate*queryVector_rtw + fav_rate*queryVector_fav
    queryVector = np.append(queryVector, 1/date_rate)
    
    #We compute cosine similarity
    tweetScores = [[np.dot(curTweetVec, queryVector), tweet] for tweet, curTweetVec in tweetVectors.items()]
    tweetScores.sort(reverse=True)
    scores = [x[0] for x in tweetScores]
    resultTweets = [x[1] for x in tweetScores]
    if len(resultTweets) == 0:
        print("No results found, try again")
        query = input()
        tweets = search_tf_idf(query, index) 
    return resultTweets, scores

In [15]:
def search_sbdm(query, index, fav_mean, rtw_mean, fav_rate = 1, rtw_rate = 0.5, date_rate = 0.5):
    stemming = PorterStemmer()
    stops = set(stopwords.words("english")).union(set({'amp', 'rt'}))
    
    query = getTerms(query, stemming, stops)
    tweets = set()

    for i, term in enumerate(query):
        try:
            # store in termDocs the ids of the docs that contain "term"                        
            termDocs = [posting[0] for posting in index[term]]
            
            # if this is the first word of the query, save all the tweets with this word
            if i == 0:
                tweets = tweets.union(termDocs)
            # for the next words, only keey those that contain that word and all the past words of the query
            else:
                tweets = tweets.intersection(termDocs)
        except:
            #term is not in index
            pass
    
    tweets = list(tweets)
    ranked_tweets, scores = rankTweets_sbdm(query, tweets, index, idf, tf, fav_mean, rtw_mean, fav_rate, rtw_rate, date_rate)   
    return ranked_tweets, scores

#### Testing

In [17]:
print("\033[1mInsert a query:\033[0m\n")
query = input()
fav_mean, rtw_mean = tweets_population(data, df)
ranked_tweets, scores = search_sbdm(query, index, fav_mean, rtw_mean, fav_rate = 1, rtw_rate = 0.5, date_rate = 0.5)    
top = 20

print("\n======================\n\033[1mTop {} results out of {} for the searched query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top]:
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m

death risk

[1mTop 20 results out of 18 for the searched query:[0m


[1mTweet id = 541[0m
   - Tweet_text = #Breastfeeding 🤱 is a critical first #FoodSystem that ensures nutrition, health &amp; development of 👶.

It's sustainable, not for profit &amp; it helps reduce:
📉preventable deaths
📉the risk of noncommunicable diseases
📉the risk of overweight/obesity

👉https://t.co/0MM8lIxZAx https://t.co/lphfIbfaYt 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Thu Sep 23 15:52:59 +0000 2021 
   - Tweet_hashtags = ['Breastfeeding', 'FoodSystem'] 
   - Tweet_likes = 139 
   - Tweet_retweets = 52 
   - Tweet_url = https://t.co/lphfIbfaYt 

[1mTweet id = 1386[0m
   - Tweet_text = These therapies - artesunate, imatinib and infliximab – were selected by an independent expert panel for their potential in reducing the risk of death in hospitalized #COVID19 patients. https://t.co/K6tk22NnFf 
   - Tweet_user = World Health Organization (WHO) 
   - Twe

## Word2Vec + Cosine Similarity

#### Implementation

In [18]:
def rankTweets_Word2Vec(query, tweets, X, X_tweets, pos_to_id, ids_pos, model):
    # Get the ids of all words of the query
    words_ids = list()

    for word in query:
        if word in model.wv.key_to_index:
            words_ids.append(model.wv.key_to_index[word])
    
    query_w2v = X[words_ids,:].mean(axis=0)
    
    pos_list = [ids_pos[id_] for id_ in tweets]
    X_q = X_tweets[pos_list,:]
    
    cos_sim = np.dot(X_q, query_w2v) / (LA.norm(query_w2v) * np.linalg.norm(X_q, axis=1)) # Revisar
    
    # pares <cos_sim, id en data>
    tweetScores = [ [cos_sim[i], pos_to_id[pos_list[i]]] for i in range(len(pos_list))]
    tweetScores.sort(reverse=True)
    scores = [x[0] for x in tweetScores]
    resultTweets = [x[1] for x in tweetScores]
    #print document titles instead if document id's
    #resultDocs=[ titleIndex[x] for x in resultDocs ]
    if len(resultTweets) == 0:
        print("No results found, try again")
        query = input()
        tweets = search_tf_idf(query, index)    
    #print ('\n'.join(resultDocs), '\n')
    return resultTweets, scores

In [19]:
def search_Word2Vec(query, index, X, X_tweets, pos_ids, ids_pos, model):
    stemming = PorterStemmer()
    stops = set(stopwords.words("english")).union(set({'amp', 'rt'}))
    
    query = getTerms(query, stemming, stops)
    tweets = set()

    for i, term in enumerate(query):
        try:
            # store in termDocs the ids of the docs that contain "term"                        
            termDocs = [posting[0] for posting in index[term]]
            
            # if this is the first word of the query, save all the tweets with this word
            if i == 0:
                tweets = tweets.union(termDocs)
            # for the next words, only keey those that contain that word and all the past words of the query
            else:
                tweets = tweets.intersection(termDocs)
        except:
            #term is not in index
            pass
    
    tweets = list(tweets)
    
    ranked_tweets, scores = rankTweets_Word2Vec(query, tweets, X, X_tweets, pos_ids, ids_pos, model)   
    return ranked_tweets, scores

In [20]:
def tweets_word2vec(data):
    ## Transform each word to a 2D vector
    sentences = list()
    for tw_id in data:
        sentences.append(data[tw_id]['text'])

    model = Word2Vec(sentences)

    X = model.wv[model.wv.index_to_key]

    pos_ids = dict()
    ids_pos = dict()
    for i, id_ in enumerate(data.keys()):
        pos_ids[i] = id_
        ids_pos[id_] = i
        
    ## Represent each tweet as the mean of each of it's words

    # number of tweets
    n = len(sentences)
    # number of components of each tweet
    m = X.shape[1]
    X_tweets = np.zeros((n, m))

    for i, sentence in enumerate(sentences):
        words_ids = list()

        for word in sentence:
            if word in model.wv.key_to_index:
                words_ids.append(model.wv.key_to_index[word])

        X_tweets[i,:] = X[words_ids,:].mean(axis=0)
        
    return X, X_tweets, pos_ids, ids_pos, model

In [21]:
X, X_tweets, pos_ids, ids_pos, model = tweets_word2vec(data)

#### Testing

In [22]:
print("\033[1mInsert a query:\033[0m\n")
query = input()
ranked_tweets, scores = search_Word2Vec(query, index, X, X_tweets, pos_ids, ids_pos, model)    
top = 5

print("\n======================\n\033[1mTop {} results out of {} for the seached query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top] :
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m

covid

[1mTop 5 results out of 39 for the seached query:[0m


[1mTweet id = 1561[0m
   - Tweet_text = If you have recovered from #COVID19 but are still experiencing certain symptoms you could have post COVID-19 condition or "long COVID". What are these symptoms? How long do they last and are there any treatment options? Dr @diazjv explains in #ScienceIn5 ⬇️ https://t.co/vtDiBhZsJE 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Mon Aug 02 11:38:39 +0000 2021 
   - Tweet_hashtags = ['COVID19', 'ScienceIn5'] 
   - Tweet_likes = 277 
   - Tweet_retweets = 154 
   - Tweet_url = https://t.co/vtDiBhZsJE 

[1mTweet id = 1045[0m
   - Tweet_text = ▶️ If you have #COVID19, is it safe to breastfeed your baby❓

▶️ Is it safe to get vaccinated against COVID-19 if you are breastfeeding❓

▶️ How can you keep your baby safe while breastfeeding 🤱 if you have COVID-19❓

Dr Laurence Grummer-Strawn explains in #ScienceIn5. https://t.co/QMAq9TMY7A 
   - 

#### Testing with our selected queries

#### Query 1

In [23]:
print("\033[1mInsert a query:\033[0m\n")
query = q[0]
ranked_tweets, scores = search_Word2Vec(query, index, X, X_tweets, pos_ids, ids_pos, model)    
top = 20

print("\n======================\n\033[1mTop {} results out of {} for the seached query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top] :
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m


[1mTop 20 results out of 731 for the seached query:[0m


[1mTweet id = 1820[0m
   - Tweet_text = When will the #COVID19 pandemic be over? https://t.co/bebc6ccur1 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Fri Jul 23 21:04:48 +0000 2021 
   - Tweet_hashtags = ['COVID19'] 
   - Tweet_likes = 2878 
   - Tweet_retweets = 1243 
   - Tweet_url = https://t.co/bebc6ccur1 

[1mTweet id = 1828[0m
   - Tweet_text = #COVID19 vaccines 💉 are halal.

Read more 👉https://t.co/y9lNOaCjgx https://t.co/mY2GHx0VYe 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Fri Jul 23 12:16:48 +0000 2021 
   - Tweet_hashtags = ['COVID19'] 
   - Tweet_likes = 581 
   - Tweet_retweets = 307 
   - Tweet_url = https://t.co/mY2GHx0VYe 

[1mTweet id = 1043[0m
   - Tweet_text = Vaccines 💉 can’t stop #COVID19 alone, but by doing it all we can make a difference. https://t.co/746LlKfXQJ 
   - Tweet_user = World Health Organization (WHO) 
   - Twee

#### Query 2

In [24]:
print("\033[1mInsert a query:\033[0m\n")
query = q[1]
ranked_tweets, scores = search_Word2Vec(query, index, X, X_tweets, pos_ids, ids_pos, model)    
top = 20

print("\n======================\n\033[1mTop {} results out of {} for the seached query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top] :
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m


[1mTop 20 results out of 15 for the seached query:[0m


[1mTweet id = 1959[0m
   - Tweet_text = 💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉                 💉💉💉💉
💉💉💉💉                 💉💉💉💉

COVID-19 vaccines     COVID-19 vaccines
in 10 countries             in the rest of the 🌍

#VaccinEquity is 🗝️ to ending the pandemic, together!

#WorldEmojiDay 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Sat Jul 17 16:24:23 +0000 2021 
   - Tweet_hashtags = ['VaccinEquity', 'WorldEmojiDay'] 
   - Tweet_likes = 3486 
   - Tweet_retweets = 1517 
   - Tweet_url =  

[1mTweet id = 4[0m
   - Tweet_text = RT @opsoms: If you are fully vaccinated 💉💉, can you still get COVID-19?

🚨 It does not matter if you are vaccinated or if you are still waiting, yes ... 
   - Tweet_user = OPS/OMS 
   - Tweet_date = Wed Oct 13 05:47:10 +0000 2021 
   - Tweet_hashtags = [] 
   - Tweet_likes = 0 
   - Tweet_retweets = 43 
   - Tweet_url = https://t.co/YxWlXt1QWr 

[1mTweet id = 1849[0

#### Query 3

In [25]:
print("\033[1mInsert a query:\033[0m\n")
query = q[2]
ranked_tweets, scores = search_Word2Vec(query, index, X, X_tweets, pos_ids, ids_pos, model)    
top = 20

print("\n======================\n\033[1mTop {} results out of {} for the seached query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top] :
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m


[1mTop 20 results out of 36 for the seached query:[0m


[1mTweet id = 516[0m
   - Tweet_text = Global leaders attending the 🇺🇸-hosted Global #COVID19 Summit re-affirm their commitment to ending the acute phase of the pandemic, and the goals of the @ACTAccelerator, by agreeing targets to provide equitable access to:
✅tests
✅treatments
✅vaccines

👉https://t.co/T3utXY29Bv https://t.co/njtxVXEbs0 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Fri Sep 24 16:13:12 +0000 2021 
   - Tweet_hashtags = ['COVID19'] 
   - Tweet_likes = 193 
   - Tweet_retweets = 63 
   - Tweet_url = https://t.co/njtxVXEbs0 

[1mTweet id = 1087[0m
   - Tweet_text = "Most recently, the @g20org established a High-Level Independent Panel on Financing the Global Commons for Pandemic Preparedness and Response."-@DrTedros 
https://t.co/5U2cYU5mDm 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Wed Aug 25 15:34:09 +0000 2021 
   - Tweet_hashtags =

#### Query 4

In [26]:
print("\033[1mInsert a query:\033[0m\n")
query = q[3]
ranked_tweets, scores = search_Word2Vec(query, index, X, X_tweets, pos_ids, ids_pos, model)    
top = 20

print("\n======================\n\033[1mTop {} results out of {} for the seached query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top] :
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m


[1mTop 20 results out of 64 for the seached query:[0m


[1mTweet id = 150[0m
   - Tweet_text = For more information on #mentalhealth , please visit 👇https://t.co/n3Iaz7WUVf 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Fri Oct 08 13:40:30 +0000 2021 
   - Tweet_hashtags = ['mentalhealth'] 
   - Tweet_likes = 68 
   - Tweet_retweets = 20 
   - Tweet_url =  

[1mTweet id = 47[0m
   - Tweet_text = RT @WHOSEARO: Spending too much time on your laptop 🧑🏽‍💻 or mobile phone 📲? Please don’t. Take care of your #mentalhealth.

#WorldMentalHea… 
   - Tweet_user = WHO South-East Asia 
   - Tweet_date = Mon Oct 11 04:45:10 +0000 2021 
   - Tweet_hashtags = ['mentalhealth'] 
   - Tweet_likes = 0 
   - Tweet_retweets = 21 
   - Tweet_url = https://t.co/Fiq2amvmg2 

[1mTweet id = 225[0m
   - Tweet_text = Block your calendars for @TwitterSpaces chat on #MentalHealth &amp; young people 👇

🗓 Saturday, 9 October 
🕛 12:00pm CEST 

#AskWHO
#WorldMenta

#### Query 5

In [27]:
print("\033[1mInsert a query:\033[0m\n")
query = q[4]
ranked_tweets, scores = search_Word2Vec(query, index, X, X_tweets, pos_ids, ids_pos, model)    
top = 20

print("\n======================\n\033[1mTop {} results out of {} for the seached query:\033[0m\n".format(top, len(ranked_tweets)))
for d_id in ranked_tweets[:top] :
    print("\n\033[1mTweet id = {}\033[0m".format(d_id))
    print("   - Tweet_text = {} ".format(data[d_id]['org_text']))
    print("   - Tweet_user = {} ".format(data[d_id]['user']))
    print("   - Tweet_date = {} ".format(data[d_id]['date']))
    print("   - Tweet_hashtags = {} ".format(data[d_id]['hashtags']))
    print("   - Tweet_likes = {} ".format(data[d_id]['favorites']))
    print("   - Tweet_retweets = {} ".format(data[d_id]['retweets']))
    print("   - Tweet_url = {} ".format(data[d_id]['url']))

[1mInsert a query:[0m


[1mTop 20 results out of 18 for the seached query:[0m


[1mTweet id = 1100[0m
   - Tweet_text = #Hypertension - or elevated blood pressure - is a serious medical condition that significantly increases the risks of ❤, 🧠, kidney &amp; other diseases.
 
It is a major cause of premature death worldwide.
 
👉 https://t.co/PEXOASL1Tg https://t.co/QIAIsudIxE 
   - Tweet_user = World Health Organization (WHO) 
   - Tweet_date = Wed Aug 25 12:38:53 +0000 2021 
   - Tweet_hashtags = ['Hypertension'] 
   - Tweet_likes = 145 
   - Tweet_retweets = 70 
   - Tweet_url = https://t.co/QIAIsudIxE 

[1mTweet id = 323[0m
   - Tweet_text = Today is #WorldHeartDay 💗
 
#Hypertension - or elevated blood pressure - is a serious medical condition that significantly increases the risks of ❤, 🧠, kidney &amp; other diseases. It is a major cause of premature death worldwide.
 
👉 https://t.co/PEXOASL1Tg https://t.co/L6vriEK1yc 
   - Tweet_user = World Health Organization (WHO) 
   - T