# Project part 3: **RANKING**
Èric Dalmases, Joel Duran, Marc Aguilar  

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import json
import nltk
import re
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
from datetime import datetime
from collections import defaultdict
import collections
from array import array
import math
import numpy as np
import time
from numpy import linalg as la

Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# 1. Importing code from previous parts

In [2]:
data_path = '/content/drive/Shareddrives/IRWA/Project - 1st Deliverable/IRWA_data_2023/'
tweets_path = 'Rus_Ukr_war_data.json'
maps_path = 'Rus_Ukr_war_data_ids.csv'
json_tweets = []
# We load the json tweets line by line
with open(f'{data_path}{tweets_path}', 'r') as file:
    for line in file:
        json_tweets.append(json.loads(line))

# We also load the document ids in a dataframe to perform the posterior mapping
documents = pd.read_csv(f'{data_path}{maps_path}', sep='\t', names=['doc_id', 'id'], index_col='id')

In [3]:
class TextProcessor():
  @staticmethod
  def process(input: str) -> str:
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    output = input.lower() ## Transform in lowercase
    output = re.sub(r"(https?://\S+)|[^\w\s@🇺🇸🇺🇦🇷🇺🇨🇳🇪🇺🇹🇷]|\n|#", ' ', output) # Delete URLs, all symbols that are not mentions, hashtags or important flags (US, UA, RU)
    output = output.strip() # Delete spaces in front / at the end of the text ()
    output = re.split(r'\s+', output) # Split the text by spaces (same as output.split but considers a set of blank spaces as one)
    output = [i for i in output if i not in stop_words] # and .isalpha()
    output = [stemmer.stem(i) for i in output]
    return output

In [4]:
class User:
  def __init__(self, id, username, private, followers, followed, created_at, favourites, verified, num_tweets):
    self.id = id
    self.username = username
    self.private = private
    self.followers = followers
    self.followed = followed
    self.created_at = created_at
    self.favourites = favourites
    self.verified = verified
    self.num_tweets = num_tweets

  @classmethod
  def fromJson(self, json:str):
    return User(json['id'], json['screen_name'], json['protected'],
                json['followers_count'], json['friends_count'], json['created_at'],
                json['favourites_count'], json['verified'], json['statuses_count'])


# This class wil represent a single instance of a tweet
class Tweet:
  def __init__(self, id, user_id, username, doc_id, text, tokenized_text, date, hashtags, likes, retweets, url):
    self.id = id
    self.user_id = user_id
    self.username = username
    self.doc_id = doc_id
    self.text = text
    self.tokenized_text = tokenized_text
    self.date = date
    self.hashtags = hashtags
    self.likes = likes
    self.retweets = retweets
    self.url = url
    self.num_replies = 0
    self.user = None

  # This method is the one in charge of parsing the json data
  @classmethod
  def fromJson(self, json: str):
    doc_id = documents.loc[int(json['id'])]['doc_id']
    t = Tweet(
        id = json['id'],
        user_id = json['user']['id'],
        username = json['user']['screen_name'],
        doc_id = doc_id,
        text = json['full_text'],
        tokenized_text = TextProcessor.process((json['full_text'])),
        date = datetime.strptime(json['created_at'], "%a %b %d %H:%M:%S %z %Y"),
        hashtags = [h['text'] for h in json['entities']['hashtags']],
        likes = json['favorite_count'],
        retweets = json['retweet_count'],
        url = f"https://www.twitter.com/{json['user']['screen_name']}/status/{json['id']}",
    )
    t.user = User.fromJson(json['user'])
    return t

  def getScore(self):
    #likes, retweets| private, followers, followed, verified
    #1. retweets 2. likes 3. ratio followers/followed 4. verified
    verifiedCount = 0
    if self.user.verified:
      verifiedCount = 1
    #print(f"account: {self.doc_id}, SCORE: {self.retweets * 2 + self.likes + self.num_replies * 1.5 + ((np.log(self.user.followers+1)/np.log(max(10,self.user.followed)))) + verifiedCount + (1/self.user.num_tweets)},  retweets: {self.retweets * 2}, likes: {self.likes}, replies: {self.num_replies * 1.5}, ratio: {(np.log(self.user.followers+1)/np.log(max(10,self.user.followed)))}, verified: {verifiedCount}, num_tweets:{(1/self.user.num_tweets)}")
    return self.retweets * 2 + self.likes + self.num_replies * 1.5 + ((np.log(self.user.followers+1)/np.log(max(10,self.user.followed)))) + verifiedCount + (1/self.user.num_tweets)

In [5]:
tweets = []
for t in json_tweets:
  tweets.append(Tweet.fromJson(t))

In [6]:
# We load the all the tweets from the loaded jsons
tweets = []
replies_dict = {}
users = {}
for t in json_tweets:
  tweet = Tweet.fromJson(t)
  tweets.append(tweet)
  if tweet.user_id not in users.keys():
    users[tweet.user_id] = User.fromJson(t['user'])
  replyTweetId = t["in_reply_to_status_id"]
  if replyTweetId != None:
    if replyTweetId in replies_dict.keys():
      replies_dict[replyTweetId] += 1
    else:
      replies_dict[replyTweetId] = 1

for t in tweets:
  t.user = users[t.user_id]

def getTweetFromId(tweets, id):
  for t in tweets:
    if t.id == id:
      return t
  return None

count = 0
for id in replies_dict.keys():
  tweet = getTweetFromId(tweets, id)
  if tweet != None:
    count += 1
    tweet.num_replies = replies_dict[id]

print(f"There are {count} tweets that are replies to tweets of our database out of {len(replies_dict)}")

There are 82 tweets that are replies to tweets of our database out of 322


In [7]:
def create_index_tfidf(tweets, num_documents):
    """
    Implement the inverted index and compute tf, df and idf

    Argument:
    tweets -- collection of Tweets
    num_documents -- total number of tweets

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """

    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    title_index = defaultdict(str)
    idf = defaultdict(float)

    for tweet in tweets:
        title = tweet.url
        page_id = tweet.doc_id
        terms = tweet.tokenized_text
        title_index[page_id] = title

        ## ===============================================================
        ## create the index for the **current page** and store it in current_page_index
        ## current_page_index ==> { ‘term1’: [current_doc, [list of positions]], ...,‘term_n’: [current_doc, [list of positions]]}

        ## Example: if the curr_doc has id 1 and his text is
        ##"web retrieval information retrieval":

        ## current_page_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in document 1 in positions 0,
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================

        current_page_index = {}

        for position, term in enumerate(terms):  ## terms contains page_title + page_text
            try:
                # if the term is already in the dict append the position to the corresponding list
                current_page_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_page_index[term]=[page_id, array('I',[position])] #'I' indicates unsigned int (int in Python)

        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_page_index.items():
            # posting will contain the list of positions for current term in current document.
            # posting ==> [current_doc, [list of positions]]
            # you can use it to infer the frequency of current term.
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_page_index.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm,4)) ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] += 1 # increment DF for current term

        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        # Compute IDF following the formula (3) above. HINT: use np.log
        for term in df:
            idf[term] = np.round(np.log(float(num_documents/df[term])), 4)

    return index, tf, df, idf, title_index

In [8]:
start_time = time.time()
num_documents = len(tweets)
index, tf, df, idf, title_index = create_index_tfidf(tweets, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 236.8 seconds


In [9]:
def rank_documents(terms, docs, index, idf, tf, title_index):
    """
    Perform the ranking of the results of a search based on the tf-idf weights

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title

    Returns:
    Print the list of ranked documents
    """

    # I'm interested only on the element of the docVector corresponding to the query terms
    # The remaining elements would became 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)
    #print('len docs:', len(docs))

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.
    # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})
    # HINT: use when computing tf for query_vector

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex]= query_terms_count[term]/ query_norm * idf[term]

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):
            # Example of [doc_index, (doc, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in doc 26 in positions 1,4, .....
            # term is in doc 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the doc 26
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]  # TODO: check if multiply for idf

    # Calculate the score of each doc
    # compute the cosine similarity between queyVector and each docVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    # see np.dot

    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    scores = [x[0] for x in doc_scores]
    #print document titles instead if document id's
    #result_docs=[ title_index[x] for x in result_docs ]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)
    #print ('\n'.join(result_docs), '\n')
    return doc_scores

In [10]:
def search_tf_idf(query, index):
    """
    output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = TextProcessor.process(query) # We use our custom processor for the text to match the processing performed to the tweets
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]

            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs = rank_documents(query, docs, index, idf, tf, title_index)
    return ranked_docs

In [11]:
def getTweetFromDocId(docId):
  for tweet in tweets:
    if tweet.doc_id == docId:
      return tweet
  return None

In [12]:
class Query:
  def __init__(self, query, query_id):
    self.query = query
    self.query_id = "Q"+str(query_id+1)
    self.tokenized_query = TextProcessor.process(query)
    self.relevant_tweets = set()
    self.non_relevant_tweets = set()
    self.tweets_relevancy = set()

  def addRelevantTweet(self, doc_id):
    if len(self.relevant_tweets) < 10:
      self.relevant_tweets.add(doc_id)

  def addNonRelevantTweet(self, doc_id):
    if len(self.non_relevant_tweets) < 10:
      self.non_relevant_tweets.add(doc_id)

In [13]:
class TweetSimilarityPair:
  def __init__(self, tweet, similarity):
    self.tweet = tweet
    self.similarity = similarity


  @staticmethod
  def fromScoreList(scores, tweet):
    for pair in scores:
      if pair[1] == tweet.doc_id:
        return TweetSimilarityPair(tweet, pair[0])
    return TweetSimilarityPair(tweet, 0)

# 2. Our own score vs tf-idf

In [29]:
queries_text = ["conflict in Ukraine", "gas in Ukraine", "president of russia", "support to Ukraine", "power of russia"]
our_queries = []
for i, text in enumerate(queries_text):
  our_queries.append(Query(text, i))

output_tfidf = {

}

# Ranked by TF-IDF
for q in our_queries:
  print(f"Query : {q.query}")
  doc_scores = search_tf_idf(q.query, index)
  ranked_docs = [x[1] for x in doc_scores]

  query_tokens = set(q.tokenized_query)
  filtered_docs = []
  i = 0
  for pair in doc_scores:
    rank = pair[0]
    doc_id = pair[1]
    tweet = getTweetFromDocId(doc_id)
    if len(query_tokens.intersection(set(tweet.tokenized_text))) >= len(query_tokens): # We check the AND condition between the query and the tweet
      filtered_docs.append(tweet)
  top = 20
  output_tfidf[q] = filtered_docs[:top]
  print("Top {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
  for tweet in filtered_docs[:top]:
      print(f"- doc_id: {tweet.doc_id} | - tweet_id: {tweet.id} | - tweet: {tweet.text} | - hashtags: {tweet.hashtags} | - likes: {tweet.likes} | - retweets: {tweet.retweets} | - url: {tweet.url}\n")
  print("========")

Query : conflict in Ukraine
Top 20 results out of 2659 for the searched query:

- doc_id: doc_3754 | - tweet_id: 1575182921664671746 | - tweet: Ukraine 38 sept; The conflict spills into Russia #UkraineRussiaWar https://t.co/nu8pVAbdTz via @YouTube | - hashtags: ['UkraineRussiaWar'] | - likes: 0 | - retweets: 0 | - url: https://www.twitter.com/KacoBlokland/status/1575182921664671746

- doc_id: doc_2444 | - tweet_id: 1575476963833434113 | - tweet: It is always one adversary of the #war (or #revolution) who's dancing during the conflict.
#UkraineRussiaWar #Ukraine https://t.co/GH6Q8DRZPg | - hashtags: ['war', 'revolution', 'UkraineRussiaWar', 'Ukraine'] | - likes: 0 | - retweets: 0 | - url: https://www.twitter.com/MatiStein/status/1575476963833434113

- doc_id: doc_698 | - tweet_id: 1575825011776290817 | - tweet: #UkraineRussiaWar #UNGA #UNSecurityCouncil 
"The Game and the Stakes of the Conflict in Ukraine". Read our Policy Brief on : 
https://t.co/HeDKTPmUva https://t.co/c46DvwI6AG | - 

In [30]:
def rank_documents_custom(terms, docs, index, custom_scores):
    """
    Perform the ranking of the results of a search based on the custom scores

    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    custom_scores -- custom scores for each document

    Returns:
    Print the list of ranked documents
    """

    # I'm interested only in the element of the docVector corresponding to the query terms
    # The remaining elements would become 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms))  # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  # termIndex is the index of the term in the query
        if term not in index:
            continue

        # Compute the custom score for the query term
        query_vector[termIndex] = query_terms_count[term] / query_norm

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):
            # Check if the doc is in the list of documents to rank
            if doc in docs:
                doc_vectors[doc][termIndex] = custom_scores[doc] # Use the custom score directly

    # Calculate the score of each doc
    # compute the cosine similarity between queryVector and each docVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    # see np.dot
    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    scores = [x[0] for x in doc_scores]

    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)
    #print ('\n'.join(result_docs), '\n')
    return doc_scores


In [31]:
def search_custom(query, index, custom_scores):
    """
    output is the list of documents that contain any of the query terms.
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = TextProcessor.process(query) # We use our custom processor for the text to match the processing performed to the tweets
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"
            term_docs=[posting[0] for posting in index[term]]

            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs = rank_documents_custom(query, docs, index, custom_scores)
    return ranked_docs

In [32]:
custom_scores = {}
for t in tweets:
  custom_scores[t.doc_id] = t.getScore()
doc_ids = [t.doc_id for t in tweets]

output_custom = {}
for q in our_queries:
  print(f"Query : {q.query}")
  doc_scores = search_custom(q.query, index, custom_scores)
  ranked_docs = [x[1] for x in doc_scores]

  query_tokens = set(q.tokenized_query)
  filtered_docs = []
  for pair in doc_scores:
    rank = pair[0]
    doc_id = pair[1]
    tweet = getTweetFromDocId(doc_id)
    if len(query_tokens.intersection(set(tweet.tokenized_text))) >= len(query_tokens): # We check the AND condition between the query and the tweet
      filtered_docs.append(tweet)
  top = 20
  output_custom[q] = filtered_docs[:top]
  print("Top {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
  for tweet in filtered_docs[:top]:
      print(f"- doc_id: {tweet.doc_id} | - tweet_id: {tweet.id} | - tweet: {tweet.text} | - hashtags: {tweet.hashtags} | - likes: {tweet.likes} | - retweets: {tweet.retweets} | - url: {tweet.url}\n")
  print("========")


Query : conflict in Ukraine
Top 20 results out of 2659 for the searched query:

- doc_id: doc_1709 | - tweet_id: 1575642675826417664 | - tweet: Rising conflict between Armenia and Azerbaijan is alarming these can lead the world into the same position that Russia and Ukraine are in now. the world needs peace and political solutions to the problems
#Armeniaazerbaijan 
#UkraineRussiaWar | - hashtags: ['Armeniaazerbaijan', 'UkraineRussiaWar'] | - likes: 25 | - retweets: 13 | - url: https://www.twitter.com/ZainSayed78/status/1575642675826417664

- doc_id: doc_3645 | - tweet_id: 1575194020690927616 | - tweet: It should be noted that on September 24, #Russian Foreign Minister Sergei Lavrov called the United States a party to the conflict in #Ukraine.

#RussiaUkraineConflict #UkraineRussiaWar | - hashtags: ['Russian', 'Ukraine', 'RussiaUkraineConflict', 'UkraineRussiaWar'] | - likes: 9 | - retweets: 1 | - url: https://www.twitter.com/PatilSushmit/status/1575194020690927616

- doc_id: doc_2852 

In [33]:
for q in our_queries:
  print(f"Matching : {list(map(lambda x: x.doc_id, list(set(output_tfidf[q]).intersection(set(output_custom[q])))))}")
  print(f"Match at query {q.query}: {len(set(output_tfidf[q]).intersection(set(output_custom[q])))}")

Matching : ['doc_2852', 'doc_1105', 'doc_2798', 'doc_1505', 'doc_324', 'doc_689', 'doc_1709', 'doc_2469', 'doc_1077', 'doc_2766', 'doc_1764', 'doc_3645', 'doc_3756', 'doc_2251', 'doc_2132']
Match at query conflict in Ukraine: 15
Matching : ['doc_2862', 'doc_2681', 'doc_3926', 'doc_3923', 'doc_3441', 'doc_2484', 'doc_3080', 'doc_2828', 'doc_3692', 'doc_3913', 'doc_220']
Match at query gas in Ukraine: 11
Matching : ['doc_403', 'doc_87', 'doc_2393', 'doc_585', 'doc_417', 'doc_470', 'doc_418', 'doc_453', 'doc_632', 'doc_3996', 'doc_408']
Match at query president of russia: 11
Matching : ['doc_1442', 'doc_324', 'doc_3094', 'doc_1416', 'doc_1298', 'doc_2817', 'doc_627']
Match at query support to Ukraine: 7
Matching : ['doc_2548', 'doc_1260', 'doc_2478', 'doc_1254', 'doc_3760', 'doc_457', 'doc_2554', 'doc_1253', 'doc_3090', 'doc_633', 'doc_1446', 'doc_1117', 'doc_1251', 'doc_741', 'doc_2316', 'doc_3572', 'doc_1741', 'doc_2611']
Match at query power of russia: 18


# 3. Word2Vec + Cosine Similarity

In [34]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
token_tweets = []
for t in tweets:
  token_tweets.append(t.tokenized_text)

# We train the Word2Vec model over all our tweets
model = Word2Vec(token_tweets, min_count = 1, vector_size = 100, window = 5)

def average_vector(tokens, model):
    # Filter out words that are not in the vocabulary
    tokens = [word for word in tokens if word in model.wv.key_to_index]

    if len(tokens) == 0:
        return np.zeros(model.wv.vector_size)

    # Calculate the average vector for the given tokens
    vector_sum = np.zeros(model.wv.vector_size)
    for word in tokens:
        vector_sum += model.wv[word]

    return vector_sum / len(tokens)



In [36]:
similarity_map = {}
'''
  "Query1": [TweetSimilarityPair, ..., ],
  ...
'''

for query in our_queries:
  similarity_map[query] = []
  query_tokens = set(query.tokenized_query)
  query_embedded = average_vector(query.tokenized_query, model)
  for t in tweets:
    if len(query_tokens.intersection(set(t.tokenized_text))) < len(query_tokens): # All terms of the query must appear in the tweet (AND)
      continue
    tweet_embedded = average_vector(t.tokenized_text, model) # We compute the embedding of the tweet
    # We compute the similarity beteween the query and the tweet
    pair = TweetSimilarityPair(t, cosine_similarity(np.array(query_embedded).reshape(1, -1), np.array(tweet_embedded).reshape(1, -1))[0][0])
    similarity_map[query].append(pair)

In [37]:
for q in similarity_map.keys():
  assert len(similarity_map[q]) >= 20, f"Error with query {q.query}"
  similarity_map[q] = sorted(similarity_map[q], key=lambda x: x.similarity, reverse=True) # We sort the tweets by similarity
  top = 20

  ranked_docs = [pair.tweet for pair in similarity_map[q][:20]]
  print("Top {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
  for tweet in ranked_docs[:top]:
      print(f"- doc_id: {tweet.doc_id} | - tweet_id: {tweet.id} | - tweet: {tweet.text} | - hashtags: {tweet.hashtags} | - likes: {tweet.likes} | - retweets: {tweet.retweets} | - url: {tweet.url}\n")
  print("========")


Top 20 results out of 20 for the searched query:

- doc_id: doc_2251 | - tweet_id: 1575525609714831373 | - tweet: Washington actually became a party to the conflict in Ukraine - Russian Foreign Ministry Lavrov.

#UkraineRussiaWar #Ukraine #UkraineUnderAttack #UkraineWar #UkraineWarCrimes #Ukrainian #UkraineRussiaConflict #Ukraina #StopPutinNOW #StopRussia https://t.co/OP8Gs4xuP6 | - hashtags: ['UkraineRussiaWar', 'Ukraine', 'UkraineUnderAttack', 'UkraineWar', 'UkraineWarCrimes', 'Ukrainian', 'UkraineRussiaConflict', 'Ukraina', 'StopPutinNOW', 'StopRussia'] | - likes: 0 | - retweets: 0 | - url: https://www.twitter.com/W_W_3_2022/status/1575525609714831373

- doc_id: doc_2469 | - tweet_id: 1575473073507155969 | - tweet: #UkraineWar #Ukraine #Russia #ukrainerussiawar #Putin #SanktionengegendieUSA #MAGA #俄罗斯 #乌克兰 #中國

‼️🇺🇸🇺🇦🇷🇺Washington has actually become a party to the conflict in Ukraine | - hashtags: ['UkraineWar', 'Ukraine', 'Russia', 'ukrainerussiawar', 'Putin', 'SanktionengegendieUS