# **Part 3: Ranking**

## 0. Preparation, Text Processing and Indexing

0.1 *Import libraries*

In [None]:
!pip install --upgrade nltk
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
import collections
from collections import defaultdict
from array import array
import math
import numpy as np
from numpy import linalg as la
import string
import json
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import spacy
from textblob import TextBlob
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0.2 *Connect to Google Drive*

In [None]:
from google.colab import drive
drive.mount('/content/drive')
docs_path = '/content/drive/Shareddrives/IRWA Labs/PROJECT/IRWA - PROJECT DATA/farmers-protest-tweets.json' # modify path where are the documents

Mounted at /content/drive


0.3 *Read and preprocess tweets lines*

In [None]:
with open(docs_path) as fp:
    lines = fp.readlines()
lines = [l.strip().replace(' +', ' ') for l in lines]

0.4 *Load and map tweets with IDs*

In [None]:
excel_data = pd.read_csv('/content/drive/Shareddrives/IRWA Labs/PROJECT/IRWA - PROJECT DATA/tweet_document_ids_map.csv') # change path if necessary
tweet_id_to_doc_id = dict(zip(excel_data['id'], excel_data['docId']))

0.5 *Processing tweets (clean and normalize the content)*

In [None]:
def build_terms(line):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    try:
        tweet_data = json.loads(line)
    except json.JSONDecodeError:
        print(f"Error: {line}")
        return None

    if tweet_data.get("lang") != "en": # only filter English tweets
        return None

    #Relevant information
    tweet_content = tweet_data.get("content", "")
    tweet_date = tweet_data.get("date", "")
    tweet_likes = tweet_data.get("likeCount", 0)
    tweet_retweets = tweet_data.get("retweetCount", 0)
    tweet_url = tweet_data.get("url", "")
    id = tweet_data.get("id", "")
    doc_id = tweet_id_to_doc_id.get(int(id))

    #Lowercase
    line = tweet_content.lower()

    #Tokenize
    line = line.split()

    #Punctuation signs
    line = [x.translate(str.maketrans('', '', string.punctuation)) for x in line]

    #Eliminate #
    hashtags = [word for word in tweet_content.split() if word.startswith('#')]

    #Stopwords
    line = [x for x in line if x not in stop_words and x]

    #Steaming
    line = [stemmer.stem(x) for x in line]

    #Return relevant information
    return {
        "tweet": line,
        "date": tweet_date,
        "hashtags": hashtags,
        "likes": tweet_likes,
        "retweets": tweet_retweets,
        "url": tweet_url,
        "id": id,
        "doc_id": doc_id
    }

0.6 *Apply text processing to all tweets and count the words in each tweet*

In [None]:
processed_tweets = []
total_words = []
for line in lines:
    processed_tweet = build_terms(line)
    if processed_tweet:
        processed_tweets.append(processed_tweet)
        total_words.append(len(processed_tweet['tweet']))

In [None]:
n_tweets = len(processed_tweets)
print(processed_tweets[0])
print(f"Total number of processed tweets: {n_tweets}")

{'tweet': ['world', 'progress', 'indian', 'polic', 'govt', 'still', 'tri', 'take', 'india', 'back', 'horrif', 'past', 'tyranni', 'narendramodi', 'delhipolic', 'shame', 'modidontsellfarm', 'farmersprotest', 'freenodeepkaur', 'httpstcoes3kn0iqaf'], 'date': '2021-02-24T09:23:35+00:00', 'hashtags': ['#ModiDontSellFarmers', '#FarmersProtest', '#FreeNodeepKaur'], 'likes': 0, 'retweets': 0, 'url': 'https://twitter.com/ArjunSinghPanam/status/1364506249291784198', 'id': 1364506249291784198, 'doc_id': 'doc_0'}
Total number of processed tweets: 48429


0.7 *Create inverted index with TF-IDF (code fomr previous part of the project)*

In [None]:
# Build the inverted index and calculate TF-IDF
inverted_index = defaultdict(set)
tfidf_scores = defaultdict(lambda: defaultdict(float))
tweet_lengths = defaultdict(int)
total_tweets = len(processed_tweets)

# Construct the inverted index and calculate TF-IDF
for tweet in processed_tweets:
    tweet_id = tweet["doc_id"]
    term_count = defaultdict(int)

    # Count term frequencies in the tweet
    for term in tweet["tweet"]:
        term_count[term] += 1
        inverted_index[term].add(tweet_id)

    # Calculate TF-IDF scores for each term in the tweet
    for term, count in term_count.items():
        # Term Frequency (TF)
        tf = count / len(tweet["tweet"])

        # Document Frequency (DF) for the term across tweets
        tweet_frequency = len(inverted_index[term])

        # Inverse Document Frequency (IDF)
        idf = math.log(total_tweets / (1 + tweet_frequency))

        # Calculate and store TF-IDF score for each term in the tweet
        tfidf_scores[tweet_id][term] = tf * idf

        # Update tweet length for normalization (sum of squared tf-idf scores)
        tweet_lengths[tweet_id] += (tf * idf) ** 2

# Finalize tweet lengths by taking the square root of summed squares (for normalization)
for tweet_id in tweet_lengths:
    tweet_lengths[tweet_id] = math.sqrt(tweet_lengths[tweet_id])

# Convert sets to lists and sort them for consistency
for term in inverted_index:
    inverted_index[term] = sorted(list(inverted_index[term]))

0.8 *Our 5 Queries*

In [None]:
# Define the five queries
queries = [
    ["indian", "protest"],
    ["support", "farmersprotest"],
    ["people", "right"],
    ["free", "speech"],
    ["climat", "activist"]
]

## 1. Different ways of ranking

1.1 *TF-IDF + Cosine Similarity (code from previous part)*

In [None]:
# Function to rank tweets based on TF-IDF and cosine similarity
def rank_tweets_tfidf(terms, tweet_ids, index, tfidf_scores, tweet_lengths):
    tweet_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # Calculate the norm for the query
    query_terms_count = collections.Counter(terms)
    query_norm = np.linalg.norm(list(query_terms_count.values()))

    for term_idx, term in enumerate(terms):
        if term in index:
            query_vector[term_idx] = (query_terms_count[term] / query_norm) * (math.log(total_tweets / (1 + len(index[term]))))

            # Generate tweet vectors for each matching tweet
            for tweet_id in index[term]:
                if tweet_id in tweet_ids:
                    tweet_vectors[tweet_id][term_idx] = tfidf_scores[tweet_id][term]

    # Calculate scores using cosine similarity
    tweet_scores = [[np.dot(tweet_vec, query_vector) / (tweet_lengths[tweet_id] * query_norm), tweet_id]
                    for tweet_id, tweet_vec in tweet_vectors.items()]

    tweet_scores.sort(reverse=True, key=lambda x: x[0])
    return [x[1] for x in tweet_scores if x[0] > 0]

# Perform search and ranking for each query based on TF-IDF and Cosine Similarity
for query_terms in queries:
    matched_tweets = set()
    for term in query_terms:
        if term in inverted_index:
            matched_tweets.update(inverted_index[term])
    ranked_tweet_ids = rank_tweets_tfidf(query_terms, matched_tweets, inverted_index, tfidf_scores, tweet_lengths)

    # Display top results with scores
    top_n = 5
    print(f"\nTop {top_n} ranked tweets for query {' '.join(query_terms)}:")
    for tweet_id in ranked_tweet_ids[:top_n]:
        tweet_data = next((tweet for tweet in processed_tweets if tweet["doc_id"] == tweet_id), None)
        if tweet_data:
            tweet_text = ' '.join(tweet_data["tweet"])
            print(f"Tweet ID: {tweet_id} - Tweet Text: {tweet_text}")


Top 5 ranked tweets for query indian protest:
Tweet ID: doc_19653 - Tweet Text: punyaab farmer indian n everi person protest indian first think tweet protest farmersprotest
Tweet ID: doc_9676 - Tweet Text: indian farmer protest farmersprotest httpstco9mzfbgqaxl
Tweet ID: doc_2022 - Tweet Text: support indian farmer farmersprotest
Tweet ID: doc_34729 - Tweet Text: indian farmer protest matter british indian farmersprotest httpstcokycwndvyem
Tweet ID: doc_884 - Tweet Text: farmersprotest farmlaw farmlaws2020 farmer indian farmer protest httpstcokpk7turg6

Top 5 ranked tweets for query support farmersprotest:
Tweet ID: doc_43898 - Tweet Text: support farmersprotest
Tweet ID: doc_39708 - Tweet Text: support farmersprotest
Tweet ID: doc_36108 - Tweet Text: support farmersprotest
Tweet ID: doc_30903 - Tweet Text: support farmersprotest
Tweet ID: doc_26647 - Tweet Text: support farmersprotest

Top 5 ranked tweets for query people right:
Tweet ID: doc_5988 - Tweet Text: right farmersprotest
T

1.2 *Popularity Score + Cosine Similarity*

In [None]:
# First of all, we create a function to compute popularity based on likes and retweets
def calculate_popularity_score(tweet_data, max_likes, max_retweets):
    like_score = tweet_data["likes"] / max_likes if max_likes > 0 else 0
    retweet_score = tweet_data["retweets"] / max_retweets if max_retweets > 0 else 0
    popularity_score = (0.5 * like_score) + (0.5 * retweet_score)
    return popularity_score

# Then, we do a function using a new score that combines popularity metrics with cosine similarity
def rank_tweets_new_score(terms, tweet_ids, index, tfidf_scores, tweet_lengths, processed_tweets):
    # Maximum values for likes and retweets
    max_likes = max(tweet["likes"] for tweet in processed_tweets if tweet["doc_id"] in tweet_ids)
    max_retweets = max(tweet["retweets"] for tweet in processed_tweets if tweet["doc_id"] in tweet_ids)

    # TF-IDF-based Cosine Similarity
    tweet_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)
    query_terms_count = collections.Counter(terms)
    query_norm = np.linalg.norm(list(query_terms_count.values()))
    for term_idx, term in enumerate(terms):
        if term in index:
            query_vector[term_idx] = (query_terms_count[term] / query_norm) * (math.log(total_tweets / (1 + len(index[term]))))
            for tweet_id in index[term]:
                if tweet_id in tweet_ids:
                    tweet_vectors[tweet_id][term_idx] = tfidf_scores[tweet_id][term]

    # Calculate the combined score for each tweet by weighting cosine similarity with popularity
    tweet_scores = []
    for tweet_id, tweet_vec in tweet_vectors.items():
        cosine_similarity = np.dot(tweet_vec, query_vector) / (tweet_lengths[tweet_id] * query_norm)
        if cosine_similarity > 0:
            tweet_data = next((tweet for tweet in processed_tweets if tweet["doc_id"] == tweet_id), None)
            if tweet_data:
                popularity_score = calculate_popularity_score(tweet_data, max_likes, max_retweets)
                combined_score = cosine_similarity * popularity_score
                tweet_scores.append((combined_score, tweet_id))

    # Sort tweets by combined score in descending order
    tweet_scores.sort(reverse=True, key=lambda x: x[0])
    return [tweet_id for _, tweet_id in tweet_scores]


# Perform search and ranking for each query based on popularity metrics and cosine similarity
for query_terms in queries:
    matched_tweets = set()
    for term in query_terms:
        if term in inverted_index:
            matched_tweets.update(inverted_index[term])
    ranked_tweet_ids = rank_tweets_new_score(query_terms, matched_tweets, inverted_index, tfidf_scores, tweet_lengths, processed_tweets)

    # Display top results
    top_n = 5
    print(f"\nTop {top_n} ranked tweets for query {' '.join(query_terms)}:")
    for tweet_id in ranked_tweet_ids[:top_n]:
        tweet_data = next((tweet for tweet in processed_tweets if tweet["doc_id"] == tweet_id), None)
        if tweet_data:
            tweet_text = ' '.join(tweet_data["tweet"])
            print(f"Tweet ID: {tweet_id} - Tweet Text: {tweet_text}")


Top 5 ranked tweets for query indian protest:
Tweet ID: doc_23286 - Tweet Text: arrest climatechang activist disharavi huge mistak indian polic taken farmersprotest anoth level amp audienc human right activist amp protest must respect freedisharavi
Tweet ID: doc_13630 - Tweet Text: keep support peac farmer protest dpstoplntimidatingfarm farmersprotest kisanandolan supportfarm australiasupportingfarm digitalkisan digitalkisanmorcha httpstcod5darstzw9
Tweet ID: doc_34869 - Tweet Text: indian non indian friend disha ravi 21 year old activist arrest put togeth toolkit link donat farmersprotest help us show indian govern toler tyranni tweet hashtag releasedisharavi
Tweet ID: doc_14687 - Tweet Text: indian farmersprotest largest protest world support farmer ✊✊ mspकिसानकाहक dpstopintimidatingfarm httpstcona5kgotlet
Tweet ID: doc_44034 - Tweet Text: guy pleas keep use follow hashtag farmer protest relat post farmersprotest standwithfarm

Top 5 ranked tweets for query support farmersprotest:
T

1.3 *BM25 Ranking*

In [None]:
# BM25 ranking function
def rank_tweets_bm25(terms, tweet_ids, index, tweet_lengths, avg_doc_len, k1=1.5, b=0.75):
    tweet_scores = defaultdict(float)

    # Calculate IDF for each term
    idf = {}
    total_tweets = len(tweet_ids)

    for term in terms:
        doc_freq = len(index.get(term, []))
        idf[term] = math.log((total_tweets - doc_freq + 0.5) / (doc_freq + 0.5) + 1.0)

    # Rank tweets using BM25 formula
    for tweet_id in tweet_ids:
        tweet_len = tweet_lengths.get(tweet_id, 0)
        score = 0.0

        for term in terms:
            if tweet_id in index.get(term, []):
                # Calculate term frequency in the tweet
                term_freq = sum(1 for t in index[term] if t == tweet_id)

                # BM25 formula component for each term
                tf_component = (term_freq * (k1 + 1)) / (term_freq + k1 * (1 - b + b * (tweet_len / avg_doc_len)))
                score += idf[term] * tf_component

        # Store score for tweet
        if score > 0:
            tweet_scores[tweet_id] = score

    # Sort tweets by score in descending order
    ranked_tweet_ids = sorted(tweet_scores.items(), key=lambda x: x[1], reverse=True)

    return [tweet_id for tweet_id, score in ranked_tweet_ids]

# Perform search and ranking for each query based on BM25 and cosine similarity
for query_terms in queries:
    matched_tweets = set()
    for term in query_terms:
        if term in inverted_index:
            matched_tweets.update(inverted_index[term])
    avg_doc_len = sum(tweet_lengths.values()) / len(tweet_lengths)
    ranked_tweet_ids = rank_tweets_bm25(query_terms, matched_tweets, inverted_index, tweet_lengths, avg_doc_len)

    # Display top results
    top_n = 5
    print(f"\nTop {top_n} ranked tweets for query {' '.join(query_terms)}:")
    for tweet_id in ranked_tweet_ids[:top_n]:
        tweet_data = next((tweet for tweet in processed_tweets if tweet["doc_id"] == tweet_id), None)
        if tweet_data:
            tweet_text = ' '.join(tweet_data["tweet"])
            print(f"Tweet ID: {tweet_id} - Tweet Text: {tweet_text}")


Top 5 ranked tweets for query indian protest:
Tweet ID: doc_44995 - Tweet Text: pmoindia narendramodi indian farmer r peac protest 3 farm bill amp r blind struggl spread fals narr farmer amp khalsa aid farmer present amp futur india mahapanchayatrevolut farmerscallpmfordeb istandwithkhalsaaid farmersprotest
Tweet ID: doc_47236 - Tweet Text: gretathunberg greta problem peopl modi bhagat amp godimedia say farmer protest say khalsatani terrorist farmer indian amp peac protest thank god modibhagat fail plan violenc redfort gretathunberg farmersprotest jaijawanjaikisan
Tweet ID: doc_45011 - Tweet Text: rakeshtikaitbku pmoindia narendramodi ahindinew officialbku ptinew ndtvindia kisanmorchaekta bbchindi bbcworld support indian farmer peac protest sinc 145 day delhi border cold wintersrul govt suppress voic farmer firm amp open debat pm mahapanchayatrevolut farmerscallpmfordeb farmersabovereligioush farmersprotest httpstco86tpreoymt
Tweet ID: doc_46909 - Tweet Text: thedailyshow thank alot s

## 2. Top-20 Documents

In [None]:
# Return a top-20 list of documents for each of the 5 queries, using word2vec + cosine similarity.
from sklearn.metrics.pairwise import cosine_similarity
tweets = [tweet["tweet"] for tweet in processed_tweets]
import numpy as np

# Train the Word2Vec model
model = Word2Vec(sentences=tweets, vector_size=100, window=5, min_count=1, workers=4)

# Function to get the average vector of a tweet
def tweet_to_vec(tweet, model):
    word_vectors = [model.wv[word] for word in tweet if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Generate tweet representations
tweet_vectors = {tweet_id: tweet_to_vec(tweet, model) for tweet_id, tweet in enumerate(tweets)}

# Generate query representations
query_vectors = [tweet_to_vec(query, model) for query in queries]

# Calculate similarity and get Top-20 for each query
top_20_tweets = {}

for i, query_vec in enumerate(query_vectors):
    similarities = [(tweet_id, cosine_similarity([query_vec], [tweet_vec])[0][0]) for tweet_id, tweet_vec in tweet_vectors.items()]
    # Sort tweets by descending similarity and select the Top-20
    top_20_tweets[i] = sorted(similarities, key=lambda x: x[1], reverse=True)[:20]

    # Display results
    print(f"\nTop 20 tweets for query {i+1}:")
    for tweet_id, similarity in top_20_tweets[i]:
        tweet_text = ' '.join(tweets[tweet_id])  # Rebuild the original text of the tweet
        print(f"Tweet ID: {tweet_id} - Similarity: {similarity:.4f} - Text: {tweet_text}")


Top 20 tweets for query 1:
Tweet ID: 34729 - Similarity: 0.9654 - Text: indian farmer protest matter british indian farmersprotest httpstcokycwndvyem
Tweet ID: 30746 - Similarity: 0.9549 - Text: stevehank largest crowd ever seen protest except indian farmer protest farmersprotest
Tweet ID: 9676 - Similarity: 0.9541 - Text: indian farmer protest farmersprotest httpstco9mzfbgqaxl
Tweet ID: 23958 - Similarity: 0.9465 - Text: indian journalist ⁦ranaayyub⁩ spoken ⁦msnbc⁩ host chri hay regard farmersprotest india titl “modi’ “arrog power” indian farmers’ protest rana ayyub httpstco1film2cmf7
Tweet ID: 30422 - Similarity: 0.9453 - Text: indian farmer protest govern farmersprotest httpstcoemugoxtabz
Tweet ID: 40582 - Similarity: 0.9439 - Text: veteran indian armi thrown jail peac protest total disrespect indian author farmersprotest httpstcoomaver86xw
Tweet ID: 33299 - Similarity: 0.9435 - Text: indian farmer protest matter british indian bbc news farmersprotest farmersprotest httpstcoruuhvrx