In [65]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time
import nltk
import pandas as pd
import re
import warnings

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [20]:
# Uncomment the next lines to run on Google Colab, please.

#from google.colab import drive
#drive.mount('/content/drive')

In [21]:
DOC_PATH = '../data/Rus_Ukr_war_data.json'
MAPPING_PATH = '../data/Rus_Ukr_war_data_ids.csv'

# Functions and modules

### Data cleaning functions

In [22]:
def from_json_to_dataframe(doc_path = '../data/Rus_Ukr_war_data.json'):
    #with open(doc_path) as fp:
    #    lines = fp.readlines()
    df=pd.read_json(doc_path, lines=True)
    return df


def clean_raw_dataset(raw_df):
    # Select only relevant columns
    clean_df = raw_df[["created_at","id","full_text","entities","favorite_count","retweet_count","user"]]

    # Rename columns
    renames = {"created_at":"date", "full_text":"tweet", "favorite_count":"likes","retweet_count":"retweets", "id":"tweet_id"}
    clean_df = clean_df.rename(columns=renames)

    # Create Series of list of hashtags from `entities` object
    df_hashtags = pd.json_normalize(clean_df["entities"])["hashtags"]
    df_hashtags = df_hashtags.apply(lambda x: [item["text"] for item in x])

    # Create Series of username ids
    df_user = pd.json_normalize(clean_df["user"])["id"].rename("user_id")

    # Merge hashtags and username columns to the DataFrame
    clean_df = pd.concat([clean_df,df_hashtags,df_user], axis=1).drop(columns=["entities","user"])

    # Create URL column manually from the user id and tweet id columns
    clean_df["url"] = "https://twitter.com/" + clean_df["user_id"].astype(str) + "/status/" + clean_df["tweet_id"].astype(str)

    # Extract tags to other users from the tweet body
    clean_df["tags"] = clean_df["tweet"].apply(lambda x: re.findall(r"@(\w+)", x))

    # Returns a DataFrame of tweets with columns ["date", "tweet_id", "tweet", "likes", "retweets", "hashtags", "user_id", "url", "tags", "tags"]
    return clean_df


def remove_emojis(tweet):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emojis
                           u"\U0001F300-\U0001F5FF"  # symbols & pictograms
                           u"\U0001F680-\U0001F6FF"  # map symbols
                           u"\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', tweet)


def clean_tweet(line):
    
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    ## START CODE
    line = re.sub(r'[.,;:!?"\'-@]', '', line).replace("#", "").replace("’", "").replace("“", "").replace("\n"," ")
    line =  line.lower() ## Transform in lowercase
    line = remove_emojis(line).strip().replace("  ", " ")
    line = line.split(" ") ## Tokenize the text to get a list of terms
    line =[word for word in line if word not in stop_words]  ## eliminate the stopwords (HINT: use List Comprehension)
    line =[stemmer.stem(word) for word in line] ## perform stemming (HINT: use List Comprehension)
    line = [word for word in line if word != ""]
    ## END CODE
    
    return line


def process_text_column(column):
    column = column.apply(clean_tweet)
    return column

def join_docs_tweets_dfs(tweets, csv_file='../data/Rus_Ukr_war_data_ids.csv'):
    docs = pd.read_csv(csv_file, sep="\t", header=None)
    docs = docs.rename(columns={0:"doc_id",1:"tweet_id"})
    tweets = tweets.join(docs.set_index('tweet_id'), on='tweet_id')
    return tweets

### Search engine models

In [23]:
class TfIdfIndex():

    def __init__(self, ids, stemmed_text, num_documents):
        
        self.index = defaultdict(list)
        self.tf = defaultdict(list)
        self.df = defaultdict(int)
        self.idf = defaultdict(float)

        stemmed_text = stemmed_text.tolist()
        ids = ids.tolist()

        for i in range(len(ids)):

            tweet = stemmed_text[i]
            tweet_id = ids[i]

            terms = [word for word in tweet]
            page_id = int(tweet_id)

            current_page_index = {}

            for position, term in enumerate(terms):
                try:
                    current_page_index[term][1].append(position)
                except:
                    current_page_index[term] = [page_id, array('I', [position])]

            norm = 0
            for term, posting in current_page_index.items():
                norm += len(posting[1]) ** 2
            norm = math.sqrt(norm)

            for term, posting in current_page_index.items():
                self.tf[term].append(np.round(len(posting[1]) / norm, 4))
                self.df[term] += 1

            for term_page, posting_page in current_page_index.items():
                self.index[term_page].append(posting_page)

            for term in self.df:
                self.idf[term] = np.round(np.log(float(num_documents / self.df[term])), 4)


    def rank(self, stemmed_query, unranked_results):
                                          
        doc_vectors = defaultdict(lambda: [0] * len(stemmed_query))
        query_vector = [0] * len(stemmed_query)

        query_terms_count = collections.Counter(stemmed_query)

        query_norm = la.norm(list(query_terms_count.values()))

        for termIndex, term in enumerate(stemmed_query):
            if term not in self.index:
                continue
            query_vector[termIndex] = query_terms_count[term] / query_norm * self.idf[term]

            for doc_index, (doc, postings) in enumerate(self.index[term]):

                if doc in unranked_results:
                    doc_vectors[doc][termIndex] = self.tf[term][doc_index] * self.idf[term] 

        doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
        doc_scores.sort(reverse=True)
        result_docs = [x[1] for x in doc_scores]

        return result_docs
                                          
    def search(self, query):

        query = clean_tweet(query)
        docs = set()
        for term in query:
            try:
                # store in term_docs the ids of the docs that contain "term"
                term_docs = set([posting[0] for posting in self.index[term]])
                                          
                # retain all documents which contain all words from the query
                if len(docs)==0:
                    docs = term_docs
                else:
                    docs = docs.intersection(term_docs)
            except:
                #term is not in index
                pass
            
        docs = list(docs) #docs are the unranked results
                                          
        ranked_docs = self.rank(query, docs)

        return ranked_docs

Our score will be computed as follows:

$$
\text{Our score} = \frac{1}{4} \text{likes} + \frac{3}{4} \text{retweets}
$$

Where $\text{likes}$ and $\text{retweets}$ are normalized from $0$ to $1$, and in logarithmic scale, as in the EDA in Part 1 we saw that there were a lot of tweets with few likes and retweets and very few tweets with a lot of likes and retweets.

We gave different weights to the number of likes and the number of retweets because we considered that, in Twitter, retweets are more representative of the popularity of a tweet because when retweeting a tweet, it appears in your feed so that your community also interacts with the tweet. On the other hand, liking a tweet only represents the fact that you agree or enjoy the tweet.

In [52]:
class OurScore():

    def __init__(self, ids, stemmed_text, num_documents, our_score, alpha=0.5):
        
        self.index = defaultdict(list)
        self.tf = defaultdict(list)
        self.df = defaultdict(int)
        self.idf = defaultdict(float)
        self.our_score = our_score
        self.alpha = alpha

        stemmed_text = stemmed_text.tolist()
        ids = ids.tolist()

        for i in range(len(ids)):

            tweet = stemmed_text[i]
            tweet_id = ids[i]

            terms = [word for word in tweet]
            page_id = int(tweet_id)

            current_page_index = {}

            for position, term in enumerate(terms):
                try:
                    current_page_index[term][1].append(position)
                except:
                    current_page_index[term] = [page_id, array('I', [position])]

            norm = 0
            for term, posting in current_page_index.items():
                norm += len(posting[1]) ** 2
            norm = math.sqrt(norm)

            for term, posting in current_page_index.items():
                self.tf[term].append(np.round(len(posting[1]) / norm, 4))
                self.df[term] += 1

            for term_page, posting_page in current_page_index.items():
                self.index[term_page].append(posting_page)

            for term in self.df:
                self.idf[term] = np.round(np.log(float(num_documents / self.df[term])), 4)


    def rank(self, stemmed_query, unranked_results):
                                          
        doc_vectors = defaultdict(lambda: [0] * len(stemmed_query))
        query_vector = [0] * len(stemmed_query)

        query_terms_count = collections.Counter(stemmed_query)

        query_norm = la.norm(list(query_terms_count.values()))

        for termIndex, term in enumerate(stemmed_query):
            if term not in self.index:
                continue
            query_vector[termIndex] = query_terms_count[term] / query_norm * self.idf[term]

            for doc_index, (doc, postings) in enumerate(self.index[term]):

                if doc in unranked_results:
                    doc_vectors[doc][termIndex] = self.tf[term][doc_index] * self.idf[term] 

        doc_scores = [[(1-self.alpha) * np.dot(curDocVec, query_vector) + self.alpha * self.our_score[self.our_score["tweet_id"]==doc]["score"].item(), doc] for doc, curDocVec in doc_vectors.items()]
        doc_scores.sort(reverse=True)
        result_docs = [x[1] for x in doc_scores]

        return result_docs
                                          
    def search(self, query):

        query = clean_tweet(query)
        docs = set()
        for term in query:
            try:
                # store in term_docs the ids of the docs that contain "term"
                term_docs = set([posting[0] for posting in self.index[term]])
                                          
                # retain all documents which contain all words from the query
                if len(docs)==0:
                    docs = term_docs
                else:
                    docs = docs.intersection(term_docs)
            except:
                #term is not in index
                pass
            
        docs = list(docs) #docs are the unranked results
                                          
        ranked_docs = self.rank(query, docs)

        return ranked_docs

In [53]:
def compute_our_score (clean_df):

    tweet_ids = clean_df["tweet_id"]

    # LOG-SCALE AND NORMALIZE 0-1
    likes = np.log(clean_df["likes"].apply(lambda x: x + 1))
    likes = (likes - np.min(likes)) / (np.max(likes) - np.min(likes))

    retweets = np.log(clean_df["retweets"].apply(lambda x: x + 1))
    retweets = (retweets - np.min(retweets)) / (np.max(retweets) - np.min(retweets))

    # COMPUTE USING OUR FORMULA
    our_score = likes.apply(lambda x: x*0.25) + likes.apply(lambda x: x*0.75)

    # RETURN DATAFRAME OF TWEET IDS AND SCORES
    return pd.DataFrame({"tweet_id": tweet_ids, "score": our_score})

# Import data

In [54]:
# Import from JSON file
raw_df = from_json_to_dataframe(DOC_PATH)

# Clean raw DataFrame to have a more convenient structure
clean_df = clean_raw_dataset(raw_df)

# Stem tweets
clean_df["stemmed_tweet"] = process_text_column(clean_df["tweet"])

# Join with map csv
clean_df = join_docs_tweets_dfs(clean_df, MAPPING_PATH)


print("Total number of Tweets in the corpus: {}".format(len(clean_df)))

Total number of Tweets in the corpus: 4000


In [55]:
clean_df.head(3)

Unnamed: 0,date,tweet_id,tweet,likes,retweets,hashtags,user_id,url,tags,stemmed_tweet,doc_id
0,2022-09-30 18:39:17+00:00,1575918221013979136,@MelSimmonsFCDO Wrong. Dictator Putin's Fascis...,0,0,"[RussiainvadesUkraine, UkraineRussiaWar]",1404526426330701825,https://twitter.com/1404526426330701825/status...,[MelSimmonsFCDO],"[melsimmonsfcdo, wrong, dictat, putin, fascist...",doc_1
1,2022-09-30 18:38:44+00:00,1575918081461080065,🇺🇦❤️ The Armed Forces liberated the village of...,0,0,"[Drobysheve, Lymansk, Donetsk, UkraineRussiaWa...",1257116113898536961,https://twitter.com/1257116113898536961/status...,[],"[arm, forc, liber, villag, drobyshev, lymansk,...",doc_2
2,2022-09-30 18:38:23+00:00,1575917992390823936,ALERT 🚨Poland preps anti-radiation tablets ove...,0,0,"[NATO, Putin, Russia, RussiaInvadedUkraine, Uk...",1460003892415053828,https://twitter.com/1460003892415053828/status...,[],"[alert, poland, prep, antiradi, tablet, nuclea...",doc_3


In [56]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   date           4000 non-null   datetime64[ns, UTC]
 1   tweet_id       4000 non-null   int64              
 2   tweet          4000 non-null   object             
 3   likes          4000 non-null   int64              
 4   retweets       4000 non-null   int64              
 5   hashtags       4000 non-null   object             
 6   user_id        4000 non-null   int64              
 7   url            4000 non-null   object             
 8   tags           4000 non-null   object             
 9   stemmed_tweet  4000 non-null   object             
 10  doc_id         4000 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(4), object(6)
memory usage: 343.9+ KB


# 1. Comparing TFIDF+Cos vs. TFIDF+OurScore+Cos ranking methods

In [57]:
tfidf_index = TfIdfIndex(ids=clean_df["tweet_id"], stemmed_text = clean_df["stemmed_tweet"], num_documents=len(clean_df))

In [58]:
our_score = compute_our_score(clean_df)
our_score_index = OurScore(ids=clean_df["tweet_id"], stemmed_text = clean_df["stemmed_tweet"], num_documents=len(clean_df), our_score=our_score, alpha=0.5)

In [59]:
query = "tank Kharkiv"
print("Query:", query)
results = our_score_index.search(query)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(results)))
for d_id in results[:top]:
    print("tweet_id = {}\ntweet = {}".format(d_id, clean_df[clean_df["tweet_id"]==d_id]["tweet"].item()))
    print("\n\n-------------------------------------------------------------------------------------------\n\n")

Query: tank Kharkiv

Sample of 10 results out of 27 for the searched query:

tweet_id = 1575739143748927488
tweet = Ukrainian tank holds the ground against two advancing russian tanks.

Unfortunately, the Ukrainian tank takes a fatal hit

#Ukraine #UkraineRussiaCrisis #WarCrimes #UkraineRussiaWar #Kyiv #Mariupol #Chernihiv #Lviv #Kharkiv #Melitopol #Irpin #Bucha #Borodyanka #Odesa #Crimea https://t.co/cYGjgmoeLP


-------------------------------------------------------------------------------------------


tweet_id = 1575528927245770752
tweet = Destroyed Ukrainian tank in the Kharkiv region.

#Ukraine #Ukrainewar #UkraineRussiaWar #Kharkiv https://t.co/xt4JVrWchP


-------------------------------------------------------------------------------------------


tweet_id = 1575187749447307265
tweet = Soldiers of the Ukrainian army seizing a Russian tank in the #Kharkiv region 

#Ukraine-#UkraineRussianWar -#UkraineWar -#UkraineRussiaWar-#Russian https://t.co/7zqjb8cYCE


-------------------

In [61]:
print("Insert your query:\n")
query = "tank Kharkiv"

results_tfidf = tfidf_index.search(query)
results_ourscore = our_score_index.search(query)

k = 10

matches = 0

for i in range(min(len(results_tfidf), k)):
    print(f"TFIDF result: {results_tfidf[i]}, OURSCORE result: {results_ourscore[i]}")
    if results_tfidf[i] == results_ourscore[i]:
        matches += 1

print("There are", str(matches)+"/"+str(k), " tweets in common using the two methods.")

Insert your query:

TFIDF result: 1575739143748927488, OURSCORE result: 1575739143748927488
TFIDF result: 1575528927245770752, OURSCORE result: 1575528927245770752
TFIDF result: 1575187749447307265, OURSCORE result: 1575187749447307265
TFIDF result: 1575893901080027142, OURSCORE result: 1575435463682363392
TFIDF result: 1575435463682363392, OURSCORE result: 1575196507770593282
TFIDF result: 1575610720322211840, OURSCORE result: 1575893901080027142
TFIDF result: 1575600820229242880, OURSCORE result: 1575610720322211840
TFIDF result: 1575196507770593282, OURSCORE result: 1575600820229242880
TFIDF result: 1575889650471665665, OURSCORE result: 1575834054905462784
TFIDF result: 1575834054905462784, OURSCORE result: 1575889650471665665
There are 3/10  tweets in common using the two methods.


# 2. Word2Vec + cosine similarity

In [63]:
QUERY1 = "tank Kharkiv" # What is the discussion regarding a tank in Kharkiv?
QUERY2 = "nord stream" # What discussion are there about the Nord Stream pipeline?
QUERY3 = "territories annexation russia" # What is being said about the annexation of territories in Russia?

QUERY4 = "refugees" # Are there discussions about the Ukranian refugees?
QUERY5 = "kill putin" # Are there discussions or messages about killing president Putin or Putin killing people?

In [86]:
class Word2VecIndex():

    def __init__(self, ids, stemmed_text, num_documents, word2vec_model):
        
        self.index = defaultdict(list)
        self.tf = defaultdict(list)
        self.df = defaultdict(int)
        self.idf = defaultdict(float)
        self.word2vec_model = word2vec_model
        self.collection = dict(zip(ids, stemmed_text))

        stemmed_text = stemmed_text.tolist()
        ids = ids.tolist()

        for i in range(len(ids)):

            tweet = stemmed_text[i]
            tweet_id = ids[i]

            terms = [word for word in tweet]
            page_id = int(tweet_id)

            current_page_index = {}

            for position, term in enumerate(terms):
                try:
                    current_page_index[term][1].append(position)
                except:
                    current_page_index[term] = [page_id, array('I', [position])]

            norm = 0
            for term, posting in current_page_index.items():
                norm += len(posting[1]) ** 2
            norm = math.sqrt(norm)

            for term, posting in current_page_index.items():
                self.tf[term].append(np.round(len(posting[1]) / norm, 4))
                self.df[term] += 1

            for term_page, posting_page in current_page_index.items():
                self.index[term_page].append(posting_page)

            for term in self.df:
                self.idf[term] = np.round(np.log(float(num_documents / self.df[term])), 4)

    def model(self, text):
        
        tweet_vectors = []

        for word in text:
            tweet_vectors.append(self.word2vec_model.wv[word])

        return np.average(tweet_vectors, axis=0) # Computes the average of all the tweet vectors


    def rank(self, stemmed_query, unranked_results):
                                          
        doc_vectors = defaultdict(lambda: [0] * len(stemmed_query))
        query_vector = self.model(stemmed_query)

        for doc in unranked_results:
            doc_vectors[doc] = self.model(self.collection[doc])

        doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
        doc_scores.sort(reverse=True)
        result_docs = [x[1] for x in doc_scores]

        return result_docs
                                          
    def search(self, query):

        query = clean_tweet(query)
        docs = set()
        for term in query:
            try:
                # store in term_docs the ids of the docs that contain "term"
                term_docs = set([posting[0] for posting in self.index[term]])
                                          
                # retain all documents which contain all words from the query
                if len(docs)==0:
                    docs = term_docs
                else:
                    docs = docs.intersection(term_docs)
            except:
                #term is not in index
                pass
            
        docs = list(docs) #docs are the unranked results
                                          
        ranked_docs = self.rank(query, docs)

        return ranked_docs

In [87]:
vector_size = 300

model = Word2Vec(sentences=clean_df["stemmed_tweet"], vector_size=vector_size, window=5, min_count=1, workers=4)

In [88]:
w2v_score_index = Word2VecIndex(ids=clean_df["tweet_id"], stemmed_text=clean_df["stemmed_tweet"], num_documents=len(clean_df), word2vec_model=model)

In [89]:
query = "tank Kharkiv"
print("Query:", query)
results = w2v_score_index.search(query)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(results)))
for d_id in results[:top]:
    print("tweet_id = {}\ntweet = {}".format(d_id, clean_df[clean_df["tweet_id"]==d_id]["tweet"].item()))
    print("\n\n-------------------------------------------------------------------------------------------\n\n")

Query: tank Kharkiv

Sample of 10 results out of 27 for the searched query:

tweet_id = 1575528927245770752
tweet = Destroyed Ukrainian tank in the Kharkiv region.

#Ukraine #Ukrainewar #UkraineRussiaWar #Kharkiv https://t.co/xt4JVrWchP


-------------------------------------------------------------------------------------------


tweet_id = 1575642072295489536
tweet = #Russia #Ukraine #RussianArmy 
Why the Russian 🪖 Army T-72 Tank is Worse Than You Think

#Putin #Russian #RussiaUkraineWar #Russie #Ukrainian #UkraineRussiaWar #Russland #Kharkiv #Zelensky #UkraineWar #Kherson #RussianMobilization #USA #Russians #NATO
https://t.co/XZU7FObkVS


-------------------------------------------------------------------------------------------


tweet_id = 1575482368630353920
tweet = "#Russia #Ukraine #RussianArmy 
Why the Russian 🪖 Army T-72 Tank is Worse Than You Think

#Putin #Russian #RussiaUkraineWar #Russie #Ukrainian #UkraineRussiaWar #Russland #Kharkiv #Zelensky #UkraineWar #Kherson #Russi