In [148]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time
import nltk
import pandas as pd
import re
import warnings

In [149]:
#from google.colab import drive
#drive.mount('/content/drive')

# Functions and modules

### Data cleaning functions

In [150]:
def from_json_to_dataframe(doc_path = '../data/Rus_Ukr_war_data.json'):
    with open(doc_path) as fp:
        lines = fp.readlines()
    df=pd.read_json(doc_path, lines=True)
    return df


def clean_raw_dataset(raw_df):
    # Select only relevant columns
    clean_df = raw_df[["created_at","id","full_text","entities","favorite_count","retweet_count","user"]]

    # Rename columns
    renames = {"created_at":"date", "full_text":"tweet", "favorite_count":"likes","retweet_count":"retweets", "id":"tweet_id"}
    clean_df = clean_df.rename(columns=renames)

    # Create Series of list of hashtags from `entities` object
    df_hashtags = pd.json_normalize(clean_df["entities"])["hashtags"]
    df_hashtags = df_hashtags.apply(lambda x: [item["text"] for item in x])

    # Create Series of username ids
    df_user = pd.json_normalize(clean_df["user"])["id"].rename("user_id")

    # Merge hashtags and username columns to the DataFrame
    clean_df = pd.concat([clean_df,df_hashtags,df_user], axis=1).drop(columns=["entities","user"])

    # Create URL column manually from the user id and tweet id columns
    clean_df["url"] = "https://twitter.com/" + clean_df["user_id"].astype(str) + "/status/" + clean_df["tweet_id"].astype(str)

    # Extract tags to other users from the tweet body
    clean_df["tags"] = clean_df["tweet"].apply(lambda x: re.findall(r"@(\w+)", x))

    # Returns a DataFrame of tweets with columns ["date", "tweet_id", "tweet", "likes", "retweets", "hashtags", "user_id", "url", "tags", "tags"]
    return clean_df


def remove_emojis(tweet):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emojis
                           u"\U0001F300-\U0001F5FF"  # symbols & pictograms
                           u"\U0001F680-\U0001F6FF"  # map symbols
                           u"\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', tweet)


def clean_tweet(line):
    
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    ## START CODE
    line = re.sub(r'[.,;:!?"\'-@]', '', line).replace("#", "").replace("’", "").replace("“", "").replace("\n"," ")
    line =  line.lower() ## Transform in lowercase
    line = remove_emojis(line).strip().replace("  ", " ")
    line = line.split(" ") ## Tokenize the text to get a list of terms
    line =[word for word in line if word not in stop_words]  ## eliminate the stopwords (HINT: use List Comprehension)
    line =[stemmer.stem(word) for word in line] ## perform stemming (HINT: use List Comprehension)
    line = [word for word in line if word != ""]
    ## END CODE
    
    return line


def process_text_column(column):
    column = column.apply(clean_tweet)
    return column

def join_docs_tweets_dfs(tweets, csv_file='../data/Rus_Ukr_war_data_ids.csv'):
    docs = pd.read_csv(csv_file, sep="\t", header=None)
    docs = docs.rename(columns={0:"doc_id",1:"tweet_id"})
    tweets = tweets.join(docs.set_index('tweet_id'), on='tweet_id')
    return tweets

### Indexing

In [151]:
term_docs = {1, 2, 3, 4, 6}
b = {1, 3}

c = term_docs.intersection(b)
print(c)

{1, 3}


In [152]:
class InvertedIndex():

    def __init__(self, ids, stemmed_text):
        
        self.index = defaultdict(list)

        stemmed_text = stemmed_text.tolist()
        ids = ids.tolist()

        for i in range(len(ids)):

            tweet = stemmed_text[i]
            tweet_id = ids[i]

            terms = [word for word in tweet]
            page_id = int(tweet_id)

            current_page_index = {}

            for position, term in enumerate(terms):
                try:
                    current_page_index[term][1].append(position)
                except:
                    current_page_index[term] = [page_id, array('I', [position])]

            for term_page, posting_page in current_page_index.items():
                self.index[term_page].append(posting_page)


    def search(self, query):

        query = clean_tweet(query)
        docs = set()
        for term in query:
            try:
                term_docs = [posting[0] for posting in self.index[term]]
                term_docs = set(term_docs)
                if len(docs)==0:
                    docs = docs.union(term_docs)
                else:
                    docs = docs.intersection(term_docs)
            except:
                pass
        docs = list(docs)
        return docs




class TfIdfIndex():

    def __init__(self, ids, stemmed_text, num_documents):
        
        self.index = defaultdict(list)
        self.tf = defaultdict(list)
        self.df = defaultdict(int)
        self.idf = defaultdict(float)

        stemmed_text = stemmed_text.tolist()
        ids = ids.tolist()

        for i in range(len(ids)):

            tweet = stemmed_text[i]
            tweet_id = ids[i]

            terms = [word for word in tweet]
            page_id = int(tweet_id)

            current_page_index = {}

            for position, term in enumerate(terms):
                try:
                    current_page_index[term][1].append(position)
                except:
                    current_page_index[term] = [page_id, array('I', [position])]

            norm = 0
            for term, posting in current_page_index.items():
                norm += len(posting[1]) ** 2
            norm = math.sqrt(norm)

            for term, posting in current_page_index.items():
                self.tf[term].append(np.round(len(posting[1]) / norm, 4))
                self.df[term] += 1

            for term_page, posting_page in current_page_index.items():
                self.index[term_page].append(posting_page)

            for term in self.df:
                self.idf[term] = np.round(np.log(float(num_documents / self.df[term])), 4)


    def rank(self, stemmed_query, unranked_results):
                                          
        doc_vectors = defaultdict(lambda: [0] * len(stemmed_query))
        query_vector = [0] * len(stemmed_query)

        # compute the norm for the query tf
        query_terms_count = collections.Counter(stemmed_query)  # get the frequency of each term in the query.
        # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})

        query_norm = la.norm(list(query_terms_count.values()))

        for termIndex, term in enumerate(stemmed_query):  #termIndex is the index of the term in the query
            if term not in self.index:
                continue

            # TODO: check how to vectorize the query
            # query_vector[termIndex]=idf[term]  # original
            ## Compute tf*idf(normalize TF as done with documents)
            query_vector[termIndex] = query_terms_count[term] / query_norm * self.idf[term]

            # Generate doc_vectors for matching docs
            for doc_index, (doc, postings) in enumerate(self.index[term]):
                # Example of [doc_index, (doc, postings)]
                # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
                # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
                # term is in doc 26 in positions 1,4, .....
                # term is in doc 33 in positions 26,33, .....

                #tf[term][0] will contain the tf of the term "term" in the doc 26
                if doc in unranked_results:
                    doc_vectors[doc][termIndex] = self.tf[term][doc_index] * self.idf[term] 

        # Calculate the score of each doc
        # compute the cosine similarity between queyVector and each docVector:
        # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
        # see np.dot

        doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
        doc_scores.sort(reverse=True)
        #print(doc_scores)
        result_docs = [x[1] for x in doc_scores]
        #print document titles instead if document id's
        #result_docs=[ title_index[x] for x in result_docs ]

        return result_docs
                                          
    def search(self, query):

        query = clean_tweet(query)
        docs = set()
        for term in query:
            try:
                # store in term_docs the ids of the docs that contain "term"
                term_docs = set([posting[0] for posting in self.index[term]])
                                          
                # retain all documents which contain all words from the query
                if len(docs)==0:
                    docs = docs.union(term_docs)
                else:
                    docs = docs.intersection(term_docs)
            except:
                #term is not in index
                pass
        docs = list(docs) #docs are the unranked results
                                          
        ranked_docs = self.rank(query, docs)

        return ranked_docs

# Execution

In [153]:
doc_path = '../data/Rus_Ukr_war_data.json'
csv_path = '../data/Rus_Ukr_war_data_ids.csv'

# Import from JSON file
raw_df = from_json_to_dataframe(doc_path)

# Clean raw DataFrame to have a more convenient structure
clean_df = clean_raw_dataset(raw_df)

# 
clean_df["stemmed_tweet"] = process_text_column(clean_df["tweet"])


clean_df = join_docs_tweets_dfs(clean_df, csv_path)


print("Total number of Tweets in the corpus: {}".format(len(clean_df)))
clean_df.head()

Total number of Tweets in the corpus: 4000


Unnamed: 0,date,tweet_id,tweet,likes,retweets,hashtags,user_id,url,tags,stemmed_tweet,doc_id
0,2022-09-30 18:39:17+00:00,1575918221013979136,@MelSimmonsFCDO Wrong. Dictator Putin's Fascis...,0,0,"[RussiainvadesUkraine, UkraineRussiaWar]",1404526426330701825,https://twitter.com/1404526426330701825/status...,[MelSimmonsFCDO],"[melsimmonsfcdo, wrong, dictat, putin, fascist...",doc_1
1,2022-09-30 18:38:44+00:00,1575918081461080065,🇺🇦❤️ The Armed Forces liberated the village of...,0,0,"[Drobysheve, Lymansk, Donetsk, UkraineRussiaWa...",1257116113898536961,https://twitter.com/1257116113898536961/status...,[],"[arm, forc, liber, villag, drobyshev, lymansk,...",doc_2
2,2022-09-30 18:38:23+00:00,1575917992390823936,ALERT 🚨Poland preps anti-radiation tablets ove...,0,0,"[NATO, Putin, Russia, RussiaInvadedUkraine, Uk...",1460003892415053828,https://twitter.com/1460003892415053828/status...,[],"[alert, poland, prep, antiradi, tablet, nuclea...",doc_3
3,2022-09-30 18:38:03+00:00,1575917907774967809,I’m still waiting for my google map 🗺️ to upda...,0,0,"[Putin, UkraineRussiaWar]",285766081,https://twitter.com/285766081/status/157591790...,[],"[im, still, wait, googl, map, updat, russia, n...",doc_4
4,2022-09-30 18:37:56+00:00,1575917878410301441,@EmmanuelMacron probably you're right or you h...,0,0,"[European, UkraineRussiaWar]",1537193346107686915,https://twitter.com/1537193346107686915/status...,[EmmanuelMacron],"[emmanuelmacron, probabl, your, right, say, an...",doc_5


In [154]:
inverted_index = InvertedIndex(ids=clean_df["tweet_id"], stemmed_text = clean_df["stemmed_tweet"])

In [155]:
print("Insert your query:\n")
query = input()
print("Query:", query)
results = inverted_index.search(query)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(results)))
for d_id in results[:top]:
    print("tweet_id = {}\ntweet = {}".format(d_id, clean_df[clean_df["tweet_id"]==d_id]["tweet"].item()))
    print("\n\n-------------------------------------------------------------------------------------------\n\n")

Insert your query:

Query: 

Sample of 10 results out of 0 for the searched query:



In [156]:
# Execute after implementing TFIDF INDEX

num_tweets = len(clean_df)
tf_idf_index = TfIdfIndex(ids=clean_df["tweet_id"], stemmed_text = clean_df["stemmed_tweet"], num_documents=num_tweets)

In [157]:
# Also execute after having implemented TFIDF INDEX

print("Insert your query:\n")
query = input()
print("Query:", query)

results = tf_idf_index.search(query)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(results)))
for d_id in results[:top]:
    print("tweet_id= {}\ntweet: {}".format(d_id, clean_df[clean_df["tweet_id"]==d_id]["tweet"].item()))
    print("\n\n-------------------------------------------------------------------------------------------\n\n")

Insert your query:

Query: tanks helicopters

Sample of 10 results out of 1 for the searched query:

tweet_id= 1575891775834972160
tweet: Russia Loses 2 Jets, 13 Tanks and a Helicopter in a Single Day: Ukraine https://t.co/qVoOoN1dX0 #UkraineWillWin #UkraineRussiaWar #Ukraine #UkraineWar #NATO #RussiaInvadedUkraine #Russians #RussianUkrainianWar #Russia


-------------------------------------------------------------------------------------------




In [158]:
### EVALUATION PART

# We need to import the evaluation_gt file
# I think it is a just a csv, so just pandas read csv

# We have to do two separate evaluations

# First, running the queries in the pdf (they call them information needs)
# and then computing the P@K, R@K, etc. for the 3 queries they propose

# Second, inventing two new queries, and assessing ourselves if the top N results given by 
# our algorithm are relevant (1) or not (0), and then computing P@K, R@K, etc.

In [159]:
evaluation_df = pd.read_csv("../data/Evaluation_gt.csv")

evaluation_df.head()

Unnamed: 0,doc,query_id,label
0,doc_1452,Q3,1
1,doc_2908,Q3,1
2,doc_618,Q3,1
3,doc_489,Q3,1
4,doc_110,Q3,1


In [160]:
QUERY1 = "tank Kharkiv" # What is the discussion regarding a tank in Kharkiv?
QUERY2 = "nord stream" # What discussion are there about the Nord Stream pipeline?
QUERY3 = "territories annexation russia" # What is being said about the annexation of territories in Russia?

QUERY4 = "" # 
QUER5 = "putin kill" # Are there discussions or messages about killing president Putin?

In [210]:
def precision_at_k(doc_score, y_score, k=3): # P@K
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    precision @k : float

    """
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])
    relevant = sum(doc_score == 1)
    return float(relevant) / k


def recall_at_k(): # R@K
    pass

def avg_precision_at_k(doc_score, y_score, k=3):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    average precision @k : float
    """

    doc_score = np.array(doc_score)
    y_score = np.array(y_score)

    gtp = np.sum(doc_score == 1)
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(doc_score, order[:k])

    if gtp == 0:
        return 0
    n_relevant_at_i = 0
    prec_at_i = 0
    for i in range(len(doc_score)):
        if doc_score[i] == 1:
            n_relevant_at_i += 1
            prec_at_i += n_relevant_at_i / (i + 1)
    return prec_at_i / gtp

def f1_score(): # F1
    pass

def mean_avg_precision(): # MAP
    pass

def mean_reciprocal_rank(): # MRR
    pass

def normalized_discounted_cumulative_gain(): # NDCG
    pass

In [213]:
print("PERFORMING QUERY1:", QUERY1)

current_query = "Q1"

warnings.filterwarnings("ignore", category=pd.core.common.SettingWithCopyWarning)
ground_truth = evaluation_df.loc[evaluation_df["query_id"]==current_query]
ground_truth_aux = evaluation_df.loc[(evaluation_df["query_id"]!=current_query)&(evaluation_df["label"]==1)]
ground_truth_aux["label"] = 0
ground_truth = pd.concat([ground_truth, ground_truth_aux])
ground_truth = ground_truth.merge(clean_df, left_on='doc', right_on='doc_id', how='left')
warnings.filterwarnings("default", category=pd.core.common.SettingWithCopyWarning)
ground_truth = ground_truth[["tweet_id","label"]].rename(columns={"label": "true_score"})


print("\nUsing stadard Inverted Index...")
results = inverted_index.search(QUERY1)
results = pd.DataFrame({"tweet_id":results, "predicted_score":range(1, len(results)+1)})
results = results.merge(ground_truth, on='tweet_id', how='left')
print(results.head(5), "\n...")
# In results we have tweet_ids, predicted_scores and true_score
# TODO: CALL ALL EVALUATION FUNCTIONS AND PRINT THEIR RESULTS
print("P@K:", precision_at_k(results["true_score"], results["predicted_score"]))
print("AP@K:", avg_precision_at_k(results["true_score"], results["predicted_score"]))


print("\nUsing TF-IDF Index...")
results = tf_idf_index.search(QUERY1)
results = pd.DataFrame({"tweet_id":results, "predicted_score":range(1, len(results)+1)})
results = results.merge(ground_truth, on='tweet_id', how='left')
print(results.head(5), "\n...")
# In results we have tweet_ids, predicted_scores and true_score
# TODO: CALL ALL EVALUATION FUNCTIONS AND PRINT THEIR RESULTS
print("P@K:", precision_at_k(results["true_score"], results["predicted_score"]))
print("AP@K:", avg_precision_at_k(results["true_score"], results["predicted_score"]))




PERFORMING QUERY1: tank Kharkiv

Using stadard Inverted Index...
              tweet_id  predicted_score  true_score
0  1575482368630353920                1         NaN
1  1575889650471665665                2         NaN
2  1575196507770593282                3         NaN
3  1575204591469150210                4         NaN
4  1575458070381092866                5         NaN 
...
P@K: 0.3333333333333333
AP@K: 0.047619047619047616

Using TF-IDF Index...
              tweet_id  predicted_score  true_score
0  1575739143748927488                1         NaN
1  1575528927245770752                2         1.0
2  1575187749447307265                3         1.0
3  1575893901080027142                4         NaN
4  1575435463682363392                5         1.0 
...
P@K: 0.3333333333333333
AP@K: 0.14285714285714285
