In [1]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time
import nltk
import pandas as pd
import re

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

# Functions and modules

### Data cleaning functions

In [2]:
def from_json_to_dataframe(doc_path = '../data/Rus_Ukr_war_data.json'):
    with open(doc_path) as fp:
        lines = fp.readlines()
    df=pd.read_json(doc_path, lines=True)
    return df


def clean_raw_dataset(raw_df):
    # Select only relevant columns
    clean_df = raw_df[["created_at","id_str","full_text","entities","favorite_count","retweet_count","user"]]

    # Rename columns
    renames = {"created_at":"date", "full_text":"tweet", "favorite_count":"likes","retweet_count":"retweets", "id_str":"tweet_id"}
    clean_df = clean_df.rename(columns=renames)

    # Create Series of list of hashtags from `entities` object
    df_hashtags = pd.json_normalize(clean_df["entities"])["hashtags"]
    df_hashtags = df_hashtags.apply(lambda x: [item["text"] for item in x])

    # Create Series of username ids
    df_user = pd.json_normalize(clean_df["user"])["id"].rename("user_id")

    # Merge hashtags and username columns to the DataFrame
    clean_df = pd.concat([clean_df,df_hashtags,df_user], axis=1).drop(columns=["entities","user"])

    # Create URL column manually from the user id and tweet id columns
    clean_df["url"] = "https://twitter.com/" + clean_df["user_id"].astype(str) + "/status/" + clean_df["tweet_id"].astype(str)

    # Extract tags to other users from the tweet body
    clean_df["tags"] = clean_df["tweet"].apply(lambda x: re.findall(r"@(\w+)", x))

    # Returns a DataFrame of tweets with columns ["date", "tweet_id", "tweet", "likes", "retweets", "hashtags", "user_id", "url", "tags", "tags"]
    return clean_df


def remove_emojis(tweet):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emojis
                           u"\U0001F300-\U0001F5FF"  # symbols & pictograms
                           u"\U0001F680-\U0001F6FF"  # map symbols
                           u"\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', tweet)


def clean_tweet(line):
    
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    ## START CODE
    line = re.sub(r'[.,;:!?"\'-@]', '', line).replace("#", "").replace("’", "").replace("“", "").replace("\n"," ")
    line =  line.lower() ## Transform in lowercase
    line = remove_emojis(line).strip().replace("  ", " ")
    line = line.split(" ") ## Tokenize the text to get a list of terms
    line =[word for word in line if word not in stop_words]  ## eliminate the stopwords (HINT: use List Comprehension)
    line =[stemmer.stem(word) for word in line] ## perform stemming (HINT: use List Comprehension)
    line = [word for word in line if word != ""]
    ## END CODE
    
    return line


def process_text_column(column):
    column = column.apply(clean_tweet)
    return column

def join_docs_tweets_dfs(tweets, csv_file='../data/Rus_Ukr_war_data_ids.csv'):
    docs = pd.read_csv(csv_file, sep="\t", header=None)
    docs = docs.rename(columns={0:"doc_id",1:"tweet_id"})
    tweets = tweets.join(docs.set_index('tweet_id'), on='tweet_id')
    return tweets

### Indexing

In [7]:
term_docs = {1, 2, 3, 4, 6}
b = {1, 3}

c = term_docs.intersection(b)
print(c)

{1, 3}


In [17]:
class InvertedIndex():

    def __init__(self, ids, stemmed_text):
        
        self.index = defaultdict(list)

        stemmed_text = stemmed_text.tolist()
        ids = ids.tolist()

        for i in range(len(ids)):  # Remember, lines contain all documents from file

            tweet = stemmed_text[i]
            tweet_id = ids[i]

            terms = [word for word in tweet]
            page_id = int(tweet_id)

            ## ===============================================================
            ## create the index for the current page and store it in current_page_index (current_page_index)
            ## current_page_index ==> { ‘term1’: [current_doc, [list of positions]], ...,‘term_n’: [current_doc, [list of positions]]}

            ## Example: if the curr_doc has id 1 and its text is "web retrieval information retrieval":

            ## current_page_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

            ## the term ‘web’ appears in document 1 in positions 0,
            ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
            ## ===============================================================

            current_page_index = {}

            for position, term in enumerate(terms): # terms contains page_title + page_text. Loop over all terms
                try:
                    # if the term is already in the index for the current page (current_page_index)
                    # append the position to the corresponding list (elemento 1 del arreglo, el 0 es la id del documento)
                    current_page_index[term][1].append(position)
                except:
                    # Add the new term as dict key and initialize the array of positions and add the position
                    current_page_index[term] = [page_id, array('I', [position])]  #'I' indicates unsigned int (int in Python)

            # merge the current page index with the main index
            for term_page, posting_page in current_page_index.items():
                self.index[term_page].append(posting_page)

    def search(self, query):
        query = clean_tweet(query) #so that stemed terms are matched in the index
        docs = set()
        for term in query:
            try:
                # store in term_docs the ids of the docs that contain "term"
                term_docs = [posting[0] for posting in self.index[term]]
                # docs = docs Union term_docs
                # docs |= set(term_docs)
                # MARC: Documents information: Since we are dealing with conjunctive queries (AND),
                # each of the returned documents should contain all the words in the query. -> The intersection
                term_docs = set(term_docs)
                docs = term_docs.intersection(term_docs)
            except:
                #term is not in index
                pass
        docs = list(docs)
        return docs

class TfIdfIndex():

    def __init__(self):
        pass

        # TODO: Copy the create_index_tfidf function from lab 1
        # You can also use the class InvertedIndex above as reference as to how to code classes in Python
        # Have in mind that __init__ should not return anything. The variables that...
        # create_index_tfidf returns (index, tf, df, idf) should be stored in the class using:
        # self.index, self.tf, etc. We don't need to store title_index, it was just for the lab1


    def rank(self, stemmed_query, unranked_results):
        pass
        # TODO: Copy the rank_documents function from lab 1
        # Have in mind that we don't need to pass index, idf nor tf as they are stored in the class using
        # self.index, self.idf, etc.
        # Note that terms is called here stemmed query and docs is called here unranked_results
        # Also have in mind that this function is called from self.query(), so the unranked_results input, 
        # that in lab1 is called docs, are the results of the query that need to be sorted by importance

    def query(self, query):
        pass
        # TODO: copy the search_tf_idf() function from lab1
        # Remember that we dn't give the index as input because we save it in self.index
        # Remember that to call what in the lab was rank_documents() we have to call self.rank()


# Execution

In [13]:
doc_path = '../data/Rus_Ukr_war_data.json'
csv_path = '../data/Rus_Ukr_war_data_ids.csv'

# Import from JSON file
raw_df = from_json_to_dataframe(doc_path)

# Clean raw DataFrame to have a more convenient structure
clean_df = clean_raw_dataset(raw_df)

# 
clean_df["stemmed_tweet"] = process_text_column(clean_df["tweet"])


clean_df = join_docs_tweets_dfs(clean_df, csv_path)


print("Total number of Tweets in the corpus: {}".format(len(clean_df)))
clean_df.head()

Total number of Tweets in the corpus: 4000


Unnamed: 0,date,tweet_id,tweet,likes,retweets,hashtags,user_id,url,tags,stemmed_tweet,doc_id
0,2022-09-30 18:39:17+00:00,1575918221013979136,@MelSimmonsFCDO Wrong. Dictator Putin's Fascis...,0,0,"[RussiainvadesUkraine, UkraineRussiaWar]",1404526426330701825,https://twitter.com/1404526426330701825/status...,[MelSimmonsFCDO],"[melsimmonsfcdo, wrong, dictat, putin, fascist...",doc_1
1,2022-09-30 18:38:44+00:00,1575918081461080064,🇺🇦❤️ The Armed Forces liberated the village of...,0,0,"[Drobysheve, Lymansk, Donetsk, UkraineRussiaWa...",1257116113898536961,https://twitter.com/1257116113898536961/status...,[],"[arm, forc, liber, villag, drobyshev, lymansk,...",
2,2022-09-30 18:38:23+00:00,1575917992390823936,ALERT 🚨Poland preps anti-radiation tablets ove...,0,0,"[NATO, Putin, Russia, RussiaInvadedUkraine, Uk...",1460003892415053828,https://twitter.com/1460003892415053828/status...,[],"[alert, poland, prep, antiradi, tablet, nuclea...",doc_3
3,2022-09-30 18:38:03+00:00,1575917907774967808,I’m still waiting for my google map 🗺️ to upda...,0,0,"[Putin, UkraineRussiaWar]",285766081,https://twitter.com/285766081/status/157591790...,[],"[im, still, wait, googl, map, updat, russia, n...",
4,2022-09-30 18:37:56+00:00,1575917878410301440,@EmmanuelMacron probably you're right or you h...,0,0,"[European, UkraineRussiaWar]",1537193346107686915,https://twitter.com/1537193346107686915/status...,[EmmanuelMacron],"[emmanuelmacron, probabl, your, right, say, an...",


In [19]:
inverted_index = InvertedIndex(ids=clean_df["tweet_id"], stemmed_text = clean_df["stemmed_tweet"])

In [23]:
print("Insert your query:\n")
query = input()
docs = inverted_index.search(query)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(d_id, clean_df[clean_df["tweet_id"]==d_id]["tweet"].item()))
    print("\n\n-------------------------------------------------------------------------------------------\n\n")

Insert your query:


Sample of 10 results out of 530 for the searched query:

page_id= 1575915581278420992 - page_title: Russian bombers capable of carrying nukes detected near #Finland
The bombers, capable of carrying cruise missiles and strategic nuclear weapons
#Ukraine-#UkraineRussianWar -#UkraineWar -#UkraineRussiaWar
https://t.co/LxV4dGSpfW


-------------------------------------------------------------------------------------------


page_id= 1575908722077229056 - page_title: Big #BreakingNews | Russian bombers capable of carrying nukes detected near Finland

#RussianMobilization #RussianBombers #Finland #UkraineRussiaWar #nuclearweapons 

https://t.co/aLPdcrNWdX


-------------------------------------------------------------------------------------------


page_id= 1575821705695535104 - page_title: #Russia #Ukraine 
NATO: How Finland will fight in a war with Russia after Ukraine invasion
#Finland accepted into #NATO

#Putin #Russian #RussianArmy #Ukrainian #UkraineRussiaWar #Ru

In [None]:
# Execute after implementing TFIDF INDEX

num_tweets = len(clean_df)
tf_idf_index = TfIdfIndex(clean_df, num_tweets)

In [None]:
# Also execute after having implemented TFIDF INDEX

print("Insert your query (i.e.: presidents visiting Kyiv):\n")
query = input()
ranked_docs = tf_idf_index.search(query)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    print("page_id= {} - page_title: {}".format(d_id, clean_df[clean_df["tweet_id"]==d_id]["tweet"].item()))
    print("\n\n-------------------------------------------------------------------------------------------\n\n")

In [None]:
### EVALUATION PART

# We need to import the evaluation_gt file
# I think it is a just a csv, so just pandas read csv

# We have to do two separate evaluations

# First, running the queries in the pdf (they call them information needs)
# and then computing the P@K, R@K, etc. for the 3 queries they propose

# Second, inventing two new queries, and assessing ourselves if the top N results given by 
# our algorithm are relevant (1) or not (0), and then computing P@K, R@K, etc.