# Information Retrieval and Web Analytics

# Indexing + Modeling (TF-IDF)

In [1]:
# Mount Google Drive inside the Colab runtime so the dataset files can be read.
# NOTE(review): later cells open paths with the relative prefix 'drive/...', which
# presumably relies on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Load Python packages
Let's first import all the packages that you will need during this assignment.

In [2]:
import nltk
nltk.download('stopwords')
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import string
import json
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


#### Load data into memory
The dataset is stored in a JSON file and contains 4,000 tweets related to Hurricane Ian. Each line read from the file represents one tweet, together with its corresponding information.

In [3]:
# Location of the raw tweets file (one JSON-encoded tweet per line).
docs_path = 'drive/Shareddrives/IR/Project/data/tw_hurricane_data.json'
# Be explicit about the encoding so reading does not depend on the locale default.
with open(docs_path, encoding='utf-8') as fp:
    lines = fp.readlines()

In [4]:
# Report how many raw tweet lines were loaded.
tweet_count = len(lines)
print("Total number of tweets: {}".format(tweet_count))

Total number of tweets: 4000


First, we will implement the mandatory preprocessing in the function ```clean_tweet(line, emojis)```.

It will take as an input a line, and it will:

- Transform into lower case
- Remove punctuation marks
- Tokenize the text to get a list of terms (*split function*)
- Remove stop words
- Stem terms (example: to stem the term 'researcher', we will use ```stemmer.stem('researcher')```)

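Extra tasks, such as removing links and emojis, are also handled inside this function; emoji removal is only applied when the `emojis` argument is true.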

In [5]:
# Characters stripped from the text when ``emojis`` is true.
# Based on https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# NOTE: the range U+24C2-U+1F251 is very broad; besides emoji it also covers
# every code point in between (CJK ideographs included), so such text is removed too.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"  # zero-width joiner
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # variation selector
                            u"\u3030"
                            u"\u2026"  # horizontal ellipsis
                            u"\u2019"  # right single quotation mark
                            u"\u2066"
                            u"\u2069"
                            u"\u231b"
                            "]+", flags=re.UNICODE)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache so the stop-word list and the stemmer are created once
# instead of on every call.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return the cached (English stop-word set, Porter stemmer) pair."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _NLTK_RESOURCES["stemmer"] = PorterStemmer()
    return _NLTK_RESOURCES["stop_words"], _NLTK_RESOURCES["stemmer"]


def clean_tweet(line, emojis):
    """
    Preprocess a piece of tweet text: lowercase it, strip punctuation, links and
    (optionally) emojis, tokenize on whitespace, drop English stop words and stem.

    Arguments:
    line -- value to be preprocessed; it is converted to a string first
    emojis -- if true, characters matched by the emoji pattern are removed as well

    Returns:
    a list of stemmed tokens (empty when ``line`` is None or nothing is left)
    """
    if line is None:  # str(None) would otherwise produce the bogus token 'none'
        return []

    text = str(line).lower()

    # Replace literal backslash-n sequences; real newlines are handled by split() below.
    text = re.sub(r'\\n', ' ', text)

    text = text.translate(_PUNCTUATION_TABLE)  # remove punctuation marks

    # Punctuation is already gone, so a link now looks like 'httpstcoabc';
    # everything from 'http' up to the next whitespace is dropped.
    text = re.sub(r'http\S+', '', text)

    if emojis:
        text = _EMOJI_PATTERN.sub(r'', text)

    tokens = text.split()  # whitespace tokenization
    if not tokens:
        return []

    stop_words, stemmer = _nltk_resources()
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

In [6]:
def process_json_line(json_line):
  """
  Build a simplified record from one parsed tweet (a dict loaded from JSON).

  Argument:
  json_line -- dict with the raw tweet fields

  Returns:
  a new dict holding, for the fields present in the input:
    "Tweet"    -- cleaned tokens of "full_text" (emojis removed)
    "Date"     -- cleaned tokens of "created_at"
    "Retweets" -- "retweet_count", unchanged
    "Likes"    -- "favorite_count", unchanged
    "Tweet_id" -- one-element list with "id" as a string
    "Username" -- cleaned tokens of user["screen_name"]
    "Hashtags" -- list with the cleaned tokens of every hashtag text
    "Url"      -- link to the tweet

  Raises:
  KeyError if the tweet has no "user" or no "id" field (both are needed for the URL).
  """

  new_dict = {}

  # Keys are added in the order they appear in the input, as before.
  for key, value in json_line.items():

    if key == "full_text":
      new_dict["Tweet"] = clean_tweet(value, True)  # process tweet

    elif key == "created_at":
      new_dict["Date"] = clean_tweet(value, False)  # process tweet date

    elif key == "retweet_count":
      new_dict["Retweets"] = value  # number of retweets, no need of processing

    elif key == "favorite_count":
      new_dict["Likes"] = value  # number of likes, no need of processing

    elif key == "id":
      new_dict["Tweet_id"] = [str(value)]  # store tweet id as str

    elif key == "user":
      new_dict["Username"] = clean_tweet(value["screen_name"], True)  # process username

    elif key == "entities":  # process hashtags
      new_dict["Hashtags"] = [clean_tweet(tag["text"], False) for tag in value["hashtags"]]

  # Build the URL from the raw screen name: the cleaned one is lower-cased,
  # loses punctuation such as '_', may be stemmed, and can even be an empty
  # list (stop-word usernames), which used to raise IndexError here.
  screen_name = str(json_line["user"]["screen_name"])
  new_dict["Url"] = 'https://twitter.com/' + screen_name + '/status/' + str(new_dict["Tweet_id"][0])  # generate url

  return new_dict

In [7]:
# Parse every raw line as JSON and convert it into the cleaned record format.
json_processed = [process_json_line(json.loads(raw_line)) for raw_line in lines]

#### Mapping ids
Map each tweet ID to its corresponding document ID.

In [8]:
import csv

# Tab-separated file; each row is read as (document id, tweet id).
docs_path = 'drive/Shareddrives/IR/Project/data/tweet_document_ids_map.csv'

mapping_dict = {}  # tweet id -> document id

# The old 'rU' mode no longer exists in recent Python versions; newline='' is
# the mode the csv module expects, and the with-block closes the file.
with open(docs_path, newline='') as mapping_file:
  for row in csv.reader(mapping_file, delimiter="\t", quotechar='|'):
    if len(row) >= 2:  # skip blank or incomplete rows instead of raising IndexError
      mapping_dict[row[1]] = row[0]

# Add the document id to every processed tweet.
# A tweet id missing from the mapping still raises KeyError, as before.
for tweet in json_processed:
  tweet['doc_id'] = [str(mapping_dict[tweet['Tweet_id'][0]])]


In [9]:
# Display the first processed tweet to check the cleaned fields and the added 'doc_id'.
json_processed[0]

{'Date': ['fri', 'sep', '30', '183908', '0000', '2022'],
 'Tweet_id': ['1575918182698979328'],
 'Tweet': ['keep',
  'spin',
  'us',
  '7',
  'pmgo',
  'away',
  'alreadi',
  'hurricaneian'],
 'Hashtags': [['hurricaneian']],
 'Username': ['suzjdean'],
 'Retweets': 0,
 'Likes': 0,
 'Url': 'https://twitter.com/suzjdean/status/1575918182698979328',
 'doc_id': ['doc_1']}