# IRWA-2022-u210426-part-1

## 1) Import modules

In [62]:
import datetime
import time

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import regex as re
import numpy as np
import json

After importing the modules, I download the stop-word list from the `nltk` package:

In [63]:
# Fetch the NLTK stop-word corpus; preprocess() reads it through stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 2) Load data and build map
After the setup, I define a function `load_data(path_tweets, path_docs_tweet)`, where `path_tweets` is the path to the file that stores the tweets in JSON format (one tweet per line) and `path_docs_tweet` is the path to a second file that contains each `doc_id` and the corresponding `tweet_id`.

In [64]:
def load_data(path_tweets, path_docs_tweet):
    """
    Load the tweets and map every document id to its tweet.

    Arguments:
    path_tweets -- path to a file holding one JSON-encoded tweet per line
    path_docs_tweet -- path to a file whose lines hold a doc id followed by a
                       tweet id, separated by whitespace

    Returns:
    doc_tweet -- dict mapping each doc id (str) to the parsed tweet (dict)

    Raises:
    KeyError -- if a tweet id listed in path_docs_tweet has no tweet in path_tweets
    """
    id_tweet = {}
    # The encoding is explicit so that non-ASCII tweet text (emojis, accents) is
    # decoded the same way on every platform instead of using the locale default.
    with open(path_tweets, encoding='utf-8') as tp:
        for line in tp:  # stream the file instead of loading every line at once
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            tweet = json.loads(line)
            id_tweet[tweet['id']] = tweet

    doc_tweet = {}
    with open(path_docs_tweet, encoding='utf-8') as dp:
        for line in dp:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / trailing empty lines
            # The tweet id is read as text and must be converted to match the JSON integer id
            doc_tweet[fields[0]] = id_tweet[int(fields[1])]
    return doc_tweet

After loading the data, we have to extract the information from each tweet to show it in the format:

    Tweet | Username | Date | Hashtags | Likes | Retweets | Url
    
Where:
- by tweet we mean the full text,
- by username the screen_name of the user (which identifies the user),
- by date the creation date in the format 'day_name day_number month_name year' (e.g. Friday 30 September 2022),
- by hashtags a string with the hashtags found in the text, each prefixed with a double ## (e.g. ##HurricaneIan),
- by likes the number of likes,
- by retweets the number of retweets,
- by url the tweet link in the format 'https://twitter.com/username/status/tweet_id'.

In [65]:
def get_text(tweet):
    """
    Return the full text of a tweet.

    Argument:
    tweet -- dict of a tweet

    Returns:
    The value stored under 'full_text', or ' ' when that key is missing.
    """
    try:
        text = tweet['full_text']
    except KeyError:
        text = ' '
    return text

def get_username(tweet):
    """
    Return the screen name of the tweet's author.

    Argument:
    tweet -- dict of a tweet

    Returns:
    The value of tweet['user']['screen_name'], or ' ' when either key is missing.
    """
    try:
        user = tweet['user']
        screen_name = user['screen_name']
    except KeyError:
        screen_name = ' '
    return screen_name

def get_date(tweet):
    """
    Return the creation date of a tweet formatted like 'Friday 30 September 2022'.

    Argument:
    tweet -- dict of a tweet; 'created_at' is parsed with the pattern
             '%a %b %d %H:%M:%S %z %Y' (e.g. 'Fri Sep 30 18:39:08 +0000 2022')

    Returns:
    The formatted date, or ' ' when 'created_at' is missing or does not match
    the expected pattern.
    """
    try:
        # %H:%M:%S is spelled out instead of %X, whose layout depends on the active locale
        created_at = datetime.datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S %z %Y")
    except (KeyError, ValueError):
        # Missing or malformed date: use the same blank placeholder as the other getters
        return ' '
    return created_at.strftime('%A %d %B %Y')

def get_hashtags(tweet):
    """
    Return the hashtags of a tweet as one space-separated string.

    Argument:
    tweet -- dict of a tweet

    Returns:
    Every tweet['entities']['hashtags'][i]['text'] prefixed with '##' and joined
    by single spaces, or ' ' when any of those keys is missing.
    """
    try:
        tags = ['##' + entry['text'] for entry in tweet['entities']['hashtags']]
    except KeyError:
        return ' '
    return ' '.join(tags)

def get_likes(tweet):
    """
    Return the number of likes of a tweet as a string.

    Argument:
    tweet -- dict of a tweet

    Returns:
    str(tweet['favorite_count']), or ' ' when that key is missing.
    """
    try:
        likes = tweet['favorite_count']
    except KeyError:
        return ' '
    return str(likes)

def get_retweets(tweet):
    """
    Return the number of retweets of a tweet as a string.

    Argument:
    tweet -- dict of a tweet

    Returns:
    str(tweet['retweet_count']), or ' ' when that key is missing.
    """
    try:
        retweets = tweet['retweet_count']
    except KeyError:
        return ' '
    return str(retweets)

def get_url(tweet):
    """
    Build the public link of a tweet.

    Argument:
    tweet -- dict of a tweet

    Returns:
    'https://twitter.com/<screen_name>/status/<id>', or ' ' when the screen
    name or the id is missing from the tweet.
    """
    try:
        # The author part is assembled first, then the status part
        profile_url = 'https://twitter.com/' + tweet['user']['screen_name']
        return profile_url + '/status/' + str(tweet['id'])
    except KeyError:
        return ' '

In [66]:
def build_map(dict_docs_tweet):
    """
    Build the displayable line of every document.

    Argument:
    dict_docs_tweet -- dict mapping each doc id to its tweet (dict)

    Returns:
    doc_map -- dict mapping each doc id to the string
               'Tweet | Username | Date | Hashtags | Likes | Retweets | Url'
    """
    doc_map = {}
    for doc_id, tweet in dict_docs_tweet.items():
        # Fields are listed in the order in which they appear in the output line
        fields = (
            get_text(tweet),
            get_username(tweet),
            get_date(tweet),
            get_hashtags(tweet),
            get_likes(tweet),
            get_retweets(tweet),
            get_url(tweet),
        )
        doc_map[doc_id] = " | ".join(fields)
    return doc_map

## 3) See some results

I initialize the paths and then load the data using the function created before:

In [67]:
# Input files: the tweets and the table linking doc ids to tweet ids
TWEETS_PATH = 'data/tw_hurricane_data.json'
DOCS_PATH = 'data/tweet_document_ids_map.csv'

doc_to_tweet = load_data(path_tweets=TWEETS_PATH, path_docs_tweet=DOCS_PATH)
print("Total number of docs of tweets: {}".format(len(doc_to_tweet)))

Total number of docs of tweets: 4000


Next, I build the map with the original data that will be shown for each document, and print some of its items:

In [68]:
docs_map = build_map(doc_to_tweet)
# Take the first two doc ids with a single slice: the key list is built once
# (not on every iteration) and a map with fewer than two docs no longer raises IndexError.
for doc in list(docs_map)[:2]:
    print("Original {} text line:\n    {} \n".format(doc, docs_map[doc]))

Original doc_1 text line:
    So this will keep spinning over us until 7 pm…go away already. #HurricaneIan https://t.co/VROTxNS9rz | suzjdean | Friday 30 September 2022 | ##HurricaneIan | 0 | 0 | https://twitter.com/suzjdean/status/1575918182698979328 

Original doc_2 text line:
    Our hearts go out to all those affected by #HurricaneIan. We wish everyone on the roads currently braving the conditions safe travels. 💙 | lytx | Friday 30 September 2022 | ##HurricaneIan | 0 | 0 | https://twitter.com/lytx/status/1575918151862304768 



## 4) Preprocess the text
After creating the map, we have to create another map with all the processed text. To do that, I create another function that takes a string and transforms it by:
- Lowercasing all the text
- Removing punctuation marks
- Removing links
- Tokenizing the text to get a list of terms
- Eliminating the stopwords
- Performing stemming

I chose not to remove # and @ so that hashtags and user mentions stay distinguishable from ordinary words. The hashtags are still preprocessed like the rest of the text (e.g. lowercased).

In [69]:
# Stemmer and stop-word set shared by every preprocess() call; filled on first use.
_PREPROCESS_TOOLS = {}


def _get_preprocess_tools():
    """Return the (stemmer, stop_words) pair, building it on the first call only."""
    if not _PREPROCESS_TOOLS:
        # Load the stop words first so a failed load leaves the cache empty
        stop_words = frozenset(stopwords.words("english"))
        _PREPROCESS_TOOLS.update(stemmer=PorterStemmer(), stop_words=stop_words)
    return _PREPROCESS_TOOLS['stemmer'], _PREPROCESS_TOOLS['stop_words']


def preprocess(str_line):
    """
    Turn a raw text into a list of normalized tokens.

    Argument:
    str_line -- string (text) to be preprocessed

    Returns:
    A list of tokens corresponding to the input text after lowercasing,
    link and punctuation removal, stop-word filtering and stemming.
    Hashtags and mentions keep their leading '#' / '@'.
    """
    stemmer, stop_words = _get_preprocess_tools()
    text = str_line.lower()
    # Hashtags need no dedicated substitution: the punctuation filter below
    # spares '#' and '@', so '#tag' and '@user' survive as single tokens.
    # Links are removed wherever they start, including the very beginning of the text
    text = re.sub(r'(?:^|\s)http[^\s]+', ' ', text)
    text = re.sub(r'[^\w\s#@]+', ' ', text)  # Removing punctuation marks
    tokens = text.split()  # Tokenize the text to get a list of terms
    tokens = [term for term in tokens if term not in stop_words]  # Eliminate the stopwords
    return [stemmer.stem(term) for term in tokens]  # Perform stemming

Now I can build a map with the preprocessed text (I chose the tweet text, the username and the date as the searchable fields):

In [76]:
def build_prep_map(dict_docs_tweet):
    """
    Build the preprocessed token list of every document.

    Argument:
    dict_docs_tweet -- dict mapping each doc id to its tweet (dict)

    Returns:
    dict mapping each doc id to the tokens of the tweet text, followed by the
    tokens of the username and the tokens of the formatted date
    """
    return {
        doc_id: preprocess(get_text(tweet))
        + preprocess(get_username(tweet))
        + preprocess(get_date(tweet))
        for doc_id, tweet in dict_docs_tweet.items()
    }

In [77]:
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# is not affected by system clock adjustments as time.time() would be.
start_time = time.perf_counter()
prep_docs_map = build_prep_map(doc_to_tweet)
print("Total time to preprocess tweets: {} seconds" .format(np.round(time.perf_counter() - start_time, 2)))

Total time to preprocess tweets: 4.65 seconds


## 5) Final result of preprocessing
Now all the text is ready to be inserted in the index, but first we can look at some results:

In [72]:
# Take the first two doc ids with a single slice: the key list is built once
# (not on every iteration) and a map with fewer than two docs no longer raises IndexError.
for doc in list(prep_docs_map)[:2]:
    print("Preprocess {} text line:\n   {}\n".format(doc, prep_docs_map[doc]))

Preprocess doc_1 text line:
   ['keep', 'spin', 'us', '7', 'pm', 'go', 'away', 'alreadi', '#hurricaneian', 'suzjdean', 'friday', '30', 'septemb', '2022', '0', '0']

Preprocess doc_2 text line:
   ['heart', 'go', 'affect', '#hurricaneian', 'wish', 'everyon', 'road', 'current', 'brave', 'condit', 'safe', 'travel', 'lytx', 'friday', '30', 'septemb', '2022', '0', '0']

