# Information Retrieval and Web Analytics Project
## Text Processing

#### Packages

We first import all the packages that we need for text processing, such as:
- Demoji
- Re
- Deep translator

In [1]:
import time
import string
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import json
import nltk
import demoji
import re
nltk.download('stopwords');
from deep_translator import GoogleTranslator

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martinacarres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load data 

**Read the file with the tweets:**

In [2]:
docs_path = 'dataset_tweets_WHO.txt'

# Read the JSON file in a unique string
with open(docs_path) as fp:
    corpus = fp.readlines()[0]

# Load the JSON file as a dictionary
corpus = json.loads(corpus)

In [3]:
print("There are \033[1m%i tweets\033[0m in the dataset" %len(corpus))

There are [1m2399 tweets[0m in the dataset


### Functions

In [4]:
def italics_to_plaintext(text):
    #difference between an italic lowecase character and its corresponding plaintext lowercase character
    diff_lower = ord('𝘢') - ord('a')
    #difference between an italic uppercase character and its corresponding plaintext uppercase character
    diff_upper = ord('𝘈') - ord('A')
    
    plaintext = ""
    for c in text:
        # if the character is italic lowercase, get the corresponding plaintext lowercase character
        if ord(c) >= ord('𝘢') and ord(c) <= ord('𝘻'):
            plaintext += chr(ord(c) - diff_lower)
        # else if the character is italic uppercase, get the corresponding plaintext uppercase character
        elif ord(c) >= ord('𝘈') and  ord(c) <= ord('𝘡'):
            plaintext += chr(ord(c) - diff_upper)
        else:
            plaintext += c
    
    return plaintext

def bold_to_plaintext(text):
    #difference between a bold lowecase character and its corresponding plaintext lowercase character
    diff_lower = ord('𝐚') - ord('a')
    #difference between a bold uppercase character and its corresponding plaintext uppercase character
    diff_upper = ord('𝐀') - ord('A')
    
    plaintext = ""
    for c in text:
        # if the character is bold lowercase, get the corresponding plaintext lowercase character
        if ord(c) >= ord('𝐚') and ord(c) <= ord('𝐳'):
            plaintext += chr(ord(c) - diff_lower)
        # else if the character is bold uppercase, get the corresponding plaintext uppercase character
        elif ord(c) >= ord('𝐀') and  ord(c) <= ord('𝐙'):
            plaintext += chr(ord(c) - diff_upper)
        else:
            plaintext += c
    
    return plaintext

def getTerms(text, stemming, stops):
    # Text to lowercase
    text = text.lower()
    # Text delete italic letter type if needed
    text = italics_to_plaintext(text)
    # Text delete bold letter type if needed
    text = bold_to_plaintext(text)
    # Delete all urls
    text = re.sub(r'http\S+', ' ', text) 
    # Delete all non-alphanumerical characters (it includes emojis) except '#' and '@'
    text = re.sub(r'[^A-Za-z0-9#@]+', ' ', text)
    # Text tokenization
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stops]
    # Get the stem of each word
    words = [stemming.stem(word) for word in words]
    
    return words

### Process tweets

In [5]:
stemming = PorterStemmer()
# Delete also "amp" (&) and "rt"
stops = set(stopwords.words("english")).union(set({'amp', 'rt'}))

# Dictionary where we'll save all the processed tweets
data = {}
for tweet in corpus:
    
    #In case that the tweet is not in english, we traduce it
    lang = corpus[tweet]['lang']
    if lang != 'en':
        text_tweet = GoogleTranslator(target='en').translate(corpus[tweet]['full_text'])
    else:
        text_tweet = corpus[tweet]['full_text']
    
    # Get the text tokenized and cleaned 
    text_tweet_processed = getTerms(text_tweet, stemming, stops)
    
    if text_tweet_processed != []: #In case that the text is not null                       
        data[tweet] = {}
        data[tweet]['text'] = text_tweet_processed

        # Save all emojis used with its meaning
        data[tweet]['emojis'] = demoji.findall(corpus[tweet]['full_text'])

        # Save creation data
        data[tweet]['date'] = corpus[tweet]['created_at']

        # Save the number of retweets of this tweet
        data[tweet]['retweets'] = corpus[tweet]['retweet_count']

        # Save the number of 'favorites' of this tweet
        data[tweet]['favorites'] = corpus[tweet]['favorite_count']

        # Save the full name of all the users mentioned
        data[tweet]['user_mentions'] = list()
        ## List of dictionaries, each with information of a user mentioned
        users_data = corpus[tweet]['entities']['user_mentions']
        for user in users_data:
            data[tweet]['user_mentions'].append(user['name'])

### Examples

**Spanish Tweet with emojis:**

In [6]:
# Before processing
features = ['full_text', 'created_at', 'retweet_count', 'favorite_count', 'entities']
for feature in features:
    print(feature)
    print(corpus['4'][feature])
    print('-'*50)

full_text
RT @opsoms: Si está completamente vacunado 💉💉, ¿aún puede contraer COVID-19? 

🚨 No importa si está vacunado o si todavía está esperando, s…
--------------------------------------------------
created_at
Wed Oct 13 05:47:10 +0000 2021
--------------------------------------------------
retweet_count
43
--------------------------------------------------
favorite_count
0
--------------------------------------------------
entities
{'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'opsoms', 'name': 'OPS/OMS', 'id': 22276965, 'id_str': '22276965', 'indices': [3, 10]}], 'urls': []}
--------------------------------------------------


In [7]:
# After processing
data['4']

{'text': ['@opsom',
  'fulli',
  'vaccin',
  'still',
  'get',
  'covid',
  '19',
  'matter',
  'vaccin',
  'still',
  'wait',
  'ye'],
 'emojis': {'💉': 'syringe', '🚨': 'police car light'},
 'date': 'Wed Oct 13 05:47:10 +0000 2021',
 'retweets': 43,
 'favorites': 0,
 'user_mentions': ['OPS/OMS']}

**Example where saving the full name of the user can be useful:**

In [8]:
# Before processing
features = ['full_text', 'entities']
for feature in features:
    print(feature)
    print(corpus['15'][feature])
    print('-'*50)

full_text
RT @DrTedros: Was good to meet @Chikwe_I, who will soon head our work on health emergency surveillance, incl the @WHO Hub for Pandemic &amp; Ep…
--------------------------------------------------
entities
{'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'DrTedros', 'name': 'Tedros Adhanom Ghebreyesus', 'id': 189868631, 'id_str': '189868631', 'indices': [3, 12]}, {'screen_name': 'Chikwe_I', 'name': 'Chikwe Ihekweazu', 'id': 3387606493, 'id_str': '3387606493', 'indices': [31, 40]}, {'screen_name': 'WHO', 'name': 'World Health Organization (WHO)', 'id': 14499829, 'id_str': '14499829', 'indices': [113, 117]}], 'urls': []}
--------------------------------------------------


In [9]:
# After processing
data['15']

{'text': ['@drtedro',
  'good',
  'meet',
  '@chikw',
  'soon',
  'head',
  'work',
  'health',
  'emerg',
  'surveil',
  'incl',
  '@who',
  'hub',
  'pandem',
  'ep'],
 'emojis': {},
 'date': 'Tue Oct 12 16:27:05 +0000 2021',
 'retweets': 84,
 'favorites': 0,
 'user_mentions': ['Tedros Adhanom Ghebreyesus',
  'Chikwe Ihekweazu',
  'World Health Organization (WHO)']}

**Example of tweet in italics:**

In [10]:
# Before processing
print(corpus['90']['full_text'])

𝘐𝘵'𝘴 𝘖𝘒 𝘵𝘰 𝘢𝘴𝘬 𝘴𝘰𝘮𝘦𝘰𝘯𝘦 𝘪𝘧 𝘵𝘩𝘦𝘺 𝘢𝘳𝘦 𝘵𝘩𝘪𝘯𝘬𝘪𝘯𝘨 𝘢𝘣𝘰𝘶𝘵 #𝘴𝘶𝘪𝘤𝘪𝘥𝘦 - 𝘪𝘵 𝘰𝘧𝘵𝘦𝘯 𝘳𝘦𝘥𝘶𝘤𝘦𝘴 𝘢𝘯𝘹𝘪𝘦𝘵𝘺 𝘢𝘯𝘥 𝘩𝘦𝘭𝘱𝘴 𝘱𝘦𝘰𝘱𝘭𝘦 𝘧𝘦𝘦𝘭 𝘶𝘯𝘥𝘦𝘳𝘴𝘵𝘰𝘰𝘥.
 
#WorldMentalHealthDay #LetsTalk https://t.co/KO4YKmGxGL


In [11]:
# After processing
data['90']

{'text': ['it',
  'ok',
  'ask',
  'someon',
  'think',
  '#suicid',
  'often',
  'reduc',
  'anxieti',
  'help',
  'peopl',
  'feel',
  'understood',
  '#worldmentalhealthday',
  '#letstalk'],
 'emojis': {},
 'date': 'Sun Oct 10 11:34:03 +0000 2021',
 'retweets': 127,
 'favorites': 286,
 'user_mentions': []}

**Example of tweet in bold:**

In [12]:
# Before processing
print(corpus['187']['full_text'])

@DrTedros @antonioguterres "𝐖𝐞 𝐡𝐚𝐯𝐞 𝐭𝐡𝐞 𝐭𝐨𝐨𝐥𝐬 𝐭𝐨 𝐛𝐫𝐢𝐧𝐠 𝐭𝐡𝐞 #𝐂𝐎𝐕𝐈𝐃𝟏𝟗 𝐩𝐚𝐧𝐝𝐞𝐦𝐢𝐜 𝐮𝐧𝐝𝐞𝐫 𝐜𝐨𝐧𝐭𝐫𝐨𝐥, 𝐢𝐟 𝐰𝐞 𝐮𝐬𝐞 𝐭𝐡𝐞𝐦 𝐩𝐫𝐨𝐩𝐞𝐫𝐥𝐲 𝐚𝐧𝐝 𝐬𝐡𝐚𝐫𝐞 𝐭𝐡𝐞𝐦 𝐟𝐚𝐢𝐫𝐥𝐲"-@DrTedros #VaccinEquity


In [13]:
# After processing
data['187']

{'text': ['@drtedro',
  '@antonioguterr',
  'we',
  'tool',
  'bring',
  '#covid',
  'pandem',
  'control',
  'use',
  'properli',
  'share',
  'fairli',
  '@drtedro',
  '#vaccinequ'],
 'emojis': {},
 'date': 'Thu Oct 07 13:53:01 +0000 2021',
 'retweets': 51,
 'favorites': 209,
 'user_mentions': ['Tedros Adhanom Ghebreyesus',
  'António Guterres',
  'Tedros Adhanom Ghebreyesus']}