# PreProcessing.ipynb

### This notebook pre-processes the Tweets for use in the SVM model.

Author: Erik Puijk <br>
Date  : February 17, 2022

In [282]:
import re
import json
import pyperclip as pc

In [283]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return in JSON-format. """
    
    content = ""
    
    try:
        with open(path, 'r') as f:
            content = json.loads(f.read())
    except IOError:
        print("I/O error")
        
    print("Total Tweets read: %s" % (len(content)))

    return content

In [284]:
# Obtained from https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def remove_emojis(tweet):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', tweet)

In [297]:
# NICE: https://www.w3schools.com/python/python_regex.asp
def remove_special_chars(tweet):
    # To lower case
    tweet = tweet.lower()
    
    # Remove URLs - VERIFIED
    tweet = re.sub('http[a-z0-9_:/.]+','', tweet)
    
    # Replace @mentions with 'NAME' - VERIFIED
    tweet = re.sub(r'@([a-z0-9_]+)', ' NAME', tweet)
    
    # Replace times with 'TIME' - VERIFIED
    tweet = re.sub(r'(\b[\d]{1,2} tot [\d]{1,2}\b)', 'TIME tot TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2}[:\.][\d]{1,2} (uur|u)\b)', 'TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2}[:\.][\d]{1,2}\b)', 'TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2} (uur|u)\b)', 'TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2}(uur|u)\b)', 'TIME', tweet)
    
    # Replace days with DAY - VERIFIED
    tweet = re.sub(r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)+(ochtend|morgen|middag|avond)*\b', 'DAY', tweet)
    
    # Replace dates with DATE - VERIFIED
    tweet = re.sub(r'(\b[\d]{1,2}\s(januari|februari|maart|april|juni|juli|augustus|september|oktober|november|december))', 'DATE', tweet)
    tweet = re.sub(r'(\b[\d]{1,2}\s(jan|feb|mrt|apr|mei|jun|jul|aug|sept|okt|nov|dec))', 'DATE', tweet)
    
    # Replace numbers with 'NUMBER' - TO BE VERIFIED
    #tweet = re.sub(r'$\d+\W+|\b\d+\b|\W+\d+$', 'NUMBER', tweet)
    # Still contains #NUMBER (with hashtag in front)
    tweet = re.sub(r'\b[\d]+\b', 'NUMBER', tweet)
    
    # Remove lt, amp, etc!
    
    
    return re.sub(r'[^\w#]', ' ', tweet)

In [298]:
#http://blog.refine-it.nl/begrijpend-lezen-met-python-en-nlp/
def remove_stop_words(tweet):
    
    return tweet

In [299]:
def pre_process(tweets):
    
    for tweet in tweets:
        tweet['text'] = remove_special_chars(tweet['text'])
        #tweet['text'] = remove_emojis(tweet['text'])
        #tweet['text'] = remove_stop_words(tweet['text'])

    return tweets

In [300]:
tweets = read_tweets('source/tweets_all.txt')
tweets = pre_process(tweets)

str_tweets = ""

for tweet in tweets:
    #print(tweet['text'] + '\n')
    str_tweets = str_tweets + tweet['text'] + '\n'

tokenized = str_tweets.split()
print(tokenized.count("NUMBER"))

#print(str_tweets)
pc.copy(str_tweets)

Total Tweets read: 4664
1449
