# PreProcessing.ipynb

### This notebook pre-processes the Tweets for use in the SVM model.

Author: Erik Puijk <br>
Date  : February 17, 2022

In [10]:
""" Install required packages. """
!pip install unidecode
!pip install nltk



In [11]:
import re
import json
import unidecode
import nltk
from nltk.corpus import stopwords
import pyperclip as pc

In [12]:
""" Download and load stop words list. """
nltk.download('stopwords')

# Words referring to an individual or a group carry signal for the model,
# so they are excluded from the stop word list (i.e. they stay in the Tweets).
exceptions = [
    'mij', 'mijn', 'ik', 'jij', 'je', 'u', 'uw', 'ons', 'onze',
    'hij', 'hem', 'zij', 'haar', 'we', 'wij', 'me',
]

# Dutch NLTK stop words, minus the exceptions above (original order preserved).
stop_words = [candidate for candidate in stopwords.words('dutch') if candidate not in exceptions]

[nltk_data] Downloading package stopwords to /home/erik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return them as parsed JSON.

    Parameters:
        path: location of a UTF-8 encoded file containing a single JSON document.

    Returns:
        The decoded JSON content. If the file cannot be opened or read, the
        error is reported on stdout and an empty list is returned, so callers
        that iterate over the result simply process nothing.

    Raises:
        json.JSONDecodeError: the file was read but does not contain valid JSON.
        UnicodeDecodeError: the file is not valid UTF-8.
    """

    # Default to an empty list (not a string) so the return type stays
    # consistent with the successful case when reading fails.
    content = []

    try:
        # Explicit encoding: do not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError as err:
        # Best-effort: report what went wrong (the message includes the path).
        print("I/O error: %s" % err)

    print("Total Tweets read: %s" % (len(content)))

    return content

In [14]:
def regex_processing(tweet):
    """ Process a Tweet by removing or replacing certain elements using regular expressions.

    The text is lower-cased first; the upper-case placeholders inserted
    afterwards (NAME, TIME, DAY, DATE, NUMBER) therefore cannot collide with
    ordinary words of the Tweet.

    Parameters:
        tweet: the raw Tweet text (str).

    Returns:
        The processed text, in which every character that is not a word
        character or '#' has been replaced by a single space. Runs of spaces
        are not collapsed here.
    """

    # To lower case
    tweet = tweet.lower()

    # Remove URLs. Match up to the next whitespace so that characters such as
    # '-', '?', '=', '%' and '&' do not leave URL fragments behind as tokens.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # Replace @mentions with 'NAME'
    tweet = re.sub(r'@([a-z0-9_]+)', ' NAME', tweet)

    # Replace times with 'TIME' (most specific patterns first)
    tweet = re.sub(r'(\b[\d]{1,2} tot [\d]{1,2}\b)', 'TIME tot TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2}[:\.][\d]{1,2} (uur|u)\b)', 'TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2}[:\.][\d]{1,2}\b)', 'TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2} (uur|u)\b)', 'TIME', tweet)
    tweet = re.sub(r'(\b[\d]{1,2}(uur|u)\b)', 'TIME', tweet)

    # Replace days with 'DAY'
    tweet = re.sub(r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)+(ochtend|morgen|middag|avond)*\b', 'DAY', tweet)

    # Replace dates with 'DATE'. Full month names are listed before their
    # abbreviations, and the trailing \b makes sure a month is only matched as
    # a whole word (otherwise '5 meisjes' would turn into 'DATEsjes').
    months = (r'januari|februari|maart|april|mei|juni|juli|augustus|september|'
              r'oktober|november|december|'
              r'jan|feb|mrt|apr|jun|jul|aug|sept|sep|okt|nov|dec')
    tweet = re.sub(r'\b[\d]{1,2}\s(?:' + months + r')\b', 'DATE', tweet)

    # Replace numbers with 'NUMBER'
    tweet = re.sub(r'\b[\d]+\b', 'NUMBER', tweet)

    # Remove HTML codes (a trailing ';' is turned into a space by the final step)
    tweet = re.sub(r'&gt', '', tweet)
    tweet = re.sub(r'&lt', '', tweet)
    tweet = re.sub(r'&amp', '', tweet)

    # Replace contractions?
    # Remove hashtags?

    # Return only word characters and hashtags (#); everything else becomes a space
    return re.sub(r'[^\w#]', ' ', tweet)

In [15]:
def remove_stop_words(tweet):
    """ Drop every whitespace-separated token of a Tweet that occurs in the
    module-level stop_words list and return the remaining tokens joined by
    single spaces. """

    kept_tokens = []
    for token in tweet.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [16]:
def pre_process(tweets):
    """ Pre-process the list of Tweets by performing several separate manipulations.

    For every Tweet the 'text' value is, in order: cleaned with
    regex_processing, stripped of one-letter words, transliterated to ASCII
    with unidecode, and stripped of stop words.

    Parameters:
        tweets: iterable of dicts that each have a 'text' key.

    Returns:
        The same `tweets` object; the dicts are modified in place.
    """

    for tweet in tweets:
        text = regex_processing(tweet['text'])

        # Remove words with one letter, but keep one-letter words that are
        # explicitly listed in `exceptions` (e.g. the pronoun 'u'); otherwise
        # they would be dropped here despite being exempt from stop word removal.
        text = ' '.join([w for w in text.split() if len(w) > 1 or w in exceptions])

        # Replace accents
        text = unidecode.unidecode(text)

        tweet['text'] = remove_stop_words(text)

    return tweets

In [17]:
def write_tweets(tweets_w, path):
    """ Write obtained Tweets to a text file in JSON-format.

    Parameters:
        tweets_w: JSON-serialisable object to store (here: the list of Tweets).
        path: destination file; an existing file is overwritten.

    Returns:
        None. If the file cannot be opened or written, the error is reported
        on stdout and the function returns normally (best-effort write).
    """

    try:
        # Explicit encoding: do not depend on the platform default.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(tweets_w, f)
    except IOError as err:
        # Include the reason and the path so a failed write can be diagnosed.
        print("I/O error: %s" % err)

In [18]:
# Input and output locations of the pre-processing run.
RAW_TWEETS_PATH = 'source/tweets_all.txt'
PREPROCESSED_TWEETS_PATH = 'source/tweets_all_preprocessed_exc_stopwords.txt'

# Load the raw Tweets, pre-process them and store the result.
tweets = pre_process(read_tweets(RAW_TWEETS_PATH))
write_tweets(tweets, PREPROCESSED_TWEETS_PATH)

Total Tweets read: 4664
