# PreProcessing.ipynb

### This notebook pre-processes the Tweets for use in the SVM model.

Author: Erik Puijk <br>
Date  : February 17, 2022

In [8]:
import re
import json

In [37]:
def read_tweets(path):
    """Read the Tweets from a given text file and return the parsed JSON.

    Args:
        path: Path of a text file whose entire content is one JSON document.

    Returns:
        The object produced by parsing the file as JSON. If the file cannot
        be opened or read, an empty list is returned, so callers can always
        take ``len()`` of the result and iterate over it.

    Raises:
        json.JSONDecodeError: If the file is readable but is not valid JSON.
    """
    # Default to an empty list (not an empty string) so the failure path
    # yields the same kind of container a caller would iterate over.
    content = []

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")

    print("Total Tweets read: %s" % (len(content)))

    return content

In [38]:
# Adapted from https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
# The original list contained the range U+24C2-U+1F251, which spans nearly the
# whole Basic Multilingual Plane above U+24C2 and therefore also deleted CJK,
# Kana, Hangul and full-width text. It is narrowed here to the code points that
# range was meant to cover; the BMP emoji that only the broad range matched
# (U+303D, U+3297, U+3299) are listed explicitly.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"                 # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph congratulation
    "\u3299"                 # circled ideograph secret
    "]+",
    re.UNICODE,
)


def remove_emojis(tweet):
    """Return `tweet` with emoji and pictographic symbols removed.

    Args:
        tweet: The text to clean.

    Returns:
        A copy of `tweet` in which every run of characters matched by
        ``_EMOJI_PATTERN`` is deleted (replaced by the empty string). Text in
        BMP scripts such as CJK or Hangul is left untouched.
    """
    return _EMOJI_PATTERN.sub('', tweet)

In [69]:
def remove_special_chars(tweet):
    """Strip URLs from `tweet` and replace every non-word character by a space.

    Args:
        tweet: The text to clean.

    Returns:
        The cleaned text. URLs are deleted outright; every other character
        that is not matched by ``\\w`` (letters, digits, underscore) becomes a
        single space, so the result may contain runs of spaces.
    """
    # Remove URLs: match the scheme explicitly and consume up to the next
    # whitespace, so characters such as '-', '?', '=' and '%' inside a URL do
    # not leave fragments behind, and words merely starting with "http"
    # are not deleted.
    without_urls = re.sub(r'https?://\S+', '', tweet)
    return re.sub(r'[^\w]', ' ', without_urls)

In [70]:
def pre_process(tweets):
    """Clean the 'text' field of every Tweet and return the same collection.

    Each element of `tweets` is updated in place: its 'text' value is
    replaced by the result of `remove_special_chars` applied to it. The
    object that was passed in is returned.
    """
    for record in tweets:
        cleaned_text = remove_special_chars(record['text'])
        record['text'] = cleaned_text

    return tweets

In [71]:
# Load the raw Tweets and clean their text (pre_process updates the records in place).
tweets = read_tweets('source/tweets_all.txt')
tweets = pre_process(tweets)
# Show only a small preview; printing the full collection floods the cell
# output and bloats the saved notebook.
print(tweets[:3])

Total Tweets read: 4668
[{'public_metrics': {'retweet_count': 36, 'reply_count': 11, 'like_count': 232, 'quote_count': 4}, 'text': 'Zojuist zijn we de 4000 leden gepasseerd    Wil je ook lid worden van de misschien wel snelstgroeiende partij van dit moment    Ga dan snel naar        ToekomstMadeInEurope', 'author_id': '966790780869316608', 'id': '1367221739332268037', 'conversation_id': '1367221739332268037', 'created_at': '2021-03-03T21:13:59.000Z', 'author_handle': 'VoltNederland', 'total_engagement': 0.019810990549527476, 'category': ''}, {'public_metrics': {'retweet_count': 3, 'reply_count': 2, 'like_count': 4, 'quote_count': 0}, 'author_id': '1116624945000280072', 'created_at': '2021-02-27T19:43:24.000Z', 'text': 'Is  BBB nou links of rechts  is de meest gestelde vraag van mensen over  BoerBurgerB op Google   lientje1967 geeft het antwoord in Goedemorgen Nederland  WNLVandaag   ', 'id': '1365749393329111040', 'conversation_id': '1365749393329111040', 'author_handle': 'BoerBurgerB'