## Import libraries

In [1]:
import json
import os
import random
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, plot_confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
import warnings

# Install a blanket "ignore" filter: every warning raised after this cell runs
# is suppressed, whatever its category or originating module.
warnings.filterwarnings(action='ignore')

In [3]:
def print_msg_box(msg, indent=1, width=None, title=None):
    """Print a message inside a box drawn with Unicode characters.

    Args:
        msg: Text to display. It is split on newlines and each line becomes
            one row of the box.
        indent: Number of spaces of padding between the border and the text
            on each side.
        width: Width of the text area. When falsy, it is derived from the
            widest message line and, if present, the title.
        title: Optional heading printed above the message, followed by a
            dashed underline of the same length.
    """
    lines = msg.split('\n')
    space = " " * indent
    if not width:
        # Measure the title as well: a title wider than every message line
        # would otherwise push the right border out of alignment.
        measured = lines + [title] if title else lines
        width = max(map(len, measured))
    box = f'╔{"═" * (width + indent * 2)}╗\n'  # upper_border
    if title:
        box += f'║{space}{title:<{width}}{space}║\n'  # title
        box += f'║{space}{"-" * len(title):<{width}}{space}║\n'  # underscore
    box += ''.join([f'║{space}{line:<{width}}{space}║\n' for line in lines])
    box += f'╚{"═" * (width + indent * 2)}╝'  # lower_border
    print(box)

## Set random seeds

In [4]:
def set_random_seeds(seed):
    """Seed NumPy's global random generator and Python's ``random`` module.

    Args:
        seed: Value passed unchanged to both seeding functions.
    """
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

## Constants

In [5]:
# Locations of the input CSV files, relative to the notebook's working directory.
TRAINDATA_PATH = 'data/train_tweets_removed.csv'
TESTDATA_PATH = 'data/testdata.manual.2009.06.14_new.csv'

# Column names assigned to the header-less test CSV, in file order.
BASIC_COLUMN_HEADERS = [
    'polarity',
    'tweet_id',
    'date',
    'query',
    'user',
    'content',
]

## Defining functions for cleaning text

In [6]:
# Load the contraction lookup table used by preprocessing_tokens (token -> replacement).
# The context manager closes the file handle as soon as parsing finishes, and the
# explicit encoding keeps the result independent of the platform default.
with open('contractions.json', encoding='utf-8') as f:
    contractions = json.load(f)

In [7]:
def transform_repeated_letters(word: str):
    """Shorten every run of three or more identical characters to exactly two.

    Runs of one or two characters are left untouched, e.g. ``"coooool"``
    becomes ``"cool"`` while ``"hello"`` stays ``"hello"``.

    Args:
        word: The string to normalise.

    Returns:
        The string with over-long character runs trimmed to length two.
    """
    kept = []
    for char in word:
        # Two copies of this character are already at the tail: drop the surplus one.
        if len(kept) >= 2 and kept[-1] == char and kept[-2] == char:
            continue
        kept.append(char)
    return "".join(kept)

In [8]:
def preprocess_text(text: str, remove_usernames: bool, remove_urls: bool):
    """Lowercase a raw text and strip it down to word characters and whitespace.

    The steps run in a fixed order: lowercasing, optional replacement of
    ``@mentions`` by the placeholder ``USERNAME``, optional replacement of
    URLs by the placeholder ``URL``, and finally removal of every character
    that is neither a word character nor whitespace.

    Args:
        text: The raw text.
        remove_usernames: Replace ``@`` followed by non-whitespace with ``USERNAME``.
        remove_urls: Replace URLs with ``URL``.

    Returns:
        The normalised text.
    """
    # URL pattern taken from https://www.geeksforgeeks.org/python-check-url-string/
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

    # (enabled, pattern, replacement) applied top to bottom; the placeholders are
    # inserted before punctuation is stripped so they survive the last step.
    steps = (
        (remove_usernames, r'@[^\s]+', 'USERNAME'),
        (remove_urls, url_pattern, 'URL'),
        (True, r'[^\w\s]', ''),
    )

    result = text.lower()
    for enabled, pattern, replacement in steps:
        if enabled:
            result = re.sub(pattern, replacement, result)
    return result

In [9]:
def preprocessing_tokens(token_array, remove_repeated_letters: bool, translate_contractions: bool):
    """Normalise each token of ``token_array`` in place and return the same list.

    Args:
        token_array: Mutable sequence of tokens; its entries are overwritten.
        remove_repeated_letters: Pass each token through
            ``transform_repeated_letters``.
        translate_contractions: Replace tokens that are keys of the
            module-level ``contractions`` mapping by the mapped value.

    Returns:
        ``token_array`` itself, after modification.
    """
    for position in range(len(token_array)):
        token = token_array[position]
        if translate_contractions:
            # Tokens without an entry fall through unchanged.
            token = contractions.get(token, token)
        if remove_repeated_letters:
            # Runs after the lookup, so it also acts on a substituted value.
            token = transform_repeated_letters(token)
        token_array[position] = token
    return token_array

In [10]:
# Shared detokenizer instance, created once and reused for every text.
detokenizer = TreebankWordDetokenizer()

def clean_text(text: str, remove_usernames: bool, remove_urls: bool, remove_repeated_letters: bool, translate_contractions: bool):
    """Run the full cleaning pipeline on one text and return it as a single string.

    The text is normalised with ``preprocess_text``, tokenised with
    ``nltk.word_tokenize``, processed per token by ``preprocessing_tokens``
    and finally joined back together, because the vectorizers used later
    only work with full strings.

    Args:
        text: The raw text.
        remove_usernames: Forwarded to ``preprocess_text``.
        remove_urls: Forwarded to ``preprocess_text``.
        remove_repeated_letters: Forwarded to ``preprocessing_tokens``.
        translate_contractions: Forwarded to ``preprocessing_tokens``.

    Returns:
        The cleaned, detokenised text.
    """
    tokens = preprocessing_tokens(
        nltk.word_tokenize(preprocess_text(text, remove_usernames, remove_urls)),
        remove_repeated_letters,
        translate_contractions,
    )
    return detokenizer.detokenize(tokens)

## Load test data

In [11]:
def load_test_data():
    """Read the test CSV and return its rows without the neutral class.

    The file at ``TESTDATA_PATH`` is read as a semicolon-delimited CSV
    without a header row, and its columns are named after
    ``BASIC_COLUMN_HEADERS``. Rows whose ``polarity`` equals 2 (neutral)
    are dropped; the original row index is kept.

    Returns:
        An independent DataFrame holding the remaining rows.
    """
    testdata = pd.read_csv(TESTDATA_PATH, delimiter=';', header=None)
    testdata.columns = BASIC_COLUMN_HEADERS
    #delete all with polarity 2 (neutral)
    # Return an explicit copy: callers assign to columns of the result, which
    # on a filtered slice triggers pandas' SettingWithCopyWarning.
    return testdata[testdata['polarity'] != 2].copy()

## Main function

In [12]:
def preprocess(remove_usernames: bool, remove_urls: bool, remove_repeated_letters: bool, translate_contractions: bool, remove_stop_words: bool, preprocessing_row: int):
    """Clean the train and test sets with the chosen options and save both as CSV.

    The chosen parameters are printed in a message box. The train CSV at
    ``TRAINDATA_PATH`` and the test set from ``load_test_data`` then have
    their ``content`` column cleaned with ``clean_text``, and the results are
    written to ``data/csv_rows/train_row_<n>.csv`` and
    ``data/csv_rows/test_row_<n>.csv``.

    Args:
        remove_usernames: Forwarded to ``clean_text``.
        remove_urls: Forwarded to ``clean_text``.
        remove_repeated_letters: Forwarded to ``clean_text``.
        translate_contractions: Forwarded to ``clean_text``.
        remove_stop_words: Only reported in the printed messages; no stop
            words are removed by this function.
        preprocessing_row: Number ``<n>`` used in the output file names.
    """
    msg = "Removing Usernames: %s\n" \
        "Removing URL's: %s\n" \
        "Removing repeated letters: %s \n" \
        "Tranlate contractions: %s \n" \
        "Removing stop words: %s\n" \
        "Resulting preprocessing row: %d\n" % (remove_usernames, remove_urls, remove_repeated_letters,
                                               translate_contractions, remove_stop_words, preprocessing_row)

    print_msg_box(msg=msg, indent=2, title='Chosen parameters:')

    train_csv_path = "data/csv_rows/train_row_%d.csv" % preprocessing_row
    test_csv_path = "data/csv_rows/test_row_%d.csv" % preprocessing_row
    # Create the output folder before any work is done, so a missing directory
    # cannot make to_csv fail after the whole cleaning pass has already run.
    os.makedirs(os.path.dirname(train_csv_path), exist_ok=True)

    def clean(text):
        # One shared cleaner guarantees train and test get identical options.
        return clean_text(
            text, remove_usernames=remove_usernames, remove_urls=remove_urls,
            remove_repeated_letters=remove_repeated_letters, translate_contractions=translate_contractions)

    #load train data
    print("\nLoad train data from %s" % TRAINDATA_PATH)
    train_data = pd.read_csv(TRAINDATA_PATH)
    # Cast to str so non-string cells (e.g. missing values) can be cleaned too.
    train_data['content'] = train_data['content'].values.astype('str')

    #apply cleaning to train data
    print("Preprocess train data...")
    train_data['content'] = train_data['content'].apply(clean)

    #save preprocessed data to csv
    train_data.to_csv(train_csv_path)
    print("Save preprocessed train data to: %s" % train_csv_path)

    #print out data head
    sw_message_str = "(stopwords will be removed later)" if remove_stop_words else ""
    print('Train dataset after cleaning %s:' % sw_message_str)
    print(train_data.loc[:10, 'content'])

    #load test data
    print("\nLoad test data from %s" % TESTDATA_PATH)
    test_data = load_test_data()
    #apply cleaning to testdata
    print("Preprocess test data...")
    test_data['content'] = test_data['content'].apply(clean)

    #save preprocessed data to csv
    print("Save preprocessed test data to: %s" % test_csv_path)
    test_data.to_csv(test_csv_path)

## Row 1: No preprocessing

In [13]:
preprocess(remove_usernames=False, remove_urls=False, remove_repeated_letters=False,
                   translate_contractions=False, remove_stop_words=False, preprocessing_row=1)

╔═════════════════════════════════════╗
║  Chosen parameters:                 ║
║  ------------------                 ║
║  Removing Usernames: False          ║
║  Removing URL's: False              ║
║  Removing repeated letters: False   ║
║  Tranlate contractions: False       ║
║  Removing stop words: False         ║
║  Resulting preprocessing row: 1     ║
║                                     ║
╚═════════════════════════════════════╝

Load train data from data/train_tweets_removed.csv
Preprocess train data...
Save preprocessed train data to: data/csv_rows/train_row_1.csv
Train dataset after cleaning :
0     switchfoot httptwitpiccom2y1zl awww thats a bu...
1     is upset that he cant update his facebook by t...
2     kenichan i dived many times for the ball manag...
3        my whole body feels itchy and like its on fire
4     nationwideclass no its not behaving at all im ...
5                           kwesidei not the whole crew
6                                            need a h

## Row 2: usernames (u), URLs (l), repeated letters (rl)

In [14]:
preprocess(remove_usernames=True, remove_urls=True, remove_repeated_letters=True,
                   translate_contractions=False, remove_stop_words=False, preprocessing_row=2)

╔════════════════════════════════════╗
║  Chosen parameters:                ║
║  ------------------                ║
║  Removing Usernames: True          ║
║  Removing URL's: True              ║
║  Removing repeated letters: True   ║
║  Tranlate contractions: False      ║
║  Removing stop words: False        ║
║  Resulting preprocessing row: 2    ║
║                                    ║
╚════════════════════════════════════╝

Load train data from data/train_tweets_removed.csv
Preprocess train data...
Save preprocessed train data to: data/csv_rows/train_row_2.csv
Train dataset after cleaning :
0     USERNAME URL aww thats a bummer you shoulda go...
1     is upset that he cant update his facebook by t...
2     USERNAME i dived many times for the ball manag...
3        my whole body feels itchy and like its on fire
4     USERNAME no its not behaving at all im mad why...
5                           USERNAME not the whole crew
6                                            need a hug
7     US

## Row 3: usernames (u), URLs (l), repeated letters (rl), stop words (sw)

In [15]:
preprocess(remove_usernames=True, remove_urls=True, remove_repeated_letters=True,
                   translate_contractions=False, remove_stop_words=True, preprocessing_row=3)

╔════════════════════════════════════╗
║  Chosen parameters:                ║
║  ------------------                ║
║  Removing Usernames: True          ║
║  Removing URL's: True              ║
║  Removing repeated letters: True   ║
║  Tranlate contractions: False      ║
║  Removing stop words: True         ║
║  Resulting preprocessing row: 3    ║
║                                    ║
╚════════════════════════════════════╝

Load train data from data/train_tweets_removed.csv
Preprocess train data...
Save preprocessed train data to: data/csv_rows/train_row_3.csv
Train dataset after cleaning (stopwords will be removed later):
0     USERNAME URL aww thats a bummer you shoulda go...
1     is upset that he cant update his facebook by t...
2     USERNAME i dived many times for the ball manag...
3        my whole body feels itchy and like its on fire
4     USERNAME no its not behaving at all im mad why...
5                           USERNAME not the whole crew
6                              

## Row 4: usernames (u), URLs (l), repeated letters (rl), contractions (abk)

In [16]:
preprocess(remove_usernames=True, remove_urls=True, remove_repeated_letters=True,
                   translate_contractions=True, remove_stop_words=False, preprocessing_row=4)

╔════════════════════════════════════╗
║  Chosen parameters:                ║
║  ------------------                ║
║  Removing Usernames: True          ║
║  Removing URL's: True              ║
║  Removing repeated letters: True   ║
║  Tranlate contractions: True       ║
║  Removing stop words: False        ║
║  Resulting preprocessing row: 4    ║
║                                    ║
╚════════════════════════════════════╝

Load train data from data/train_tweets_removed.csv
Preprocess train data...
Save preprocessed train data to: data/csv_rows/train_row_4.csv
Train dataset after cleaning :
0     USERNAME URL aww that is a bummer you shoulda ...
1     is upset that he cannot update his facebook by...
2     USERNAME i dived many times for the ball manag...
3      my whole body feels itchy and like it is on fire
4     USERNAME no it is not behaving at all i am mad...
5                           USERNAME not the whole crew
6                                            need a hug
7     US

## Row 5: usernames (u), URLs (l), repeated letters (rl), stop words (sw), contractions (abk)

In [17]:
preprocess(remove_usernames=True, remove_urls=True, remove_repeated_letters=True,
                   translate_contractions=True, remove_stop_words=True, preprocessing_row = 5)

╔════════════════════════════════════╗
║  Chosen parameters:                ║
║  ------------------                ║
║  Removing Usernames: True          ║
║  Removing URL's: True              ║
║  Removing repeated letters: True   ║
║  Tranlate contractions: True       ║
║  Removing stop words: True         ║
║  Resulting preprocessing row: 5    ║
║                                    ║
╚════════════════════════════════════╝

Load train data from data/train_tweets_removed.csv
Preprocess train data...
Save preprocessed train data to: data/csv_rows/train_row_5.csv
Train dataset after cleaning (stopwords will be removed later):
0     USERNAME URL aww that is a bummer you shoulda ...
1     is upset that he cannot update his facebook by...
2     USERNAME i dived many times for the ball manag...
3      my whole body feels itchy and like it is on fire
4     USERNAME no it is not behaving at all i am mad...
5                           USERNAME not the whole crew
6                              