In [1]:
# import required modules for preprocessing
import os
from os.path import join
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords as Stopwords
from scipy.sparse import csr_matrix

# import required modules for classification
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# import required modules for RNN classification
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

---
## Load training set, dev set and testing set

In [2]:
# Load training set, dev set and testing set

dataDir = '../semeval-tweets'  # change to the proper directory
datasetStrings = ['twitter-training-data.txt', 'twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt', 'twitter-dev-data.txt']
datasets = [join(dataDir, t) for t in datasetStrings]
print(f"Extracting data from: \n\t{datasets[0]}\n\t{datasets[1]}\n\t{datasets[2]}\n\t{datasets[3]}\n\t{datasets[4]}")

tweet_IDs = {}          # init dictionary with tweet IDs
tweet_sentiments = {}   # init dictionary with sentiments
tweet_texts = {}        # init dictionary with tweet texts

for DatasetString in datasets:
    data_ID, data_sent, data_text  = {}, {}, {}    # temp dictionaries
    with open(DatasetString, 'r', encoding='utf8') as f1:
        for i, line in enumerate(f1):
            fields = line.split('\t')
            data_ID[i] = fields[0]            # tweet IDs
            data_sent[fields[0]] = fields[1]  # sentiments
            data_text[fields[0]] = fields[2]  # tweet text
    tweet_IDs[DatasetString] = data_ID
    tweet_sentiments[DatasetString] = data_sent
    tweet_texts[DatasetString] = data_text

# sentiment dictionaries
sent_train = tweet_sentiments[datasets[0]]
sent_test1 = tweet_sentiments[datasets[1]]
sent_test2 = tweet_sentiments[datasets[2]]
sent_test3 = tweet_sentiments[datasets[3]]
sent_dev = tweet_sentiments[datasets[4]]

# tweet text dictionaries
text_train = tweet_texts[datasets[0]]
text_test1 = tweet_texts[datasets[1]]
text_test2 = tweet_texts[datasets[2]]
text_test3 = tweet_texts[datasets[3]]
text_dev = tweet_texts[datasets[4]]

# tweet IDs dictionaries
IDs_train = tweet_IDs[datasets[0]]
IDs_test1 = tweet_IDs[datasets[1]]
IDs_test2 = tweet_IDs[datasets[2]]
IDs_test3 = tweet_IDs[datasets[3]]
IDs_dev = tweet_IDs[datasets[4]]

# saving the sentiments and IDs as a pickle file
if not os.path.isfile("preprocessing-sent-ID.pkl"):
    temp = [sent_train, sent_test1, sent_test2, sent_test3, sent_dev, IDs_train, IDs_test1, IDs_test2, IDs_test3, IDs_dev]
    with open("preprocessing-sent-ID.pkl", 'wb') as out_file:
        pickle.dump(temp, out_file, protocol=-1)

Extracting data from: 
	../semeval-tweets/twitter-training-data.txt
	../semeval-tweets/twitter-test1.txt
	../semeval-tweets/twitter-test2.txt
	../semeval-tweets/twitter-test3.txt
	../semeval-tweets/twitter-dev-data.txt


In [3]:
# auxiliary functions

# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    reading the testset and return a dictionary with: ID -> sentiment
    :param testset: str, the file name of the testset to compare
    '''
    id_gts = {}  # init the dictionary
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('\t')
            tweetid = fields[0]
            gt = fields[1]
            id_gts[tweetid] = gt

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    print the confusion matrix of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    gts = []
    for m, c1 in id_gts.items():
        if c1 not in gts:
            gts.append(c1)
    gts = ['positive', 'negative', 'neutral']

    conf = {}
    for c1 in gts:
        conf[c1] = {}
        for c2 in gts:
            conf[c1][c2] = 0

    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        print(c1.ljust(12), end='')
        for c2 in gts:
            if sum(conf[c1].values()) > 0:
                print('%.3f     ' % (conf[c1][c2] / float(sum(conf[c1].values()))), end='')
            else:
                print('0.000     ', end='')
        print('')
    print('')


def evaluate(id_preds, testset, classifier):
    '''
    print the macro-F1 score of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    catf1s = {}
    ok = 0
    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'

        if gt == pred:
            ok += 1
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    catcount = 0
    itemcount = 0
    microtp = 0
    microfp = 0
    microtn = 0
    microfn = 0
    macro = {'p': 0, 'r': 0, 'f1': 0}
    micro = {'p': 0, 'r': 0, 'f1': 0}
    semevalmacro = {'p': 0, 'r': 0, 'f1': 0}
    for cat, acc in acc_by_class.items():
        catcount += 1
        microtp += acc['tp']
        microfp += acc['fp']
        microtn += acc['tn']
        microfn += acc['fn']

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        catf1s[cat] = f1
        n = acc['tp'] + acc['fn']
        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1
        if cat in ['positive', 'negative']:
            semevalmacro['p'] += p
            semevalmacro['r'] += r
            semevalmacro['f1'] += f1
        itemcount += n
    micro['p'] = float(microtp) / float(microtp + microfp)
    micro['r'] = float(microtp) / float(microtp + microfn)
    micro['f1'] = 2 * float(micro['p']) * micro['r'] / float(micro['p'] + micro['r'])

    semevalmacrof1 = semevalmacro['f1'] / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)
    return semevalmacrof1

# removing stop words variables
stopwords = Stopwords.words('english')
stopwords = [word.replace('\'', '') for word in stopwords]

# auxiliary ftion which takes list of words and returns its BoW representation as np.array
def text2BOW(text_list, vocabulary, vocab2num, stopwords):
    BOW_vec = np.zeros(len(vocabulary) + 1)
    for word in text_list:
        if not word in stopwords:
            if word in vocabulary:
                BOW_vec[vocab2num[word]-1] += 1
            else:
                BOW_vec[vocab2num['<OOV>']-1] += 1
    return BOW_vec

# auxiliary ftion which takes list of words and returns its TFIDF representation as np.array
def text2TFIDF(text_list, vocabulary, vocab2num, stopwords, DFfreq, Ntexts):
    TFIDF_vec = np.zeros(len(vocabulary) + 1)
    for word in np.unique(text_list):
        if not word in stopwords:
            if word in vocabulary:
                if DFfreq[word] == 0:    ###
                    print('oh no:', word)###
                tf = np.count_nonzero(np.array(text_list) == word) / len(text_list)
                idf = np.log2(Ntexts / DFfreq[word])
                TFIDF_vec[vocab2num[word]-1] = tf * idf
            else:
                tf = np.count_nonzero(np.array(text_list) == word) / len(text_list)
                idf = np.log2(Ntexts / 0.000001 )
                TFIDF_vec[vocab2num['<OOV>']-1] = tf * idf
    return TFIDF_vec

# convenience ftion for sentiment -> num
def sent2num(sent):
    if sent == 'negative':
        return -1
    if sent == 'neutral':
        return 0
    if sent == 'positive':
        return 1

# convenience ftion for num -> sentiment
def num2sent(num):
    if num == -1:
        return 'negative'
    if num == 0:
        return 'neutral'
    if num == 1:
        return 'positive'

# convert list of tokens (tweet) to an array of indexes
def text_list2array(text_list, vocabulary_list, word2index_dict, max_length):
    output_array = np.zeros(max_length, dtype=np.int16)
    for i, word in enumerate(text_list):
        if word in vocabulary_list:
            output_array[i] = word2ID[word]     # update the index in vocab
        else:
            output_array[i] = word2ID['<OOV>']  # provide the index of OOV
    return output_array

---
## Data Preprocessing
* lowercase text
* regex cleaning
   * Remove URLs
   * Process emoticons
   * Remove non-alphanumeric characters (leave hashtags and usernames)
   * Process usernames and hashtags
   * Remove numbers that are fully made of digits
   * (Remove words with only 1 character)
* Tokenisation
* POS tagging
* Lemmatization
* Saving the processed output

In [4]:
## Preprocessing 1: Plain - keeping all usernames, hashtags and emojis -> this preprocessing is for BOW and TFIDF-weighted BOW

# loading preprocessed datasets - if you already have the preprocessed file
file_to_load = "preprocessing-plain.pkl"
if os.path.isfile(file_to_load):
    with open(file_to_load, 'rb') as inp_file:
        temp_dicts = pickle.load(inp_file)
        txt_dicts = temp_dicts[0:5]
        txtlist_dicts = temp_dicts[5:]

else:
    top100 = ['com', 'net', 'org', 'jp', 'de', 'uk', 'fr', 'br', 'it', 'ru', 'es', 'me', 'gov', 'pl', 'ca', 'au', 'cn', 'co', 'in', 'nl', 'edu', 'info', 'eu', 'ch', 'id', 'at', 'kr', 'cz', 'mx', 'be', 'tv', 'se', 'tr', 'tw', 'al', 'ua', 'ir', 'vn', 'cl', 'sk', 'ly', 'cc', 'to', 'no', 'fi', 'us', 'pt', 'dk', 'ar', 'hu', 'tk', 'gr', 'il', 'news', 'ro', 'my', 'biz', 'ie', 'za', 'nz', 'sg', 'ee', 'th', 'io', 'xyz', 'pe', 'bg', 'hk', 'rs', 'lt', 'link', 'ph', 'club', 'si', 'site', 'mobi', 'by', 'cat', 'wiki', 'la', 'ga', 'xxx', 'cf', 'hr', 'ng', 'jobs', 'online', 'kz', 'ug', 'gq', 'ae', 'is', 'lv', 'pro', 'fm', 'tips', 'ms', 'sa', 'app', 'lat']

    # emoticons extracted from the tweets
    emoticons = ['%)', ':&', '8-)', '=/', ':c', ':#', ':)))', ';)', 'd:', '=3', ':O', '8D', 'oO', ':o)', '*)', 'QQ', ':S', '=)', 'D8', ':]', 'O:)', 'XD', 'Q_Q', ":'(", ':$', ':3', ':L', 'XP', ':-(', ':(', ':-)', ':-))', 'o.O', ':*', '0:3', ';;', ':D', ';D', '=]', ':@', ':)', ':))', ':/', '>:)', ':P', ':-)))', ';]', '^_^', ":')", ':x', 'D:', ':^)', ':|', ';_;', '=p', ':b', '=D', ':o', 'DX']
    emoticon_strings = ['emoticon' + str(num) for num in range(len(emoticons))]
    emoticon2string = dict(zip(emoticons, emoticon_strings))
    string2emoticon = dict(zip(emoticon_strings, emoticons))


    ID_dicts = [IDs_train, IDs_test1, IDs_test2, IDs_test3, IDs_dev]
    txt_dicts = [text_train, text_test1, text_test2, text_test3, text_dev]
    txtlist_dicts = []

    lemmatizer = WordNetLemmatizer()  # init the lemmatizer
    POSconvert = lambda e: ('a' if e[0].lower() == 'j' else e[0].lower()) if e[0].lower() in ['n', 'r', 'v'] else 'n'

    for i, IDdict in enumerate(ID_dicts):
        output = txt_dicts[i]
        output_txt = {}
        for id in IDdict.values():
            text = output[id].lower()

            # replace/delete all URLs starting with 'http' and 'www'
            new_text = re.sub("http[^\s]*", '', text)
            new_text = re.sub("www[^\s]*", '', new_text)

            # delete all URLs which have one of 100 most common extensions ('.com', '.net', ...)
            for ext in top100:
                re_string = "[^\s]*\." + ext + "[^\s]*"
                new_text = re.sub(re_string, '', new_text)

            # replace all emoticons with an emoticon string:  #emoticon42
            for em in emoticons:
                re_string = '\s' + re.escape(em) + '\s'
                replace_string = ' ' + emoticon2string[em] + ' '
                new_text = re.sub(re_string, replace_string, new_text)

            # removing '&amp'
            new_text = re.sub('&amp','', new_text)

            # remove all non-alphanumeric chars except for '# and @'
            new_text = re.sub('[^\w\s@#]','', new_text)

            # remove strings with '#' not on the beginning (to keep only hashtags)
            new_text = re.sub('\s[\w]+#[\w]*','', new_text)

            # numbers fully made of digits
            new_text = re.sub('\s[\d]+\s','', new_text)

            # remove words with only 1 character
            new_text = re.sub('\\b\\w{1}\\b','', new_text)

            # remove newline chars -> just aesthetics for printing, it doesn't matter with tokenizer
            new_text = new_text.replace('\n', ' ')

            # replace a multiple spaces with a single space -> just aesthetics for printing
            new_text = re.sub('\s+',' ', new_text)

            # do not delete @usernames
            # do not delete #hashtags


            # using the lemmatizer
            txt_list = nltk.word_tokenize(new_text)     # tokenise the tweet
            for k, word in enumerate(txt_list):         # fixing the separation of hashtags by the tokenizer
                if word == '#' or word == '@':
                    if k < len(txt_list) - 1:
                        txt_list[k] = txt_list[k] + txt_list[k+1]
                        txt_list.pop(k+1)
            POS = nltk.pos_tag(txt_list)                  # POS tags from nltk
            WordNetPOS = [POSconvert(P[1]) for P in POS]  # POS tags for lemmatizer
            for j in range(len(txt_list)):
                word = txt_list[j]
                lemmatized = lemmatizer.lemmatize(word, WordNetPOS[j])  # process each token/word one by one
                if lemmatized in emoticon_strings:                      # put the emoticons back in
                    lemmatized = string2emoticon[lemmatized]
                txt_list[j] = lemmatized                                # update the word in the txt_list

            # UPDATE the dictionary
            output_txt[id] = ' '.join(txt_list)
            output[id] = txt_list

        txt_dicts[i] = output_txt
        txtlist_dicts.append(output)

text_train = txt_dicts[0]
text_test1 = txt_dicts[1]
text_test2 = txt_dicts[2]
text_test3 = txt_dicts[3]
text_dev = txt_dicts[4]
txtlist_train = txtlist_dicts[0]
txtlist_test1 = txtlist_dicts[1]
txtlist_test2 = txtlist_dicts[2]
txtlist_test3 = txtlist_dicts[3]
txtlist_dev = txtlist_dicts[4]

# saving preprocessing.pkl
file_to_save = "preprocessing-plain.pkl"
if not os.path.isfile(file_to_save):
    txt_dicts = [text_train, text_test1, text_test2, text_test3, text_dev, txtlist_train, txtlist_test1, txtlist_test2, txtlist_test3, txtlist_dev]
    with open(file_to_save, 'wb') as out_file:
        pickle.dump(txt_dicts, out_file, protocol=-1)

In [5]:
# checking the preprocessed output
for id in list(IDs_train.values())[0:100]:
    print(text_train[id])

felt privilege to play foo fighter song on guitar today with one of the plectrum from the gig on saturday
@aaqibafzaal pakistan may be an islamic country but der be lot true muslim in india who love their country and can sacrifice all for it
happy birthday to the coolest golfer in bali @tjvictoriacnd :) may you become cooler and cooler everyday stay humble little sister xx
@simpplya tmills be go to tucson but the 29th and it on thursday :(
hmmmmm where be the #blacklivesmatter when matter like this rise kid be disgrace
@hypable all good im excite about 3rd season find home on netflix just want to make sure the reader have the news a it develop
told my mom want to stay in hotel for my 18th with people but my birthday on valentine :-)) lucky me
1st thing do after baggage claim be get up to date with @ronnaandbeverly bad blood ronna bev style make be home bit more okay
bobby jindal want you to assimilate to heritage of the old confederacy even though his parent be from january dot dot dot

In [5]:
## Preprocessing 2: GloVe - replacing usernames with 'username', hashtags with 'hashtag' and keeping only GloVe emoticons

# loading preprocessed datasets - if you already have the preprocessed file
file_to_load = "preprocessing-glove.pkl"
if os.path.isfile(file_to_load):
    with open(file_to_load, 'rb') as inp_file:
        temp_dicts = pickle.load(inp_file)
        txt_dicts = temp_dicts[0:5]
        txtlist_dicts = temp_dicts[5:]

else:
    top100 = ['com', 'net', 'org', 'jp', 'de', 'uk', 'fr', 'br', 'it', 'ru', 'es', 'me', 'gov', 'pl', 'ca', 'au', 'cn', 'co', 'in', 'nl', 'edu', 'info', 'eu', 'ch', 'id', 'at', 'kr', 'cz', 'mx', 'be', 'tv', 'se', 'tr', 'tw', 'al', 'ua', 'ir', 'vn', 'cl', 'sk', 'ly', 'cc', 'to', 'no', 'fi', 'us', 'pt', 'dk', 'ar', 'hu', 'tk', 'gr', 'il', 'news', 'ro', 'my', 'biz', 'ie', 'za', 'nz', 'sg', 'ee', 'th', 'io', 'xyz', 'pe', 'bg', 'hk', 'rs', 'lt', 'link', 'ph', 'club', 'si', 'site', 'mobi', 'by', 'cat', 'wiki', 'la', 'ga', 'xxx', 'cf', 'hr', 'ng', 'jobs', 'online', 'kz', 'ug', 'gq', 'ae', 'is', 'lv', 'pro', 'fm', 'tips', 'ms', 'sa', 'app', 'lat']

    # emoticons in the glove embeddings
    glove_emoticons = [';)', '=)', ':]', ':3', ':(', ':-)', '0:3', ':@', ':)', ':|', '=p']
    glove_emoticon_strings = ['emoticon' + str(num) for num in range(len(emoticons))]
    emoticon2string = dict(zip(glove_emoticons, glove_emoticon_strings))
    string2emoticon = dict(zip(glove_emoticon_strings, glove_emoticons))

    ID_dicts = [IDs_train, IDs_test1, IDs_test2, IDs_test3, IDs_dev]
    txt_dicts = [text_train, text_test1, text_test2, text_test3, text_dev]
    txtlist_dicts = []

    lemmatizer = WordNetLemmatizer()  # init the lemmatizer
    POSconvert = lambda e: ('a' if e[0].lower() == 'j' else e[0].lower()) if e[0].lower() in ['n', 'r', 'v'] else 'n'

    for i, IDdict in enumerate(ID_dicts):
        output = txt_dicts[i]
        output_txt = {}
        for id in IDdict.values():
            text = output[id].lower()

            # replace/delete all URLs starting with 'http' and 'www'
            new_text = re.sub("http[^\s]*", '', text)
            new_text = re.sub("www[^\s]*", '', new_text)

            # delete all URLs which have one of 100 most common extensions ('.com', '.net', ...)
            for ext in top100:
                re_string = "[^\s]*\." + ext + "[^\s]*"
                new_text = re.sub(re_string, '', new_text)

            #replace all emoticons with an emoticon string:  #emoticon42
            for em in glove_emoticons:
                re_string = '\s' + re.escape(em) + '\s'
                replace_string = ' ' + emoticon2string[em] + ' '
                new_text = re.sub(re_string, replace_string, new_text)

            # removing '&amp'
            new_text = re.sub('&amp','', new_text)

            # remove all non-alphanumeric chars except for '# and @'
            new_text = re.sub('[^\w\s@#]','', new_text)

            # replace all @usernames with 'username'
            new_text = re.sub('\s@[^\s]+',' username', new_text)  # middle
            new_text = re.sub('^@[^\s]+','username', new_text)    # start

            # remove strings with '#' not on the beginning (to keep only hashtags)
            new_text = re.sub('\s[\w]+#[\w]*','', new_text)

            # replace #hashtags with 'hashtag' and '#hashtag1 #hashtag2' with 'hashtags'
            new_text = re.sub('#[^\s]*\s',' hashtag ', new_text)
            new_text = re.sub('\s#[^\s]*$',' hashtag ', new_text)
            new_text = re.sub('(\s+hashtag){2,}', ' hashtags', new_text)

            # remove all non-alphanumeric chars
            new_text = re.sub('[^\w\s]','', new_text)

            # numbers fully made of digits
            new_text = re.sub('\s[\d]+\s','', new_text)

            # remove words with only 1 character
            new_text = re.sub('\\b\\w{1}\\b','', new_text)

            # remove newline chars
            new_text = new_text.replace('\n', ' ')

            # replace a multiple spaces with a single space
            new_text = re.sub('\s+',' ', new_text)

            # using the lemmatizer
            txt_list = nltk.word_tokenize(new_text)       # tokenise the tweet
            POS = nltk.pos_tag(txt_list)                  # POS tag the tweet
            WordNetPOS = [POSconvert(P[1]) for P in POS]  # convert POS tags to use in lemmatizer
            for j in range(len(txt_list)):
                word = txt_list[j]
                lemmatized = lemmatizer.lemmatize(word, WordNetPOS[j])  # process each token/word one by one
                if lemmatized in glove_emoticon_strings:                # replace the emoticon strings
                    lemmatized = string2emoticon[lemmatized]
                txt_list[j] = lemmatized                                # update the word in the txt_list

            # UPDATE the dictionary
            output_txt[id] = ' '.join(txt_list)
            output[id] = txt_list

        txt_dicts[i] = output_txt
        txtlist_dicts.append(output)

text_train = txt_dicts[0]
text_test1 = txt_dicts[1]
text_test2 = txt_dicts[2]
text_test3 = txt_dicts[3]
text_dev = txt_dicts[4]
txtlist_train = txtlist_dicts[0]
txtlist_test1 = txtlist_dicts[1]
txtlist_test2 = txtlist_dicts[2]
txtlist_test3 = txtlist_dicts[3]
txtlist_dev = txtlist_dicts[4]

# saving the preprocessed dictionaries as preprocessing-glove.pkl
file_to_save = "preprocessing-glove.pkl"
if not os.path.isfile(file_to_save):
    txt_dicts = [text_train, text_test1, text_test2, text_test3, text_dev, txtlist_train, txtlist_test1, txtlist_test2, txtlist_test3, txtlist_dev]
    with open(file_to_save, 'wb') as out_file:
        pickle.dump(txt_dicts, out_file, protocol=-1)

In [7]:
# checking the preprocessed output
for id in list(IDs_train.values())[0:100]:
    print(text_train[id])

felt privilege to play foo fighter song on guitar today with one of the plectrum from the gig on saturday
username pakistan may be an islamic country but der be lot true muslim in india who love their country and can sacrifice all for it
happy birthday to the coolest golfer in bali username :) may you become cooler and cooler everyday stay humble little sister xx
username tmills be go to tucson but the 29th and it on thursday
hmmmmm where be the hashtag when matter like this rise kid be disgrace
username all good im excite about 3rd season find home on netflix just want to make sure the reader have the news it develop
told my mom want to stay in hotel for my 18th with people but my birthday on valentine lucky me
1st thing do after baggage claim be get up to date with username bad blood ronna bev style make be home bit more okay
bobby jindal want you to assimilate to heritage of the old confederacy even though his parent be from january dot dot dot
username watch itthe 1st time votedit 

---
## Feature Extraction: Bag of words
Bag of words vectorisation: please note that the following code can take up to 5-10 minutes to run

In [6]:
# Bag of Words (BoW) feature extraction - my implementation:

# loading preprocessed BoW file if it exists
file_to_load =  "BOWsparse.pkl"
if os.path.isfile(file_to_load):
    with open(file_to_load, 'rb') as inp_file:
        temp = pickle.load(inp_file)  # BOWsparse-plain.pkl has only 6 parts, doesn't have vocabularyin it
        [sparse_train, sparse_test1, sparse_test2, sparse_test3, sparse_dev, sparse_train_dev, vocabulary] = temp
else:
    # loading preprocessing data
    file_to_load = "preprocessing-plain.pkl"
    if os.path.isfile(file_to_load):
        with open(file_to_load, 'rb') as inp_file:
            t = pickle.load(inp_file)
            text_train, text_test1, text_test2, text_test3, text_dev = t[0], t[1], t[2], t[3], t[4]
            txtlist_train, txtlist_test1, txtlist_test2, txtlist_test3, txtlist_dev = t[5], t[6], t[7], t[8], t[9]

    ## 1) removing stop words
    stopwords = Stopwords.words('english')
    stopwords = [word.replace('\'', '') for word in stopwords]

    ## 2) extracting the dictionary/vocabulary
    freq = FreqDist()   # frequency distribution
    txtlist_dicts = [txtlist_train, txtlist_dev]
    for Dict in txtlist_dicts:
        for tweet in Dict.values():
            for word in tweet:
                if not word in stopwords:
                    freq[word] += 1

    nums = range(1, len(freq.keys())+1)
    vocabulary = list(freq.keys())              # creating the dictionary
    vocabularyOOV = vocabulary + ['<OOV>']      # dictionary with 'out of vocabulary' word
    vocab2num = dict(zip(vocabulary, nums))     # word to index mapping
    vocab2num['<OOV>'] = max(vocab2num.values()) + 1  # out of vocabulary words -> len: 69742

    BOW_train = {}
    for ID, tweet in txtlist_train.items():
        BOW = text2BOW(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num)
        BOW_train[ID] = BOW

    BOW_test1 = {}
    for ID, tweet in txtlist_test1.items():
        BOW = text2BOW(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num)
        BOW_test1[ID] = BOW

    BOW_test2 = {}
    for ID, tweet in txtlist_test2.items():
        BOW = text2BOW(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num)
        BOW_test2[ID] = BOW

    BOW_test3 = {}
    for ID, tweet in txtlist_test3.items():
        BOW = text2BOW(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num)
        BOW_test3[ID] = BOW

    BOW_dev = {}
    for ID, tweet in txtlist_dev.items():
        BOW = text2BOW(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num)
        BOW_dev[ID] = BOW


    print("Starting sparse processing.")

    # sparse representation -> BOW vectors are sparse, so sparse representation saves memory and time
    vector_list = [BOW_train[id] for id in BOW_train.keys()]
    dense_train = np.vstack(vector_list)    # shape (45101, 67761)
    sparse_train = csr_matrix(dense_train)  # getting the sparse matrix
    del dense_train # remove from memory

    vector_list = [BOW_test1[id] for id in BOW_test1.keys()]
    dense_test1 = np.vstack(vector_list)    # shape (3531, 67761)
    sparse_test1 = csr_matrix(dense_test1)  # getting the sparse matrix
    del dense_test1 # remove from memory

    vector_list = [BOW_test2[id] for id in BOW_test2.keys()]
    dense_test2 = np.vstack(vector_list)    # shape (1853, 67761)
    sparse_test2 = csr_matrix(dense_test2)  # getting the sparse matrix
    del dense_test2 # remove from memory

    vector_list = [BOW_test3[id] for id in BOW_test3.keys()]
    dense_test3 = np.vstack(vector_list)    # shape (2379, 67761)
    sparse_test3 = csr_matrix(dense_test3)  # getting the sparse matrix
    del dense_test3 # remove from memory

    vector_list = [BOW_dev[id] for id in BOW_dev.keys()]
    dense_dev = np.vstack(vector_list)      # shape (2000, 67761)
    sparse_dev = csr_matrix(dense_dev)      # getting the sparse matrix
    del dense_dev # remove from memory

    # train + dev together (combined)
    vector_list1 = [BOW_train[id] for id in BOW_train.keys()]
    vector_list2 = [BOW_dev[id] for id in BOW_dev.keys()]
    temp1 = np.vstack(vector_list1)
    temp2 = np.vstack(vector_list2)
    dense_train_dev = np.vstack((temp1, temp2))     # shape (48632, 67761)
    sparse_train_dev = csr_matrix(dense_train_dev)  # getting the sparse matrix
    del dense_train_dev # remove from memory


# save the sparse representation
file_to_save = "BOWsparse.pkl"
if not os.path.isfile(file_to_save):
    sparse_dicts = [sparse_train, sparse_test1, sparse_test2, sparse_test3, sparse_dev, sparse_train_dev, vocabulary]
    with open(file_to_save, 'wb') as out_file:
        pickle.dump(sparse_dicts, out_file, protocol=-1)

---
## Feature Extraction: TF-IDF weighted Bag of words
Weighted BOW vectorisation - each word in a tweet is weighted according to its TFIDF
Please note that the following code can take up to 5-10 minutes to run

In [13]:
# TFIDF feature extraction

# loading preprocessed BoW file if it exists
file_to_load =  "TFIDFsparse.pkl"
if os.path.isfile(file_to_load):
    with open(file_to_load, 'rb') as inp_file:
        temp = pickle.load(inp_file)
        [sparse_train, sparse_test1, sparse_test2, sparse_test3, sparse_dev, sparse_train_dev, vocabulary] = temp
else:
    # loading preprocessing data
    file_to_load = "preprocessing-plain.pkl"
    if os.path.isfile(file_to_load):
        with open(file_to_load, 'rb') as inp_file:
            t = pickle.load(inp_file)
            text_train, text_test1, text_test2, text_test3, text_dev = t[0], t[1], t[2], t[3], t[4]
            txtlist_train, txtlist_test1, txtlist_test2, txtlist_test3, txtlist_dev = t[5], t[6], t[7], t[8], t[9]

    # extracting the dictionary
    freq = FreqDist()   # frequency distribution
    txtlist_dicts = [txtlist_train, txtlist_dev]
    for Dict in txtlist_dicts:
        for tweet in Dict.values():
            for word in tweet:
                if not word in stopwords:
                    freq[word] += 1

    nums = range(1,len(freq.keys())+1)
    vocabulary = list(freq.keys())              # creating the dictionary
    vocabulary_array = np.array(vocabulary)     # np.array of the dictionary
    vocabularyOOV = vocabulary + ['<OOV>']      # dictionary with 'out-of-vocabulary' word
    vocab2num = dict(zip(vocabulary, nums))     # word to index mapping
    vocab2num['<OOV>'] = max(vocab2num.values()) + 1  # out of vocabulary words

    # extracting the dictionary
    DFfreq = FreqDist()   # document frequency distribution
    Ntexts = len(IDs_train) + len(IDs_dev)
    for Dict in txtlist_dicts:
        for tweet in Dict.values():
            for word in np.unique(tweet):
                if not word in stopwords:
                    DFfreq[word] += 1


    # TFIDF-weighted Bag of Words for each tweet
    TFIDF_train = {}
    for ID, tweet in txtlist_train.items():
        tfidf = text2TFIDF(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num, DFfreq=DFfreq, Ntexts=Ntexts)
        TFIDF_train[ID] = tfidf

    TFIDF_test1 = {}
    for ID, tweet in txtlist_test1.items():
        tfidf = text2TFIDF(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num, DFfreq=DFfreq, Ntexts=Ntexts)
        TFIDF_test1[ID] = tfidf

    TFIDF_test2 = {}
    for ID, tweet in txtlist_test2.items():
        tfidf = text2TFIDF(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num, DFfreq=DFfreq, Ntexts=Ntexts)
        TFIDF_test2[ID] = tfidf

    TFIDF_test3 = {}
    for ID, tweet in txtlist_test3.items():
        tfidf = text2TFIDF(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num, DFfreq=DFfreq, Ntexts=Ntexts)
        TFIDF_test3[ID] = tfidf

    TFIDF_dev = {}
    for ID, tweet in txtlist_dev.items():
        tfidf = text2TFIDF(tweet, vocabulary=vocabulary, stopwords=stopwords, vocab2num=vocab2num, DFfreq=DFfreq, Ntexts=Ntexts)
        TFIDF_dev[ID] = tfidf

    print("Starting sparse processing.")

    # sparse TFIDF representation
    vector_list = [TFIDF_train[id] for id in TFIDF_train.keys()]
    TFIDFdense_train = np.vstack(vector_list)
    TFIDFsparse_train = csr_matrix(TFIDFdense_train)  # getting the sparse matrix
    del TFIDFdense_train # remove from memory

    vector_list = [TFIDF_test1[id] for id in TFIDF_test1.keys()]
    TFIDFdense_test1 = np.vstack(vector_list)
    TFIDFsparse_test1 = csr_matrix(TFIDFdense_test1)  # getting the sparse matrix
    del TFIDFdense_test1 # remove from memory

    vector_list = [TFIDF_test2[id] for id in TFIDF_test2.keys()]
    TFIDFdense_test2 = np.vstack(vector_list)
    TFIDFsparse_test2 = csr_matrix(TFIDFdense_test2)  # getting the sparse matrix
    del TFIDFdense_test2 # remove from memory

    vector_list = [TFIDF_test3[id] for id in TFIDF_test3.keys()]
    TFIDFdense_test3 = np.vstack(vector_list)
    TFIDFsparse_test3 = csr_matrix(TFIDFdense_test3)  # getting the sparse matrix
    del TFIDFdense_test3 # remove from memory

    vector_list = [TFIDF_dev[id] for id in TFIDF_dev.keys()]
    TFIDFdense_dev = np.vstack(vector_list)
    TFIDFsparse_dev = csr_matrix(TFIDFdense_dev)      # getting the sparse matrix
    del TFIDFdense_dev # remove from memory

    vector_list1 = [TFIDF_train[id] for id in TFIDF_train.keys()]
    vector_list2 = [TFIDF_dev[id] for id in TFIDF_dev.keys()]
    temp1 = np.vstack(vector_list1)
    temp2 = np.vstack(vector_list2)
    TFIDFdense_train_dev = np.vstack((temp1, temp2))  # shape (45101, 59559)
    TFIDFsparse_train_dev = csr_matrix(TFIDFdense_train_dev)  # getting the sparse matrix
    del TFIDFdense_train_dev # remove from memory

    # save the sparse representation of TFIDF features
    file_to_save = "TFIDFsparse.pkl"
    if not os.path.isfile(file_to_save):
        sparse_dicts = [TFIDFsparse_train, TFIDFsparse_test1, TFIDFsparse_test2, TFIDFsparse_test3, TFIDFsparse_dev, TFIDFsparse_train_dev, vocabulary]
        with open(file_to_save, 'wb') as out_file:
            pickle.dump(sparse_dicts, out_file, protocol=-1)

# loading preprocessed TFIDF sparse data
# with open("TFIDFsparse.pkl", 'rb') as inp_file:
#     temp = pickle.load(inp_file)
#     [sparse_train, sparse_test1, sparse_test2, sparse_test3, sparse_dev, sparse_train_dev] = temp

---
## Feature Extraction: GloVe

In [36]:
# APPROACH 1: Loading the word embeddings vectors from GloVE: selecting 5000 words based on frequency

print('Extracting the word vectors.')

## loading preprocessed embedding file if it exists (embedding matrix, word to index map, embedding dictionary)
file_to_load =  "embeddings.pkl"
if os.path.isfile(file_to_load):
    with open(file_to_load, 'rb') as inp_file:
        temp = pickle.load(inp_file)
        [embedding_matrix, word2ID, embedding_dict] = temp
else:
    ## loading preprocessed data for glove
    file_to_load = "preprocessing-glove.pkl"
    if os.path.isfile(file_to_load):
        with open(file_to_load, 'rb') as inp_file:
            t = pickle.load(inp_file)
            text_train, text_test1, text_test2, text_test3, text_dev = t[0], t[1], t[2], t[3], t[4]
            txtlist_train, txtlist_test1, txtlist_test2, txtlist_test3, txtlist_dev = t[5], t[6], t[7], t[8], t[9]

    full_embedding_dict = {}
    glove_path = join('..','glove', 'glove.6B.100d.txt')
    with open(glove_path, 'r', encoding='utf-8') as File:
        for line in File:
            vec = line.split()
            word = vec[0]
            coefs = np.asarray(vec[1:], dtype='float32')
            full_embedding_dict[word] = coefs

    print(f"Extracted {len(full_embedding_dict)} word embedding vectors.")

    sorted_vocabulary = sorted([it for it in freq.items()], key=lambda data: data[1], reverse=True)
    full_vocabulary = [ tup[0] for tup in sorted_vocabulary ]

    embedding_dict = {}  # word embeddings of 6000 words from vocabulary
    temp = 0
    for word in full_vocabulary:
        if word in full_embedding_dict.keys():
            embedding_dict[word] = full_embedding_dict[word]
            temp += 1
        if temp == 4998:
            break
    vocabulary5000 = list(embedding_dict.keys())  # obtain the dictionary of 6000 most common words

    print(f"Created dictionary of {len(embedding_dict)} most common words.")

    ## extract the <OOV> vector by setting it to be the weighted avg of unused words
    Total = np.zeros(100)
    Sum = 0
    for word in full_vocabulary:
        if word not in vocabulary5000:                          # if word is not among 6000 words
            if word in full_embedding_dict.keys():              # and it is in glove
                Total += freq[word] * full_embedding_dict[word] # take the weighted avg
                Sum += freq[word]
    OOV_vector = Total / Sum
    embedding_dict['<OOV>'] = OOV_vector

    print(f"The embedding dictionary has {len(embedding_dict)} words, the last one is: {list(embedding_dict.keys())[-1]}")

    ## Build an embedding matrix
    word_list = list(embedding_dict.keys())
    nums = range(1,len(word_list)+1)
    word2ID = dict(zip(word_list, nums))     # the index of the embedding vector
    num2vocab = dict(zip(nums, word_list))   # the index to word
    vector_list = [embedding_dict[word] for word in word_list]

    embedding_matrix = np.vstack(vector_list)
    embedding_matrix = np.vstack((np.zeros(100), embedding_matrix))

    print(f"Created matrix with shape {embedding_matrix.shape}")  # the first row is a dummy row
    del full_embedding_dict # delete from memory
    ## save the embeddings
    with open("embeddings.pkl", 'wb') as out_file:
        temp = [embedding_matrix, word2ID, embedding_dict]
        pickle.dump(temp, out_file, protocol=-1)

Extracting the word vectors.


In [37]:
# ALTERNATIVE APPROACH to preprocessing of embeddings: Weighted Approach - considering both semantic and frequency value
# -> in the first approach, only frequency was the criterion to select 5000 words
# -> but that way, we may select words which have minimal semantic value
# -> the weighted approach will select both the semantic as well as frequency contribution


# loading preprocessed embedding file if it exists (embedding matrix, word to index map, embedding dictionary)
file_to_load =  "embeddings-weighted.pkl"
if os.path.isfile(file_to_load):
    with open("embeddings-weighted.pkl", 'rb') as inp_file:
        temp = pickle.load(inp_file)
        [embedding_matrix, word2ID, embedding_dict] = temp
else:
    ## 0) loading preprocessing data for glove
    file_to_load = "preprocessing-glove.pkl"
    if os.path.isfile(file_to_load):
        with open(file_to_load, 'rb') as inp_file:
            t = pickle.load(inp_file)
            text_train, text_test1, text_test2, text_test3, text_dev = t[0], t[1], t[2], t[3], t[4]
            txtlist_train, txtlist_test1, txtlist_test2, txtlist_test3, txtlist_dev = t[5], t[6], t[7], t[8], t[9]

    ## 1) finding the frequency weights
    txtlist_dicts = [txtlist_train, txtlist_dev]
    freq = FreqDist()   # frequency distribution
    for Dict in txtlist_dicts:
        for tweet in Dict.values():
            for word in tweet:
                if not word in stopwords:
                    freq[word] += 1

    nums = range(1, len(freq.keys())+1)
    vocabulary = list(freq.keys())              # creating the dictionary
    vocab2num = dict(zip(vocabulary, nums))     # word to index mapping
    num2vocab = dict(zip(nums, vocabulary))     # word to index mapping

    sorted_vocabulary = sorted([it for it in freq.items()], key=lambda data: data[1], reverse=True)
    maxF = sorted_vocabulary[0][1]  # maximal frequency of a word
    normalized_vocabulary = { word: f / maxF for (word, f) in sorted_vocabulary}  # dictionary

    ## 2) use SVM coeffs (make sure to have 'SVMcoefficients.pkl' which stores clf.coef_ from SVM)
    with open("SVMcoefficients.pkl", 'rb') as inp_file:
        SVM_coef = pickle.load(inp_file)
    coefs = []
    coefs_nums = []
    for i, word in enumerate(vocabulary):
        c = np.abs(SVM_coef[2,i] - SVM_coef[0,i])  # coefficent: absolute difference between positive and negative:
        word_coef = (word, c)                            #  ->> "most negative/most positive" words have higher index
        coefs_nums.append(c)
        coefs.append(word_coef)
    minC, maxC = np.min(coefs_nums), np.max(coefs_nums)
    coefs = [(word, (c - minC) / maxC) for (word, c) in coefs]     # normalize coefs
    coefs = sorted(coefs, key=lambda data: data[1], reverse=True)  # sort coefs

    ## 3) apply the weight formula: 2 * freq_value + 1 * sentiment_value ~ 2 f + 1 s
    weighted_scores = [(word, 1 * cn + 2 * normalized_vocabulary[word]) for (word, cn) in coefs]  # list of tuples
    sorted_weighted_vocabulary = sorted([it for it in weighted_scores], key=lambda data: data[1], reverse=True)

    ## 4) load GloVe embeddings
    full_embedding_dict = {}
    glove_path = join('..','glove', 'glove.6B.100d.txt')
    with open(glove_path, 'r', encoding='utf-8') as File:
        for line in File:
            vec = line.split()
            word = vec[0]
            coefs = np.asarray(vec[1:], dtype='float32')
            full_embedding_dict[word] = coefs
    print(f"Extracted {len(full_embedding_dict)} word embedding vectors.")

    ## 5) extract the word embeddings of 5000 words based on the order from the weighted approach
    embedding_dict = {}
    temp = 0
    for (word,_) in sorted_weighted_vocabulary:
        if word in full_embedding_dict.keys():
            embedding_dict[word] = full_embedding_dict[word]
            temp += 1
        if temp == 4998:
            break
    vocabulary5000 = list(embedding_dict.keys())  # obtain the dictionary of 5000 most common words

    ## 6) extract the <OOV> vector by setting it to be the weighted avg of unused words
    Total = np.zeros(100)
    Sum = 0
    for word in full_vocabulary:
        if word not in vocabulary5000:                          # if word is not among 5000 words
            if word in full_embedding_dict.keys():              # and it is in glove
                Total += freq[word] * full_embedding_dict[word] # take the weighted avg
                Sum += freq[word]
    OOV_vector = Total / Sum
    embedding_dict['<OOV>'] = OOV_vector

    ## 7) Build the weighted-embedding matrix
    word_list = list(embedding_dict.keys())
    nums = range(1,len(word_list)+1)
    word2ID = dict(zip(word_list, nums))     # the index of the embedding vector
    num2vocab = dict(zip(nums, word_list))   # the index to word
    vector_list = [embedding_dict[word] for word in word_list]
    embedding_matrix = np.vstack(vector_list)
    embedding_matrix = np.vstack((np.zeros(100), embedding_matrix))

    print(f"Created matrix with shape {embedding_matrix.shape}")  # the first row is a dummy row
    del full_embedding_dict # delete from memory

    # save the embeddings
    with open("embeddings-weighted.pkl", 'wb') as out_file:
        temp = [embedding_matrix, word2ID, embedding_dict]
        pickle.dump(temp, out_file, protocol=-1)


Extracted 400000 word embedding vectors.
Created matrix with shape (5000, 100)


In [43]:
# preparing matrices for torch: matrix_train, matrix_dev, matrix_test1, matrix_test2, matrix_test3

# loading preprocessed embeddings
embedding_file = "embeddings-weighted.pkl"  # alternatively use "embeddings.pkl"
with open(embedding_file, 'rb') as inp_file:
    temp = pickle.load(inp_file)
    [embedding_matrix, word2ID, embedding_dict] = temp

# converting the text lists into vectors of ints
word_list = list(embedding_dict.keys())

with open("preprocessing-glove.pkl", 'rb') as inp_file:   # loading preprocessed data for glove
    t = pickle.load(inp_file)
    text_train, text_test1, text_test2, text_test3, text_dev = t[0], t[1], t[2], t[3], t[4]
    txtlist_train, txtlist_test1, txtlist_test2, txtlist_test3, txtlist_dev = t[5], t[6], t[7], t[8], t[9]

with open("preprocessing-sent-ID.pkl", 'rb') as inp_file: # loading preprocessed data
    temp = pickle.load(inp_file)
    [sent_train, sent_test1, sent_test2, sent_test3, sent_dev, IDs_train, IDs_test1, IDs_test2, IDs_test3,
     IDs_dev] = temp

max_len = np.max([len(tweet) for tweet in txtlist_train.values()])      # longest tokenized sentence
matrix_train = np.zeros((len(txtlist_train), max_len), dtype=np.int16)  # training datapoints
y_train = np.zeros(len(txtlist_train), dtype=np.int8)                   # training labels
for i, (id, text_list) in enumerate(txtlist_train.items()):
    x = text_list2array(text_list, vocabulary_list=word_list, word2index_dict=word2ID, max_length=max_len)
    y = sent_train[id]
    matrix_train[i] = x
    y_train[i] = sent2num(y)

matrix_test1 = np.zeros((len(txtlist_test1), max_len), dtype=np.int16)  # training datapoints
y_test1 = np.zeros(len(txtlist_test1), dtype=np.int8)                   # training labels
for i, (id, text_list) in enumerate(txtlist_test1.items()):
    x = text_list2array(text_list, vocabulary_list=word_list, word2index_dict=word2ID, max_length=max_len)
    y = sent_test1[id]
    matrix_test1[i] = x
    y_test1[i] = sent2num(y)

matrix_test2 = np.zeros((len(txtlist_test2), max_len), dtype=np.int16)  # training datapoints
y_test2 = np.zeros(len(txtlist_test2), dtype=np.int8)                   # training labels
for i, (id, text_list) in enumerate(txtlist_test2.items()):
    x = text_list2array(text_list, vocabulary_list=word_list, word2index_dict=word2ID, max_length=max_len)
    y = sent_test2[id]
    matrix_test2[i] = x
    y_test2[i] = sent2num(y)

matrix_test3 = np.zeros((len(txtlist_test3), max_len), dtype=np.int16)  # training datapoints
y_test3 = np.zeros(len(txtlist_test3), dtype=np.int8)                   # training labels
for i, (id, text_list) in enumerate(txtlist_test3.items()):
    x = text_list2array(text_list, vocabulary_list=word_list, word2index_dict=word2ID, max_length=max_len)
    y = sent_test3[id]
    matrix_test3[i] = x
    y_test3[i] = sent2num(y)

matrix_dev = np.zeros((len(txtlist_dev), max_len), dtype=np.int16)  # training datapoints
y_dev = np.zeros(len(txtlist_dev), dtype=np.int8)  # training labels
for i, (id, text_list) in enumerate(txtlist_dev.items()):
    x = text_list2array(text_list, vocabulary_list=word_list, word2index_dict=word2ID, max_length=max_len)
    y = sent_dev[id]
    matrix_dev[i] = x
    y_dev[i] = sent2num(y)

# save the data as pickle files
with open("xy_train.pkl", 'wb') as out_file:
    temp = [matrix_train, y_train]
    pickle.dump(temp, out_file, protocol=-1)
with open("xy_dev.pkl", 'wb') as out_file:
    temp = [matrix_dev, y_dev]
    pickle.dump(temp, out_file, protocol=-1)
with open("xy_test1.pkl", 'wb') as out_file:
    temp = [matrix_test1, y_test1]
    pickle.dump(temp, out_file, protocol=-1)
with open("xy_test2.pkl", 'wb') as out_file:
    temp = [matrix_test2, y_test2]
    pickle.dump(temp, out_file, protocol=-1)
with open("xy_test3.pkl", 'wb') as out_file:
    temp = [matrix_test3, y_test3]
    pickle.dump(temp, out_file, protocol=-1)

# load the saved data from pickle files
with open("xy_train.pkl", 'rb') as inp_file:
    temp = pickle.load(inp_file)
    [matrix_train, y_train] = temp
with open("xy_dev.pkl", 'rb') as inp_file:
    temp = pickle.load(inp_file)
    [matrix_dev, y_dev] = temp
with open("xy_test1.pkl", 'rb') as inp_file:
    temp = pickle.load(inp_file)
    [matrix_test1, y_test1] = temp
with open("xy_test2.pkl", 'rb') as inp_file:
    temp = pickle.load(inp_file)
    [matrix_test2, y_test2] = temp
with open("xy_test3.pkl", 'rb') as inp_file:
    temp = pickle.load(inp_file)
    [matrix_test3, y_test3] = temp

print("Saved matrices for torch.")


Saved matrices for torch.


---
## Sentiment Classifiers

In [19]:
# classical classifiers data loading:


with open("preprocessing-sent-ID.pkl", 'rb') as inp_file: # loading preprocessed data
    temp = pickle.load(inp_file)
    [sent_train, sent_test1, sent_test2, sent_test3, sent_dev, IDs_train, IDs_test1, IDs_test2, IDs_test3,
     IDs_dev] = temp


for classifier in ['NearestNeighbour', 'NaiveBayes']:#,'SVM', 'MaxEnt']:
    for features in ['BOW', 'TFIDF']:

        if features == 'BOW':
            with open("BOWsparse.pkl", 'rb') as inp_file:
                temp = pickle.load(inp_file)  # BOWsparse-plain.pkl has only 6 parts, doesn't have vocabularyin it
                [sparse_train, sparse_test1, sparse_test2, sparse_test3, sparse_dev, sparse_train_dev, vocabulary] = temp

            Xtrain = sparse_train_dev                      # combining the two datasets
            ID_train = list(IDs_train.values())            # list of IDs in train set
            ID_dev = list(IDs_dev.values())                # list of IDs in dev set
            ID_train_dev = ID_train + ID_dev               # combined train, dev
            sn_train = [sent_train[id] for id in ID_train] # training labels train
            sn_dev = [sent_dev[id] for id in ID_dev]       # training labels dev
            Ytrain = np.array(sn_train + sn_dev)           # combining both labels
            Ytrain_numeric = np.array([sent2num(y) for y in Ytrain])                      # numerical labels for train
            y_test1_numeric = np.array([sent2num(sent) for sent in sent_test1.values()])  # numerical labels for test1
            y_test2_numeric = np.array([sent2num(sent) for sent in sent_test2.values()])  # numerical labels for test2
            y_test3_numeric = np.array([sent2num(sent) for sent in sent_test3.values()])  # numerical labels for test3
            testset_sparse_samples = [sparse_test1, sparse_test2, sparse_test3]           # list of testing examples
            testset_IDs = [list(IDs_test1.values()), list(IDs_test2.values()), list(IDs_test3.values())]  # IDs in test1, test2, test3

        if features == 'TFIDF':
            with open("TFIDFsparse.pkl", 'rb') as inp_file:
                temp = pickle.load(inp_file)
                [sparse_train, sparse_test1, sparse_test2, sparse_test3, sparse_dev, sparse_train_dev, vocabulary] = temp

            Xtrain = sparse_train_dev                      # combining the two datasets
            ID_train = list(IDs_train.values())            # list of IDs in train set
            ID_dev = list(IDs_dev.values())                # list of IDs in dev set
            ID_train_dev = ID_train + ID_dev               # combined train, dev
            sn_train = [sent_train[id] for id in ID_train] # training labels train
            sn_dev = [sent_dev[id] for id in ID_dev]       # training labels dev
            Ytrain = np.array(sn_train + sn_dev)           # combining both labels
            Ytrain_numeric = np.array([sent2num(y) for y in Ytrain])                      # numerical labels for train
            y_test1_numeric = np.array([sent2num(sent) for sent in sent_test1.values()])  # numerical labels for test1
            y_test2_numeric = np.array([sent2num(sent) for sent in sent_test2.values()])  # numerical labels for test2
            y_test3_numeric = np.array([sent2num(sent) for sent in sent_test3.values()])  # numerical labels for test3
            testset_sparse_samples = [sparse_test1, sparse_test2, sparse_test3]           # list of testing examples
            testset_IDs = [list(IDs_test1.values()), list(IDs_test2.values()), list(IDs_test3.values())]  # IDs in test1, test2, test3

        # Skeleton: Creation and training of the classifiers
        if classifier == 'NearestNeighbour':
            print('--> ' + classifier.upper() + ': ', end='')
            t0 = time.time()                # timing the run
            clf = KNeighborsClassifier(n_neighbors=9, metric='cosine', weights='uniform')  # the best params selected by GridSearch
            clf.fit(Xtrain, Ytrain_numeric)
            t1 = time.time()                # timing the run
            print(f"training time: {t1-t0:.3f}")


        elif classifier == 'NaiveBayes':
            print('--> ' + classifier.upper() + ': ', end='')
            t0 = time.time()                # timing the run
            clf = MultinomialNB(alpha = 0.4)# the best params selected by GridSearch
            clf.fit(Xtrain, Ytrain_numeric)
            t1 = time.time()                # timing the run
            print(f"training time: {t1-t0:.3f}")
        elif classifier == 'SVM':
            # write the classifier 3 here
            print('--> ' + classifier.upper(), end='')
        elif classifier == 'LSTM':
            # write the LSTM classifier here
            print('--> ' + classifier.upper(), end='')
        else:
            print('Unknown classifier name' + classifier)
            continue

        # Prediction performance of the classifiers
        testsets = datasets[1:4]
        for i in range(len(testsets)):
            testset = testsets[i]
            X, IDs = testset_sparse_samples[i], testset_IDs[i]
            y_pred_numeric = clf.predict(X)
            y_pred = [num2sent(num) for num in y_pred_numeric]
            pred_dict = dict(zip(IDs, y_pred))
            evaluate(pred_dict, testset, classifier=features + '-' + classifier)


# # evaluation test2
# ID_test2 = list(BOW_test2.keys())
#
# y_pred_numeric = clf.predict(sparse_test2)
# y_pred = [num2sent(num) for num in y_pred_numeric]
# pred_dict = dict(zip(ID_test2, y_pred))
# s2 = evaluate(pred_dict, '../semeval-tweets/twitter-test2.txt', classifier="KNN")  # best score 0.435
#
# # evaluation test3
# ID_test3 = list(BOW_test3.keys())
# y_test3_numeric = np.array([sent2num(sent) for sent in sent_test3.values()])
# y_pred_numeric = clf.predict(sparse_test3)
# y_pred = [num2sent(num) for num in y_pred_numeric]
# pred_dict = dict(zip(ID_test3, y_pred))
# s3 = evaluate(pred_dict, '../semeval-tweets/twitter-test3.txt', classifier="KNN")  # best score 0.435
# t1 = time.time()  # timing the run
# print('overall run time:', t1-t0)
# print("average F1 score:", (s1 + s2 + s3)/3)

--> NEARESTNEIGHBOUR: training time: 0.004
../semeval-tweets/twitter-test1.txt (BOW-NearestNeighbour): 0.386
../semeval-tweets/twitter-test2.txt (BOW-NearestNeighbour): 0.400
../semeval-tweets/twitter-test3.txt (BOW-NearestNeighbour): 0.376
--> NEARESTNEIGHBOUR: training time: 0.007
../semeval-tweets/twitter-test1.txt (TFIDF-NearestNeighbour): 0.404
../semeval-tweets/twitter-test2.txt (TFIDF-NearestNeighbour): 0.439
../semeval-tweets/twitter-test3.txt (TFIDF-NearestNeighbour): 0.415
