# First Year Project
## Project 4 - Natural Language Processing
### Professor - Christian Hardmeier

This notebook contains all of the code developed for project 4. We will be using a data set of tweets to perform machine learning for binary and multiclass classification.

For **binary** classification, we evaluate tweets based on *'ironic'* or *'not ironic'*. [Learn More](https://www.aclweb.org/anthology/S18-1005.pdf)
<br>
For **multiclass** classification, we predict which emoji is used in a tweet based on its text. [Learn More](https://www.aclweb.org/anthology/S18-1003.pdf)

Group 3:<br>
Crisanna Cornish (ccor@itu.dk)<br>
Danielle Dequin (ddeq@itu.dk)<br>
Gino Franco Fazzi (gifa@itu.dk)<br>
Moneeca Abru Iftikhar Latif (abml@itu.dk)<br>
Carl August Wismer (cwis@itu.dk)

Created: 27-04-2021<br>
Last Modified: 04-05-2021

# Data Source

We use the TweetEval repository, a collection of 7 datasets for different classification tasks based on social media posts. The repository can be found here: https://github.com/cardiffnlp/tweeteval.git

Each dataset is presented in the same format and with fixed training, validation and test splits.

# Imports

In [None]:
import re
import sys
import csv
import numpy as np
import pandas as pd
from collections import Counter
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
from nltk.lm import NgramCounter
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated
from nltk.lm import WittenBellInterpolated


import os
import matplotlib.pyplot as plt
from math import log

# Constants

### Paths

In [None]:
# Common root folder of the datasets used in this notebook.
_DATASETS_ROOT = '../datasets/'

# Irony task: folder with the raw files and folder for the tokenised (interim) files.
IRONY_RAW_PATH = _DATASETS_ROOT + 'irony/raw/'
IRONY_INTERIM_PATH = _DATASETS_ROOT + 'irony/interim/'

# Emoji task: folder with the raw files and folder for the tokenised (interim) files.
EMOJI_RAW_PATH = _DATASETS_ROOT + 'emoji/raw/'
EMOJI_INTERIM_PATH = _DATASETS_ROOT + 'emoji/interim/'

### FILES:

In [None]:
# Tweet text files, one per split.
TEST, TRAIN, VAL = 'test_text.txt', 'train_text.txt', 'val_text.txt'

# Label files, one per split.
TEST_LABELS, TRAIN_LABELS, VAL_LABELS = 'test_labels.txt', 'train_labels.txt', 'val_labels.txt'

# Tokenised CSV files written to the interim folders, one per split.
TEST_INTERIM, TRAIN_INTERIM, VAL_INTERIM = 'test_seperated.csv', 'train_seperated.csv', 'val_seperated.csv'

# FUNCTIONS:

In [None]:
# Patterns are compiled once, when the cell is run, instead of on every call.
_TOKEN_PAT = re.compile(r'\w+|#+|\'|@|\.\.+|!+|\?+')
# Delimiters that are dropped: a comma, or a literal '|http://t.co/' run.
# Whitespace is NOT matched here; it is discarded as unmatchable material below.
_SKIPPABLE_PAT = re.compile(r',|\|http://t.co/+')
# Single-character markers that are glued to the token that follows them.
_PREFIX_MARKERS = ('#', '@', "'")


def tokenise(line):
    """Split one line of tweet text into a list of string tokens.

    Tokens are runs of word characters, runs of '#', '!' or '?', runs of two
    or more dots, and the single characters '@' and "'". A lone '#', '@' or
    "'" is kept together with the next token (hashtags, mentions,
    contraction suffixes) unless the character right before that next token
    is a space, in which case the marker is dropped. Commas and
    '|http://t.co/' are skipped; anything else that matches no pattern
    (e.g. whitespace) is discarded.

    Finally, a token starting with "'" is appended to the token before it,
    so "don" + "'t" becomes "don't". Merging is pairwise and left to right:
    a merged token is not merged again with a following "'..." token.

    Parameters
    ----------
    line : str
        Raw text of a single tweet.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    tokens = []

    # As long as there's any material left...
    while line:
        # A skippable delimiter at the very start is simply removed.
        skippable_match = _SKIPPABLE_PAT.search(line)
        if skippable_match and skippable_match.start() == 0:
            line = line[skippable_match.end():]
            continue

        token_match = _TOKEN_PAT.search(line)
        if token_match and token_match.start() == 0:
            matched = token_match.group()
            if matched in _PREFIX_MARKERS:
                # Look for the token following the marker.
                follower = _TOKEN_PAT.search(line[1:])
                # The follower offsets are relative to line[1:], so slicing
                # `line` itself with them covers the character just before
                # the follower; a space there means the marker stands alone.
                if follower is None or ' ' in line[follower.start():follower.end()]:
                    line = line[token_match.end():]
                else:
                    cut = follower.end() + 1
                    tokens.append(line[:cut])
                    line = line[cut:]
            else:
                tokens.append(matched)
                line = line[token_match.end():]
            continue

        # Unmatchable material: it ends where the next skippable or token
        # match starts, or at the end of the line. It is discarded.
        unmatchable_end = len(line)
        if skippable_match:
            unmatchable_end = skippable_match.start()
        if token_match:
            unmatchable_end = min(unmatchable_end, token_match.start())
        line = line[unmatchable_end:]

    # Attach "'..." tokens to the token that precedes them.
    final_tokens = []
    idx = 0
    while idx < len(tokens):
        current = tokens[idx]
        if idx + 1 < len(tokens) and tokens[idx + 1].startswith("'"):
            final_tokens.append(current + tokens[idx + 1])
            idx += 2
        else:
            final_tokens.append(current)
            idx += 1

    return final_tokens

def token_data(data, interim, tokenizer=None):
    """Tokenise a raw text file line by line and save the result as CSV.

    Parameters
    ----------
    data : str
        Path of the UTF-8 text file to read; every line is tokenised separately.
    interim : str
        Path of the CSV file to write; one row per input line, one cell per token.
    tokenizer : None or "compare"
        None uses this notebook's `tokenise` function, "compare" uses
        NLTK's TweetTokenizer.

    Returns
    -------
    list of list of str
        One token list per input line.

    Raises
    ------
    ValueError
        If `tokenizer` is neither None nor "compare". The check happens
        before any file is opened, so the output file is left untouched.
    """
    if tokenizer is None:
        tokenize_line = tokenise
    elif tokenizer == "compare":
        tokenize_line = TweetTokenizer().tokenize
    else:
        raise ValueError(
            f"Unknown tokenizer {tokenizer!r}; expected None or 'compare'"
        )

    # `with` closes the file even if tokenising a line raises.
    with open(data, "r", encoding="utf-8") as f:
        token_list = [tokenize_line(line) for line in f]

    # Write the tokenized data to an interim csv file
    with open(interim, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(token_list)

    return token_list

def token_counter(tokens, n= 10, inverse=False):
    """Count how often each token occurs across all lines.

    `tokens` is an iterable of token lists. Returns a list of
    (token, count) tuples: the `n` most frequent tokens by default, or,
    when `inverse` is true, the `n` least frequent ones starting with the
    rarest.
    """
    frequencies = Counter(token for line_tokens in tokens for token in line_tokens)
    if inverse:
        # Walk the full ranking backwards and keep its last n entries.
        return frequencies.most_common()[:-n-1:-1]
    return frequencies.most_common(n)

def zipf_law(tokens, ds_name=''):
    """Scatter-plot log(frequency) against log(rank) for every distinct token.

    Parameters
    ----------
    tokens : list of list of str
        Tokenised lines, as returned by `token_data`.
    ds_name : str
        Data set name shown in the plot title.

    The ranks are derived from the number of distinct tokens, not from the
    number of lines in `tokens`, so both axes always have the same length
    and the whole vocabulary is plotted.
    """
    # n=None asks for every distinct token, ordered from most to least frequent.
    ordered = token_counter(tokens, n=None)

    log_ranks = [log(rank) for rank in range(1, len(ordered) + 1)]
    log_freqs = [log(count) for _, count in ordered]

    fig, ax = plt.subplots()
    ax.scatter(log_ranks, log_freqs)
    ax.set_xlabel('Log(Rank)')
    ax.set_ylabel('Log(Freq)')
    ax.set_title(f'Zipf Law for {ds_name} dataset')

# Load data: Task 1

## Load Irony Data Set

In [None]:
# Open Irony data set, tokenize, and write to a csv file
irony = token_data(IRONY_RAW_PATH + TRAIN, IRONY_INTERIM_PATH+TRAIN_INTERIM)

In [None]:
irony[0]

## Load Emoji Data Set

In [None]:
# Open Emoji raw data set, tokenize, and write to an interim csv file
emoji = token_data(EMOJI_RAW_PATH + TRAIN, EMOJI_INTERIM_PATH+TRAIN_INTERIM)

In [None]:
emoji[0]

## Compare tokeniser’s output 
We compare against the baseline tokenisation from the social media tokeniser in the NLTK library (`nltk.tokenize.TweetTokenizer`).

In [None]:
# Open emoji data, tokenize with NLTK tokenizer, write to interim csv for comparison
compare_emoji = token_data(EMOJI_RAW_PATH + TRAIN, EMOJI_INTERIM_PATH+'train_seperated_compared.csv',"compare")

In [None]:
print(compare_emoji[3])
print(emoji[3])

In [None]:
# Open irony data, tokenize with NLTK tokenizer, write to interim csv for comparison
compare_irony = token_data(IRONY_RAW_PATH + TRAIN, IRONY_INTERIM_PATH+'train_seperated_compared.csv',"compare")

In [None]:
print(compare_irony[3])
print(irony[3])

# Characterize Data: Task 2

In [None]:
# Re-read the tokenised tweets with csv.reader (the inverse of the csv.writer used
# in token_data) so that every row, including empty ones, keeps its position and
# stays aligned with its label. pd.read_csv(..., delimiter="\n") is rejected by
# recent pandas releases and skips blank rows.
with open(IRONY_INTERIM_PATH+TRAIN_INTERIM, newline="", encoding="utf-8") as f:
    irony_tweets = pd.DataFrame({'tweet': [','.join(row) for row in csv.reader(f)]})
irony_labels = pd.read_csv(IRONY_RAW_PATH+TRAIN_LABELS, names=['label'])
assert len(irony_tweets) == len(irony_labels), "irony tweets and labels are not row-aligned"
irony_df = irony_tweets.merge(irony_labels, left_index=True, right_index=True)

## Irony Data

### Number of Tweets

In [None]:
# Size of the irony training set and how many tweets carry label 1 and label 0.
irony_label_column = irony_df['label']
print(f"There are {irony_df.shape[0]} tweets in the Irony data set")
print('Ironic:', irony_label_column.eq(1).sum(), '\nNon-ironic:', irony_label_column.eq(0).sum())

### Most common Tokens

In [None]:
token_counter(irony)

### Least Common Tokens

In [None]:
token_counter(irony, inverse=True)

In [None]:
# Flatten the irony tweets into one token list, then split the vocabulary into
# words seen exactly once and words seen more than once.
irony_tokens = [token for tweet in irony for token in tweet]
irony_type_counts = token_counter(irony, n=len(irony_tokens))

irony_one_timers = [word for word, freq in irony_type_counts if freq == 1]
irony_mult_timers = [word for word, freq in irony_type_counts if freq != 1]

In [None]:
# Share of word types seen once versus more than once, plus the total token count.
n_once = len(irony_one_timers)
n_multiple = len(irony_mult_timers)
n_types = n_once + n_multiple
print(
    f'Number of words that occur only once:{n_once} ({n_once/n_types:.0%})\n'
    f'Number of words that occur more than once:{n_multiple} ({n_multiple/n_types:.0%})\n'
    f'Total word count, including repeated words: {len(irony_tokens)}'
)

### Check Zipf's Law

In [None]:
# This cell sits in the Irony section, so plot the irony tokens.
zipf_law(irony, 'Irony');

$$\text{Type-token ratio} = \frac{\text{number of distinct tokens (types)}}{\text{total number of tokens}}$$

In [None]:
# Distinct tokens (types) divided by the total number of tokens.
len(set(irony_tokens)) / len(irony_tokens)

## Emoji Data

### Number of Tweets

In [None]:
# Re-read the tokenised tweets with csv.reader (the inverse of the csv.writer used
# in token_data) so that every row, including empty ones, keeps its position and
# stays aligned with its label. pd.read_csv(..., delimiter="\n") is rejected by
# recent pandas releases and skips blank rows.
with open(EMOJI_INTERIM_PATH+TRAIN_INTERIM, newline="", encoding="utf-8") as f:
    emoji_tweets = pd.DataFrame({'tweet': [','.join(row) for row in csv.reader(f)]})
emoji_labels = pd.read_csv(EMOJI_RAW_PATH+TRAIN_LABELS, names=['label'])
assert len(emoji_tweets) == len(emoji_labels), "emoji tweets and labels are not row-aligned"
emoji_df = emoji_tweets.merge(emoji_labels, left_index=True, right_index=True)

In [None]:
print(f"There are {emoji_df.shape[0]} Tweets in the Emoji data set.") #should be 45000
# Number of tweets for each of the label ids 0..19 (0 is printed for an absent label).
emoji_label_counts = emoji_df['label'].value_counts()
for label in range(20):
    print(f'{label}: ', emoji_label_counts.get(label, 0))

### Most Common Tokens

In [None]:
token_counter(emoji)

### Least Common Tokens

In [None]:
token_counter(emoji, inverse=True)

In [None]:
# Flatten the emoji tweets into one token list, then split the vocabulary into
# words seen exactly once and words seen more than once.
emoji_tokens = [token for tweet in emoji for token in tweet]
emoji_type_counts = token_counter(emoji, n=len(emoji_tokens))

emoji_one_timers = [word for word, freq in emoji_type_counts if freq == 1]
emoji_mult_timers = [word for word, freq in emoji_type_counts if freq != 1]

In [None]:
# Share of word types seen once versus more than once, plus the total token count.
n_once = len(emoji_one_timers)
n_multiple = len(emoji_mult_timers)
n_types = n_once + n_multiple
print(
    f'Number of words that occur only once:{n_once} ({n_once/n_types:.0%})\n'
    f'Number of words that occur more than once:{n_multiple} ({n_multiple/n_types:.0%})\n'
    f'Total word count, including repeated words: {len(emoji_tokens)}'
)

In [None]:
# This cell sits in the Emoji section, so plot the emoji tokens.
zipf_law(emoji, 'Emoji');

$$\text{Type-token ratio} = \frac{\text{number of distinct tokens (types)}}{\text{total number of tokens}}$$

In [None]:
# Distinct tokens (types) divided by the total number of tokens.
len(set(emoji_tokens)) / len(emoji_tokens)

# Irony

## Maximum Likelihood

### Training the model

In [None]:
# Build padded 1- and 2-gram training data plus the vocabulary stream from the irony tweets.
train, vocab = padded_everygram_pipeline(2, irony)
lm = MLE(2) # Maximum likelihood estimator of order 2
len(lm.vocab) # Size of the vocabulary before fitting (nothing has been counted yet)

In [None]:
lm.fit(train, vocab) # which is filled with model data
len(lm.vocab)

In [None]:
print(lm.vocab)

In [None]:
# Map the first training tweet onto the fitted vocabulary. The result has to be
# printed: only the last expression of a cell is displayed automatically.
print(lm.vocab.lookup(irony[0]))
print(lm.counts)

In [None]:
lm.score("<UNK>") == lm.score("aliens") # The token 'aliens' is not in our list

In [None]:
lm.score("a") # returns the relative frequency of 'a'

In [None]:
lm.logscore("a") # This method avoids underflow

In [None]:
lm.score('A')

In [None]:
lm.score("@user") #most common word

In [None]:
lm.score("a", ["be"]) # Probability of 'a' given that the preceding token is 'be'

### Testing the Model

In [None]:
#load the validation set
irony_val = token_data(IRONY_RAW_PATH + VAL, IRONY_INTERIM_PATH+VAL_INTERIM)

In [None]:
# entropy() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
# NOTE(review): an unsmoothed MLE model is expected to give an infinite value
# as soon as one validation bigram was never seen in training.
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm.entropy(irony_val_bigrams)

In [None]:
# perplexity() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm.perplexity(irony_val_bigrams)

## Laplace

### Training the model

In [None]:
train, vocab = padded_everygram_pipeline(2, irony)
# Order 2 matches the bigram training pipeline and the MLE(2) model it is compared with.
lm2 = Laplace(2)
print(len(lm2.vocab)) # Vocabulary size before fitting
lm2.fit(train, vocab) # which is filled with model data
len(lm2.vocab)

In [None]:
print(lm2.vocab)

In [None]:
# Both scores must come from the same model: an unseen word is scored as <UNK>.
lm2.score("<UNK>") == lm2.score("aliens")

In [None]:
lm2.score("@user") #most common word

In [None]:
lm2.score("a", ["be"]) # Probability of 'a' given that the preceding token is 'be'

### Testing the Model

In [None]:
# entropy() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm2.entropy(irony_val_bigrams)

In [None]:
# perplexity() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm2.perplexity(irony_val_bigrams)

## KneserNeyInterpolated

### Training the model

In [None]:
train, vocab = padded_everygram_pipeline(2, irony)
# Order 2 matches the bigram training pipeline and the MLE(2) model it is compared with.
lm3 = KneserNeyInterpolated(2)
print(len(lm3.vocab)) # Vocabulary size before fitting
lm3.fit(train, vocab) # which is filled with model data
len(lm3.vocab)

In [None]:
# Both scores must come from the same model: an unseen word is scored as <UNK>.
lm3.score("<UNK>") == lm3.score("aliens")

In [None]:
lm3.score("@user") #most common word

In [None]:
lm3.score("a", ["be"]) # Probability of 'a' given that the preceding token is 'be'

### Testing the Model

In [None]:
# entropy() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm3.entropy(irony_val_bigrams)

In [None]:
# perplexity() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm3.perplexity(irony_val_bigrams)

## WittenBellInterpolated

### Training the model

In [None]:
train, vocab = padded_everygram_pipeline(2, irony)
# Order 2 matches the bigram training pipeline and the MLE(2) model it is compared with.
lm4 = WittenBellInterpolated(2)
print(len(lm4.vocab)) # Vocabulary size before fitting
lm4.fit(train, vocab) # which is filled with model data
print(len(lm4.vocab))

In [None]:
# Both scores must come from the same model: an unseen word is scored as <UNK>.
lm4.score("<UNK>") == lm4.score("aliens")

In [None]:
lm4.score("@user") #most common word

In [None]:
lm4.score("a", ["be"]) # Probability of 'a' given that the preceding token is 'be'

### Testing the model

In [None]:
# entropy() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm4.entropy(irony_val_bigrams)

In [None]:
# perplexity() expects n-gram tuples, not whole tweets, so expand every validation
# tweet into padded bigrams (same <s>/</s> padding as the training pipeline).
irony_val_bigrams = [
    bigram
    for tweet in irony_val
    for bigram in ngrams(tweet, 2, pad_left=True, pad_right=True,
                         left_pad_symbol="<s>", right_pad_symbol="</s>")
]
lm4.perplexity(irony_val_bigrams)

## Bit of fun

In [None]:
# Keep only the tweets with label 1 and split each comma-joined row back into tokens.
ironic = irony_df.loc[irony_df['label'] == 1, ['tweet']].reset_index(drop=True)
ironic_list = [tweet.split(',') for tweet in ironic['tweet']]

In [None]:
train, vocab = padded_everygram_pipeline(2, ironic_list)
# Order 2 matches the bigram training data, so a later generate() call can
# condition on the previous token.
lm_ironic = Laplace(2)
lm_ironic.fit(train, vocab)
len(lm_ironic.vocab)

In [None]:
#lm_ironic.generate(12, random_seed=4)

## Interrater

In [None]:
# Folder with the gold labels and one annotation file per rater.
IAA_DIR = '../datasets/iaa-sets/irony/'
annotation_files = {
    'Dee': 'DeeAnnotation.txt',
    'Sanna': 'SannaAnnotation.txt',
    'Gino': 'GinoAnnotation.csv',
    'August': 'augunotation.txt',
}

# Start from the gold labels and add one column per rater, joined on the row index.
interrater_df = pd.read_csv(IAA_DIR + 'iaa_labels.txt', names = ['True_label'])
for rater_name, file_name in annotation_files.items():
    rater_labels = pd.read_csv(IAA_DIR + file_name, names = [rater_name])
    interrater_df = interrater_df.merge(rater_labels, left_index=True, right_index = True)

rater_list = list(annotation_files)

interrater_df.shape

In [None]:
# Column sums (number of 1 labels) for the gold labels and for every rater in
# rater_list, so that no rater is left out.
for column in ['True_label'] + rater_list:
    print(interrater_df[column].sum())

In [None]:
# Per rater: fraction of tweets where BOTH the gold label and the rater's label are 1.
# NOTE(review): rows where both are 0 are not counted, so this is not an overall
# agreement rate; confirm that this is the intended measure.
n_rated = len(interrater_df['True_label'])
ratios = {
    rater: sum(np.bitwise_and(interrater_df['True_label'], interrater_df[rater])) / n_rated
    for rater in rater_list
}


In [None]:
ratios

In [None]:
# A tweet counts as agreed when every rater in rater_list gave the same label,
# i.e. the row sum of the 0/1 annotations is 0 or equals the number of raters.
rater_votes = interrater_df[rater_list].sum(axis=1)
interrater_df['agree'] = (rater_votes == 0) | (rater_votes == len(rater_list))

agreed = interrater_df['agree'].sum()
print(agreed)

In [None]:
## total matches/total = interrater agreement
a_0 = agreed/len(interrater_df)
print(f'{a_0:.1%}')

In [None]:
## adjust for chance, assumption that there is a uniform distribution where p = 0.5 to choose 0 or 1.
p = 0.5
a_c = (p)**len(rater_list)

a_adj = (a_0 - a_c)/(1-a_c)

print(f'{a_adj:.1%}')

Let's look at the tweets on which the annotators disagreed, to understand the limitations of our annotations.

In [None]:
# Rows on which the raters did not all give the same label.
interrater_df[~interrater_df['agree']]