# First Year Project
## Project 4 - Natural Language Processing
### Professor - Christian Hardmeier

This notebook contains all of the code developed for project 4. We will be using a data set of tweets to perform machine learning for binary and multiclass classification.

For **binary** classification, we classify tweets as *'ironic'* or *'not ironic'*.<br>
For **multiclass** classification, we predict which emoji is used in a tweet based on its text.

Group 3:<br>
Crisanna Cornish (ccor@itu.dk)<br>
Danielle Dequin (ddeq@itu.dk)<br>
Gino Franco Fazzi (gifa@itu.dk)<br>
Moneeca Abru Iftikhar Latif (abml@itu.dk)<br>
Carl August Wismer (cwis@itu.dk)

Created: 27-04-2021<br>
Last Modified: 04-05-2021

# Data Source

We use the TweetEval repository, a collection of 7 datasets for different classification tasks based on social media posts. The repository can be found here: https://github.com/cardiffnlp/tweeteval.git

Each dataset is presented in the same format and with fixed training, validation and test splits.

# Imports

In [8]:
import re
import sys
import csv
import pandas as pd
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import os
import matplotlib.pyplot as plt
from math import log

# Constants

### Paths

In [9]:
# Directory prefixes. File names are appended with plain string concatenation
# (e.g. IRONY_RAW_PATH + TRAIN), so every prefix must keep its trailing slash.
# "raw" holds the input files that are read; "interim" receives the tokenised CSVs.

# irony text:
IRONY_RAW_PATH = '../datasets/irony/raw/'
IRONY_INTERIM_PATH = '../datasets/irony/interim/'

# emoji:
EMOJI_RAW_PATH = '../datasets/emoji/raw/'
EMOJI_INTERIM_PATH = '../datasets/emoji/interim/'

### FILES:

In [10]:
# Tweet text files for each split (appended to a *_RAW_PATH prefix)
TEST = 'test_text.txt'
TRAIN = 'train_text.txt'
VAL = 'val_text.txt'

# Label files for each split (appended to a *_RAW_PATH prefix)
TEST_LABELS = 'test_labels.txt'
TRAIN_LABELS = 'train_labels.txt'
VAL_LABELS = 'val_labels.txt'

# Tokenised CSV output for each split (appended to a *_INTERIM_PATH prefix)
TEST_INTERIM = 'test_seperated.csv'
TRAIN_INTERIM = 'train_seperated.csv'
VAL_INTERIM = 'val_seperated.csv'

# FUNCTIONS:

In [30]:
def tokenise(line):
    """Split one line of tweet text into a list of string tokens.

    The line is scanned left to right. At each position:

    * a comma, or the literal ``|http://t.co/`` marker, is skipped;
    * otherwise a token is read if one starts here: a run of word characters,
      a run of ``#``, a single ``'`` or ``@``, two or more dots, a run of
      ``!`` or a run of ``?``;
    * anything else (whitespace, a single dot, other punctuation) is discarded
      up to the next skippable item or token.

    A single ``#``, ``@`` or ``'`` is glued to the token that starts
    immediately after it, so hashtags (``#irony``), mentions (``@user``) and
    contraction tails (``'t``) stay in one piece. If no token follows
    directly, the lone prefix character is dropped.

    A token that begins with ``'`` and starts exactly where the previous token
    ended is appended to that previous token (``don`` + ``'t`` -> ``don't``).
    Apostrophe tokens separated from the previous token by whitespace, a comma
    or other discarded text stay separate (``say 'hi`` -> ``say``, ``'hi``).

    Parameters
    ----------
    line : str
        The text to tokenise.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty for an empty line.
    """
    token_pat = re.compile(r'\w+|#+|\'|@|\.\.+|!+|\?+')
    # NOTE(review): the second alternative only matches a literal '|' followed
    # by 'http://t.co/'; if the intent was to skip spaces or bare URLs this
    # pattern does not do that -- confirm before changing it.
    skippable_pat = re.compile(r',|\|http://t.co/+')
    glue_prefixes = ('#', '@', "'")

    tokens = []
    pos = 0
    end_of_line = len(line)
    # Index just past the most recently emitted token; used to decide whether
    # an apostrophe token is directly attached to the token before it.
    last_token_end = None

    while pos < end_of_line:
        skip = skippable_pat.match(line, pos)
        if skip:
            pos = skip.end()
            continue

        found = token_pat.match(line, pos)
        if found is None:
            # Unmatchable material: jump to wherever the next skippable item
            # or token begins, or to the end of the line if there is neither.
            upcoming = (
                m.start()
                for m in (skippable_pat.search(line, pos), token_pat.search(line, pos))
                if m is not None
            )
            pos = min(upcoming, default=end_of_line)
            continue

        text = found.group()
        token_end = found.end()

        if text in glue_prefixes:
            # Only glue when the next token is directly adjacent; gluing across
            # a gap would put spaces or punctuation inside the token.
            follower = token_pat.match(line, token_end)
            if follower is None:
                pos = token_end  # lone '#', '@' or "'": drop it
                continue
            token_end = follower.end()
            text = line[pos:token_end]

        if text[0] == "'" and tokens and last_token_end == pos:
            # Contraction tail attached to the previous word.
            tokens[-1] += text
        else:
            tokens.append(text)

        last_token_end = token_end
        pos = token_end

    return tokens

def token_data(data, interim, tokenizer=None):
    """Tokenise every line of a text file and save the tokens as a CSV file.

    Parameters
    ----------
    data : str
        Path of the UTF-8 text file to read. Each line is tokenised on its own.
    interim : str
        Path of the CSV file to write: one row per input line, one field per
        token.
    tokenizer : None or "compare", optional
        ``None`` (default) uses this notebook's ``tokenise`` function;
        ``"compare"`` uses NLTK's ``TweetTokenizer``.

    Returns
    -------
    list of list of str
        The token list of each input line, in file order.

    Raises
    ------
    ValueError
        If ``tokenizer`` is neither ``None`` nor ``"compare"``. The check runs
        before any file is opened, so a bad value never truncates ``interim``.
    """
    if tokenizer is None:
        split_line = tokenise
    elif tokenizer == "compare":
        split_line = TweetTokenizer().tokenize
    else:
        raise ValueError(
            f'unknown tokenizer {tokenizer!r}; expected None or "compare"'
        )

    # Context managers close the files even if tokenising or writing fails.
    with open(data, "r", encoding="utf-8") as source:
        token_list = [split_line(line) for line in source]

    # Write the tokenized data to an interim csv file
    with open(interim, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(token_list)

    return token_list

def zipf_law(tokens, ds_name=''):
    """Scatter-plot log(frequency) against log(rank) for a collection of tokens.

    The counts come from ``token_counter(tokens, n=len(tokens))``, whose result
    is iterated as ``(token, count)`` pairs and plotted in the order returned.
    NOTE(review): ``token_counter`` is not defined in this part of the
    notebook; it presumably returns the pairs most-frequent first -- confirm.

    Parameters
    ----------
    tokens : sized collection
        Passed to ``token_counter``; only ``len(tokens)`` is used directly here.
    ds_name : str, optional
        Data set name shown in the plot title.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    ordered = token_counter(tokens, n=len(tokens))
    log_freqs = [log(count) for _, count in ordered]

    # Ranks are derived from the number of pairs actually returned, so the x
    # and y lists always have the same length even when there are fewer
    # distinct tokens than len(tokens).
    log_ranks = [log(rank) for rank in range(1, len(log_freqs) + 1)]

    fig, ax = plt.subplots()
    ax.scatter(log_ranks, log_freqs)
    ax.set_xlabel('Log(Rank)')
    ax.set_ylabel('Log(Freq)')
    ax.set_title(f'Zipf Law for {ds_name} dataset')

# Load data: Task 1

## Load Irony Data Set

In [31]:
# Open Irony data set, tokenize, and write to a csv file.
# final_irony keeps the returned token lists, one list per line of the raw file.
final_irony = token_data(IRONY_RAW_PATH + TRAIN, IRONY_INTERIM_PATH+TRAIN_INTERIM)

In [52]:
# Show the token list at index 1 as a quick sanity check of the tokeniser
final_irony[1]

['look',
 'for',
 'the',
 'girl',
 'with',
 'the',
 'broken',
 'smile',
 'ask',
 'her',
 'if',
 'she',
 'wants',
 'to',
 'stay',
 'while',
 'and',
 'she',
 'will',
 'be',
 'loved']

In [33]:
# Read each line of the interim CSV as a single 'tweet' string
temp1 = pd.read_csv(IRONY_INTERIM_PATH+TRAIN_INTERIM, delimiter="\n", names=['tweet'])
# names=['label'] is required: without it read_csv treats the first label as a
# header row, which drops one label and shifts the rest by one row relative to
# the tweets.
temp2 = pd.read_csv(IRONY_RAW_PATH+TRAIN_LABELS, names=['label'])

# Merge Tokenized data with labels
irony_df = temp1.merge(temp2, left_index=True, right_index=True)

## Load Emoji Data Set

In [34]:
# Open Emoji raw data set, tokenize, and write to an interim csv file
emoji = token_data(EMOJI_RAW_PATH + TRAIN, EMOJI_INTERIM_PATH+TRAIN_INTERIM)

In [35]:
#final_emoji[16038]

In [36]:
# Read each line of the interim CSV as a single 'tweet' string, and the raw
# labels file as a single 'label' column
temp1 = pd.read_csv(EMOJI_INTERIM_PATH+TRAIN_INTERIM, delimiter="\n", names=['tweet'])
temp2 = pd.read_csv(EMOJI_RAW_PATH+TRAIN_LABELS, names=['label'])

# Merge emoji data with labels (rows are paired by their positional index)
emoji_df = temp1.merge(temp2, left_index=True, right_index=True)

## Compare tokeniser’s output 
With the baseline tokenisation from the social media tokeniser in the NLTK library (`nltk.tokenize.TweetTokenizer`)

In [49]:
# Open emoji data, tokenize with NLTK's TweetTokenizer (tokenizer="compare"),
# write to a separate interim csv for comparison
compare_emoji = token_data(EMOJI_RAW_PATH + TRAIN, EMOJI_INTERIM_PATH+'train_seperated_compared.csv',"compare")

In [41]:
#compare_emoji[16038]

In [50]:
# Open irony data, tokenize with NLTK's TweetTokenizer (tokenizer="compare"),
# write to a separate interim csv for comparison
compare_irony = token_data(IRONY_RAW_PATH + TRAIN, IRONY_INTERIM_PATH+'train_seperated_compared.csv',"compare")

In [56]:
#compare_irony[1]

# Characterize Data: Task 2

## Irony Data

### Number of Tweets

In [None]:
# The number of rows in the merged tweet/label frame is the number of tweets
print(f"There are {irony_df.shape[0]} tweets in the Irony data set")

### Most common Tokens

In [None]:
# Tally every token across all tokenised irony tweets
count_irony = Counter(
    token
    for tweet_tokens in final_irony
    for token in tweet_tokens
)

# 10 most common tokens in Irony data set
count_irony.most_common(10)

In [None]:
# Vocabulary split: tokens seen exactly once vs. tokens seen more than once
counter_irony_1 = sum(1 for freq in count_irony.values() if freq == 1)
counter_irony_ = sum(1 for freq in count_irony.values() if freq > 1)

### Least Common Tokens

In [None]:
# Report singleton tokens (count and share of the vocabulary), repeated tokens,
# and the total number of token occurrences
print(f'number of words that occur only once: {counter_irony_1}, ({counter_irony_1/(counter_irony_1 + counter_irony_):.0%})\nnumber of words that occur more than once: {counter_irony_}, \ntotal word count, including repeated words: {sum(count_irony.values())}')

In [None]:
# The reversed slice walks the frequency-sorted list from its end, rarest first
count_irony.most_common()[:-10-1:-1] # 10 least common words

### Check Zipf's Law

## Emoji Data

### Number of Tweets

In [None]:
# The number of rows in the merged tweet/label frame is the number of tweets
print(f"There are {emoji_df.shape[0]} Tweets in the Emoji data set.") #should be 45000

### Most Common Tokens

In [None]:
count_emoji = Counter()

# `emoji` holds the token lists returned by token_data for the emoji training
# set; the name `final_emoji` used here before is never defined in the notebook.
for line in emoji:
    count_emoji.update(line)

# 10 most common tokens in emoji dataset
count_emoji.most_common()[:10]

In [None]:
# Vocabulary split: tokens seen exactly once vs. tokens seen more than once
counter_emoji_1 = sum(1 for freq in count_emoji.values() if freq == 1)
counter_emoji_ = sum(1 for freq in count_emoji.values() if freq > 1)

### Least Common Tokens

In [None]:
# Report singleton tokens (count and share of the vocabulary), repeated tokens,
# and the total number of token occurrences
print(f'number of words that occur only once: {counter_emoji_1}, ({counter_emoji_1/(counter_emoji_1 + counter_emoji_):.0%})\nnumber of words that occur more than once: {counter_emoji_} \ntotal word count, including repeated words: {sum(count_emoji.values())}')

In [None]:
# Look up how often the literal tokens '# #' and '##' were counted
# (a Counter returns 0 for a token it has never seen)
count_emoji['# #'], count_emoji['##']

In [None]:
# The reversed slice walks the frequency-sorted list from its end: the 10 rarest tokens
count_emoji.most_common()[:-10-1:-1] # Least common words are often personal names or hashtags.

### Check Zipf's Law

In [None]:
# Print how many tweets carry each label value 0..19
for label_id in range(20):
    label_rows = emoji_df[emoji_df['label'] == label_id]
    print(label_id, ': ', label_rows.shape[0])

In [None]:
# Preview the first rows of the merged emoji tweet/label frame
emoji_df.head()

In [None]:
# Re-read the tokenised irony tweets and their labels. The labels column is
# explicitly named 'label' so the cells below can filter on irony_df['label'].
temp1 = pd.read_csv(IRONY_INTERIM_PATH+TRAIN_INTERIM, delimiter="\n", names=['tweet'])
temp2 = pd.read_csv(IRONY_RAW_PATH+TRAIN_LABELS, names=['label'])
# Pair tweets and labels by their positional index
irony_df = temp1.merge(temp2, left_index=True, right_index=True)

In [None]:
# Count the rows with label 1 (printed as 'Ironic') and label 0 (printed as 'Other')
print('Ironic:', (irony_df[irony_df['label'] == 1]).shape[0], '\n Other:', (irony_df[irony_df['label'] == 0]).shape[0])

In [None]:
# Preview the first rows of the merged irony tweet/label frame
irony_df.head()