In [None]:
import re
import difflib
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random #As of now only used for generating 100 random tweets for manual labelling

## Reading data
### The Offensive Training Data

In [None]:
# Read the offensive training tweets: one list element per line of the file
# (line endings are kept). The context manager closes the file even if
# reading or decoding fails part-way through.
with open("../data/raw/offensive/train_text.txt", 'r', encoding = "utf-8") as f:
    inputlist = [line for line in f]

# First half of the lines is used as training data, second half as validation data.
midpoint = len(inputlist)//2
training_data, validation_data = inputlist[:midpoint], inputlist[midpoint:]


## Tokenizer as Function:

In [None]:
## patterns
def tokenizer(tweets):
    """
    Tokenize every string in ``tweets`` with a regular expression.

    Each string is scanned left to right and split into:

    * runs of word characters, ``@``, ``’`` and ``#`` (one token per run), and
    * every other non-whitespace character as its own one-character token.

    Whitespace only separates tokens and is never emitted.

    Parameters
    ----------
    tweets : iterable of str
        The strings to tokenize.

    Returns
    -------
    list of list of str
        One token list per input string, in input order. An empty or
        whitespace-only string yields an empty list.
    """
    # Whitespace, the "word-like" class and the "everything else" class together
    # cover every possible character, so nothing can ever be unmatchable.
    # A single alternation scanned once by findall therefore produces the tokens
    # in linear time, without re-searching and re-slicing the rest of the line
    # after every token.
    token_pat = re.compile(r'[\w@’#]+|[^@’#\w\s]')
    return [token_pat.findall(tweet) for tweet in tweets]



In [None]:
# Tokenise the training half of the offensive tweets with the hand-written tokenizer.
token_tweets = tokenizer(tweets=training_data)
# Debug helper, prints every token list on its own line:
#[print(*i) for i in token_tweets]

In [None]:
# Compare NLTK's TweetTokenizer with the hand-written tokenizer, tweet by tweet.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
for tweet, own_tokens in zip(training_data, token_tweets):
    # Tokenise once and reuse the result instead of calling tokenize() twice.
    nltk_tokens = tknzr.tokenize(tweet)
    # Uncomment for a context diff instead of the side-by-side print:
    #print("".join(difflib.context_diff(nltk_tokens, own_tokens)), end = "")
    print(tweet,"tknzr:",nltk_tokens,"\ntokenlist:",own_tokens,"\n")

## Exercise 2:
### Corpus size of the offensive and sentiment training sets, respectively:

In [None]:
%%bash
# Report the size of each raw training file with wc (default output: lines, words, bytes).
wc ../data/raw/offensive/train_text.txt
wc ../data/raw/sentiment/train_text.txt

<b>Offensive:</b> 11916 lines/tweets, 262370 words <br>
<b>Sentiment:</b> 35615 lines/tweets, 877516 words

### Running tokenizer func on offensive and sentiment training data to get token count right

In [None]:
# Load the complete raw training corpora, one list element per line.
# `offensive_raw` and `sentiment_raw` are used by later cells, so this cell has
# to run for the notebook to work on a fresh kernel.
with open("../data/raw/offensive/train_text.txt", "r", encoding="utf-8") as f:
    offensive_raw = [line for line in f]

with open("../data/raw/sentiment/train_text.txt", "r", encoding="utf-8") as f:
    sentiment_raw = [line for line in f]


In [None]:
# Token lists (one per tweet) for the full offensive and sentiment corpora.
offensive_tokens = tokenizer(tweets=offensive_raw)
sentiment_tokens = tokenizer(tweets=sentiment_raw)


### Saving tokens to file, running commandline operation on these new files

In [None]:
def make_token_files():
    """
    Write the module-level token lists to the interim "tokenized" text files.

    Each tweet becomes one line holding the ``str()`` of its token list with
    the surrounding square brackets stripped, e.g. ``'a', 'b', 'c'``.
    Reads the globals ``offensive_tokens`` and ``sentiment_tokens``; returns None.
    """
    with open("../data/interim/tokenized/offensive_tokens.txt","w", encoding = "utf-8") as out:
        out.writelines(str(tweet_tokens)[1:-1]+"\n" for tweet_tokens in offensive_tokens)

    with open("../data/interim/tokenized/sentiment_tokens.txt","w", encoding = "utf-8") as out:
        out.writelines(str(tweet_tokens)[1:-1]+"\n" for tweet_tokens in sentiment_tokens)

### Tokens and their counts:

In [None]:
%%bash
# Put every token on its own line, count identical lines, and list them by descending count.
# The final sort uses -n so the leading count is compared as a number; a plain
# text sort only orders the counts correctly if uniq's padding width and the
# locale happen to cooperate.
tr ' ' '\n' < ../data/interim/tokenized/offensive_tokens.txt | sort | uniq -c | sort -rn > ../data/interim/uniq/offensive_uniq.txt
tr ' ' '\n' < ../data/interim/tokenized/sentiment_tokens.txt | sort | uniq -c | sort -rn > ../data/interim/uniq/sentiment_uniq.txt


#### The ten most frequent tokens in "offensive_training":

In [None]:
%%bash
# head prints the first ten lines of the offensive count file.
head ../data/interim/uniq/offensive_uniq.txt

#### The ten most frequent tokens in "sentiment_training":

In [None]:
%%bash
# head prints the first ten lines of the sentiment count file.
head ../data/interim/uniq/sentiment_uniq.txt

### Uniq txt files loaded in as dataframes

#### Offensive uniq dataframe

In [None]:
# Every non-empty line of the uniq file is "<count> <token repr>". Split each line on
# whitespace so column 0 holds the count and column 1 the token.
# The file is read line by line because recent pandas versions raise a
# ValueError when read_csv is given sep="\n".
with open("../data/interim/uniq/offensive_uniq.txt", encoding="utf-8") as f:
    off_uniq = pd.DataFrame([line.split() for line in f if line.strip()])
off_uniq[0] = off_uniq[0].astype(int) #typecast elements in column 0 to integers
off_uniq = off_uniq.replace("^\'|[',]{1,2}$", "", regex=True)  #Regex to remove the '', in the tokens they're present

off_uniq


#"^\d{1,5}" for start 1-5 ints

#### Sentiment Uniq Dataframe

In [None]:
# Every non-empty line of the uniq file is "<count> <token repr>". Split each line on
# whitespace so column 0 holds the count and column 1 the token.
# The file is read line by line because recent pandas versions raise a
# ValueError when read_csv is given sep="\n".
with open("../data/interim/uniq/sentiment_uniq.txt", encoding="utf-8") as f:
    sent_uniq = pd.DataFrame([line.split() for line in f if line.strip()])
sent_uniq[0] = sent_uniq[0].astype(int) #typecast elements in column 0 to integers
sent_uniq = sent_uniq.replace("^\'|[',]{1,2}$", "", regex=True)  #Regex to remove the '', in the tokens they're present

sent_uniq


### type/token ratio

In [None]:
# Types: number of distinct tokens, i.e. one row per type in the uniq frames
off_types, sent_types = len(off_uniq[1]), len(sent_uniq[1])
print("Offensive Types: {}\nSentiment types: {}\n".format(off_types,sent_types))

# Tokens: total number of token occurrences, i.e. the sum of the count column
off_token_amount, sent_token_amount = off_uniq[0].sum(), sent_uniq[0].sum()
print("Offensive tokens, amount: {}\nSentiment tokens, amount: {}\n".format(off_token_amount, sent_token_amount))

# Type/token ratio (ttratio): distinct types divided by total tokens
off_ttratio, sent_ttratio = off_types/off_token_amount, sent_types/sent_token_amount
print("Offensive type/token ratio: {:.4f}\nSentiment type/token ratio: {:.4f}".format(off_ttratio, sent_ttratio))

#### Tokens that only occur 1, 2 or 3 times
<ul>
    <li>Things like Hashtags and emojis are prevalent, but they, more importantly, contain most of the types/vocabulary</li>
    <li>Tokens that occur only once make up 58% of the types in both datasets!</li>
</ul>

In [None]:
# Percentage of types that occur exactly once in each dataset
off_once_pct = (off_uniq[0] == 1).sum()/off_types*100
sent_once_pct = (sent_uniq[0] == 1).sum()/sent_types*100
print("Offensive types w. freq. 1 divided by total types: {:.2f}".format(off_once_pct))
print("Sentiment types w. freq. 1 divided by total types: {:.2f}".format(sent_once_pct))

In [None]:
#Amount of types showing up x times in the offensive dataset (e.g 14000 tokens only showing up once, and so on)
#500 Most common tokens skipped, to make plot visible

# The style has to be set before the axes are created; it does not restyle existing axes.
sns.set_style("darkgrid")
fig, ax = plt.subplots(figsize=(16,9))
sns.countplot(x=0, data=off_uniq[500:], ax=ax) #Sns counts how many types share each frequency, and plots it
ax.set(xlabel="Token frequency", ylabel="Number of types",
       title="Types per token frequency in Offensive dataset (500 most frequent tokens skipped)")
ax.tick_params('x',rotation=45, labelsize = 10) #x tick labels are rotated 45 degrees, label size 10

### Noticeable differences between the two datasets

<ul>
    <li>Big difference in size: the sentiment dataset has over twice as many tokens (= a corpus more than twice the size)</li>
    <li>Otherwise quite similar: in both sets, tokens with frequency 1 make up 58% of the vocabulary</li>
    <ul><li>Both datasets also seem to follow Zipf's law (see the graphs below)</li>
    </ul>
</ul>
    

### Corpus Statistics Consistent with Zipf's law?

In [None]:
# Zipf check, offensive dataset: log(frequency) plotted against log(rank of that frequency)
off_freq = off_uniq[0]
off_uniq["log_frq"] = np.log(off_freq)
off_uniq["log_rank"] = np.log(off_freq.rank(ascending=False))
off_zipf = sns.relplot(data=off_uniq, x="log_rank", y="log_frq", color="red", edgecolor=(0.2,0,0,0.01))
off_zipf.set(title="log-log plot of frequency against rank of frequency in Offensive dataset")
plt.show()

In [None]:
# Zipf check, sentiment dataset: log(frequency) plotted against log(rank of that frequency)
sent_freq = sent_uniq[0]
sent_uniq["log_frq"] = np.log(sent_freq)
sent_uniq["log_rank"] = np.log(sent_freq.rank(ascending=False))
sent_zipf = sns.relplot(data=sent_uniq, x="log_rank", y="log_frq", color="r", edgecolor=(0.2,0,0,0.01))
sent_zipf.set(title="log-log plot of frequency against rank of frequency in Sentiment dataset")
plt.show()

<b>As seen in the above plots, both datasets seem consistent with Zipf's law</b>

## Task 3: Manual Annotation & Inter-user Agreement

### Generating 100 random tweets for manual annotation

In [None]:
random.seed(42) # fixed seed so the same 100 tweets are drawn on every run
indexed_tweets = list(enumerate(sentiment_raw))
random_tweets = random.sample(indexed_tweets, 100)
rtweet_index = [position for position, _ in random_tweets]

# Writing the sample to disk stays commented out: with the fixed seed it would
# just recreate the same file every time.
# with open("../data/interim/random_tweets.txt","w", encoding="utf-8") as f:
#     [f.write(str(i[1])+"\n") for i in random_tweets]

In [None]:
sent_label = pd.read_csv('../data/raw/sentiment/train_labels.txt',header=None)
# Build the text frame straight from the file's lines: recent pandas versions
# raise a ValueError for read_csv(sep="\n"). Keeping exactly one row per line
# (no blank-line skipping, no NA or quote parsing) also keeps row positions
# identical to the line positions in the file.
with open("../data/raw/sentiment/train_text.txt", encoding="utf-8") as f:
    sent_raw = pd.DataFrame([line.rstrip("\n") for line in f])

In [None]:
# Optional inspection of the sampled rows (labels and raw text):
# display(sent_label.iloc[rtweet_index])
# display(sent_raw.iloc[rtweet_index])
# Show the number of elements in sentiment_raw (presumably one per tweet; confirm against its loading cell).
len(sentiment_raw)

In [None]:
# Draw 100 random labels from {0, 1, 2} and store them one per line, for later testing

test_labels = random.choices([0,1,2], k=100)
with open("../data/interim/manual_annotation/random_test.txt","w", encoding="utf-8") as label_file:
    label_file.writelines("{}\n".format(label) for label in test_labels)

### Putting the manually annotated labels into a single dataframe

In [None]:
annotation_dir = "../data/interim/manual_annotation"

# os.listdir returns entries in arbitrary order, so sort them to get a reproducible
# column order. Hidden entries and sub-directories are skipped because they are
# not label files and would make read_csv fail.
man_label_list = sorted(
    name for name in os.listdir(annotation_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(annotation_dir, name))
)

# One single-column integer frame per label file, placed side by side.
df_list = [
    pd.read_csv(os.path.join(annotation_dir, name), header=None, dtype=int)
    for name in man_label_list
]

all_labels = pd.concat(df_list,axis=1)
#all_labels
# True for every row in which all label files agree with the first column
display(all_labels.eq(all_labels.iloc[:,0], axis=0).all(1))
    


In [None]:
# Boolean Series: True where every column carries the same label as the first column
first_column = all_labels.iloc[:, 0]
same_label = all_labels.eq(first_column, axis=0).all(axis=1)
# Number of rows with full agreement
print(same_label.sum())

$$ \textbf{Calculating}\\  A_e$$