In [None]:
import re
import difflib
from nltk import agreement
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random #As of now only used for generating 100 random tweets for manual labelling
from sklearn.metrics import cohen_kappa_score

from collections import Counter

## Reading data
### The Offensive Training Data

In [None]:
# Read the offensive training tweets, one tweet per list entry (trailing newline kept).
# The context manager closes the file even if reading raises.
with open("../data/raw/offensive/train_text.txt", 'r', encoding = "utf-8") as f:
    inputlist = [line for line in f]

# Read the matching labels, one integer per line.
with open("../data/raw/offensive/train_labels.txt", 'r', encoding = "utf-8") as f:
    offensive_labels = [int(i.strip("\n")) for i in f]

# Tweets and labels are paired by position; a length mismatch would silently
# attach labels to the wrong tweets in the split below.
assert len(inputlist) == len(offensive_labels), "offensive texts and labels differ in length"

# First half is used for training, second half for validation.
split_at = len(inputlist)//2
off_training_data, off_validation_data = inputlist[:split_at], inputlist[split_at:]
off_training_labels, off_validation_labels = offensive_labels[:split_at], offensive_labels[split_at:]

## Tokenizer as Function:

In [None]:
## patterns
def tokenizer(tweets):
    """
    Function that takes a list of strings and returns the tokenized version of each string

    A token is either
      * a maximal run of word characters, '@', '’' or '#' (so handles, hashtags and
        words containing ’ stay in one piece), or
      * a single character that is neither whitespace nor one of the above
        (punctuation, emoji code points, ...).
    Whitespace only separates tokens and is never returned.

    Parameters:
        tweets: iterable of str, one tweet per element.

    Returns:
        list of lists of str: the tokens of each tweet, in input order.
    """
    # Whitespace, token characters and "everything else" are disjoint classes that
    # together cover every character, so a single left-to-right findall over the two
    # token alternatives yields exactly the tokens and silently skips whitespace.
    # This replaces repeated re.search calls over the remaining line (quadratic in
    # the line length) with one linear pass.
    token_pat = re.compile(r'[\w@’#]+|[^@’#\w\s]')

    return [token_pat.findall(tweet) for tweet in tweets]



In [None]:
token_tweets = tokenizer(off_training_data)
#[print(*i) for i in token_tweets]

In [None]:
#TweetTokenizer Initialisation
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# Print NLTK's tokenisation next to our own for every training tweet so the
# differences can be inspected by eye. zip pairs each raw tweet with its token
# list, which replaces the manual index counter.
for tweet, own_tokens in zip(off_training_data, token_tweets):
    nltk_tokens = tknzr.tokenize(tweet)  # tokenise once per tweet
    print(tweet,"tknzr:",nltk_tokens,"\ntokenlist:",own_tokens,"\n")

## Exercise 2:
### Corpus size of the offensive and sentiment training sets, respectively:

In [None]:
%%bash
wc ../data/raw/offensive/train_text.txt
wc ../data/raw/sentiment/train_text.txt

<b>Offensive:</b> 11916 lines/tweets, 262370 words <br>
<b>Sentiment:</b> 35615 lines/tweets, 877516 words

### Running tokenizer func on offensive and sentiment training data to get token count right

In [None]:
# Load both raw training corpora: one list entry per line of the file,
# with the trailing newline of each line kept.
with open("../data/raw/offensive/train_text.txt", "r", encoding="utf-8") as f:
    offensive_raw = list(f)

with open("../data/raw/sentiment/train_text.txt", "r", encoding="utf-8") as f:
    sentiment_raw = list(f)


In [None]:
offensive_tokens = tokenizer(offensive_raw)
sentiment_tokens = tokenizer(sentiment_raw)

In [None]:
#from https://stackoverflow.com/questions/45019607/count-occurrence-of-a-list-in-a-list-of-lists
#Transformation found at:https://stackoverflow.com/questions/40224319/pandas-series-to-dataframe-using-series-indexes-as-columns

# Flatten the per-tweet token lists, count every distinct token, and turn the
# result into a two-column dataframe ("token", "count") for ease of use later.
off_uniq = (
    pd.Series(offensive_tokens)
    .explode()
    .value_counts()
    .rename_axis("token")
    .reset_index(name="count")
)
sent_uniq = (
    pd.Series(sentiment_tokens)
    .explode()
    .value_counts()
    .rename_axis("token")
    .reset_index(name="count")
)

# value_counts sorts by descending count, so the first ten rows are the top ten tokens
print("Offensive dataset, top 10 tokens:","\n",off_uniq[:10],"\n")
print("Sentiment dataset, top 10 tokens:","\n",sent_uniq[:10])


### Saving tokens to file, running commandline operation on these new files

In [None]:
# def make_token_files():
#     """
#     Function that makes offensive_tokens.txt and sentiment_tokens.txt files
#     """
#     with open("../data/interim/tokenized/offensive_tokens.txt","w", encoding = "utf-8") as f:
#         for i in offensive_tokens:
#             string = str(i)
#             f.write(string[1:-1]+"\n")

#     with open("../data/interim/tokenized/sentiment_tokens.txt","w", encoding = "utf-8") as f:
#         for i in sentiment_tokens:
#             string = str(i)
#             f.write(string[1:-1]+"\n")

### Tokens and their counts:

In [None]:
%%bash
tr ' ' '\n' <../data/interim/tokenized/offensive_tokens.txt | sort | uniq -c | sort -r > ../data/interim/uniq/offensive_uniq.txt
tr " " "\n" <../data/interim/tokenized/sentiment_tokens.txt | sort | uniq -c | sort -r > ../data/interim/uniq/sentiment_uniq.txt


#### The ten most frequent tokens in "offensive_training":

In [None]:
%%bash
head ../data/interim/uniq/offensive_uniq.txt

#### The ten most frequent tokens in "sentiment_training":

In [None]:
%%bash
head ../data/interim/uniq/sentiment_uniq.txt

### Uniq txt files loaded in as dataframes

#### Offensive uniq dataframe

In [None]:
# off_uniq = pd.read_csv("../data/interim/uniq/offensive_uniq.txt", sep="\n", names = ["count","token"])
# off_uniq = off_uniq["count"].str.split(expand=True) #split the values to get count and tokens in different columns
# off_uniq[0] = off_uniq[0].astype(int) #typecast elements in column 0 to integers
# off_uniq.replace("^\'|[',]{1,2}$", "", regex=True, inplace=True)  #Regex to remove the '', in the tokens they're present

# off_uniq


# #"^\d{1,5}" for start 1-5 ints

#### Sentiment Uniq Dataframe

In [None]:
# sent_uniq = pd.read_csv("../data/interim/uniq/sentiment_uniq.txt", sep="\n", names = ["count","token"])
# sent_uniq = sent_uniq["count"].str.split(expand=True) #split the values to get count and tokens in different columns
# sent_uniq[0] = sent_uniq[0].astype(int) #typecast elements in column 0 to integers
# sent_uniq.replace("^\'|[',]{1,2}$", "", regex=True, inplace=True)  # "^\d{1,5}" for start 1-5 ints

# sent_uniq


### type/token ratio

In [None]:
# Types: number of distinct tokens, i.e. one per row of the frequency tables
off_types, sent_types = len(off_uniq["token"]), len(sent_uniq["token"])

# Tokens: total number of token occurrences, i.e. the sum of all counts
off_token_amount, sent_token_amount = off_uniq["count"].sum(), sent_uniq["count"].sum()

# Type/token ratio (=ttratio)
off_ttratio, sent_ttratio = off_types/off_token_amount, sent_types/sent_token_amount

print("Offensive Types: {}\nSentiment types: {}\n".format(off_types,sent_types))
print("Offensive tokens, amount: {}\nSentiment tokens, amount: {}\n".format(off_token_amount, sent_token_amount))
print("Offensive type/token ratio: {:.4f}\nSentiment type/token ratio: {:.4f}".format(off_ttratio, sent_ttratio))

#### Tokens that only occur 1, 2 or 3 times
<ul>
    <li>Things like Hashtags and emojis are prevalent, but they, more importantly, contain most of the types/vocabulary</li>
    <li>Tokens that occur only once make up 58% of the types in both datasets!</li>
</ul>

In [None]:
# Boolean masks over the frequency tables: types seen at most three times, and exactly once
off_rare, sent_rare = off_uniq["count"].isin([1, 2, 3]), sent_uniq["count"].isin([1, 2, 3])
off_once, sent_once = off_uniq["count"] == 1, sent_uniq["count"] == 1

# Summing a boolean mask counts its True entries; the results are percentages of all types
print("offensive types w. freq 1, 2, or 3 divided by total types: {:.2f}".format(
    off_rare.sum()/off_types*100))
print("Sentiment types w. freq 1, 2, or 3 divided by total types: {:.2f}".format(
    sent_rare.sum()/sent_types*100))

print()

print("Offensive types w. freq. just 1 divided by total types: {:.2f}".format(off_once.sum()/off_types*100))
print("Sentiment types w. freq. just 1 divided by total types: {:.2f}".format(sent_once.sum()/sent_types*100))

In [None]:
len(off_uniq.loc[(off_uniq["count"]==1)])

In [None]:
# sns.displot(x="token", data="off_uniq")
sns.__version__

In [None]:
# Amount of types showing up x times in the offensive dataset (e.g 14000 tokens only showing up once, and so on)
# Skipping top 500 types, for visibility in plot (They're not impactful on the plot otherwise, the most frequent of 
#     these 500 entries is 13)
#Plotting visual and double y-axes found at https://stackoverflow.com/questions/33179122/seaborn-countplot-with-frequencies

# The seaborn style is only picked up by axes created after it is set, so it has
# to be set before plt.subplots(); otherwise the first run is drawn unstyled.
sns.set_style("darkgrid")

fig, ax = plt.subplots(figsize=(16,9))
# Draw on `ax` explicitly instead of relying on the "current axes" state
sns.countplot(x="count", data=off_uniq[500:], ax=ax) #Sns counts the type frequency of each word, and plots it
ax.set_title("Frequency of 500 types showing up x times in the Offensive dataset")
ax.set_xlabel("Type occuring x time(s)")
ax.set_ylabel("sum of types occurring x time(s)")
ax.tick_params('x',rotation=45, labelsize = 10) #xlabels are rotated 45 degrees and made bigger

# Twin axes, creating and visualising
ax2 = ax.twinx()
ax2.set_ylabel("Frequency (percent)")

# Moving the ticks and labels of y-axes to opposite sides for more visually pleasing plot
ax2.yaxis.tick_left()
ax.yaxis.tick_right()
ax.yaxis.set_label_position('right')
ax2.yaxis.set_label_position('left')

# Setting appropriate limits for the y-axes: the full number of types on the
# count axis corresponds to 100 on the percent axis
ax.set_ylim(0,len(off_uniq))
ax2.set_ylim(0,100)
# Removing the duplicate grid. grid(False) switches it off explicitly, whereas
# grid(None) toggles the current state and can switch it on.
ax2.grid(False)

### Noticeable differences between the two datasets

<ul>
    <li>Big difference in size, sentiment dataset over twice the amount of tokens (=library twice the size)</li>
    <li>otherwise quite similar, in both sets the percentage of the vocabulary made up of tokens w. frq. 1 is ~ 56-57%</li>
    <ul><li>Both datasets also seem to follow Zipf's law (see below graphs)</li>
    </ul>
</ul>
    

### Corpus Statistics Consistent with Zipf's law?

In [None]:
# Log-log plot of token frequency against frequency rank in the offensive dataset.
# Under Zipf's law the points should fall roughly on a straight, downward-sloping line.
# Adds two helper columns to off_uniq: log_frq and log_rank.
off_uniq["log_frq"] = np.log(off_uniq["count"])
# ascending=False: the most frequent token gets the lowest rank
off_uniq["log_rank"] = np.log(off_uniq["count"].rank(ascending=False))
sns.relplot(x="log_rank",y="log_frq", data=off_uniq, color="red", edgecolor=(0.2,0,0,0.01)).set(title=
                "log-log plot of frequency against rank of frequency in Offensive dataset")
plt.show()

In [None]:
# Log-log plot of token frequency against frequency rank in the sentiment dataset.
# Under Zipf's law the points should fall roughly on a straight, downward-sloping line.
# Adds two helper columns to sent_uniq: log_frq and log_rank.
sent_uniq["log_frq"] = np.log(sent_uniq["count"])
# ascending=False: the most frequent token gets the lowest rank
sent_uniq["log_rank"] = np.log(sent_uniq["count"].rank(ascending=False))
sns.relplot(x="log_rank",y="log_frq", data=sent_uniq, color="r", edgecolor=(0.2,0,0,0.01)).set(title=
                "log-log plot of frequency against rank of frequency in Sentiment dataset")
plt.show()

<b>As seen in the above plots, both datasets seem consistent with Zipf's law</b>

## Task 3: Manual Annotation & Inter-user Agreement

### Generating 100 random tweets for manual annotation

In [None]:
random.seed(42) #Seeded for consistency
random_tweets = random.sample(list(enumerate(sentiment_raw)),100)
rtweet_index = [i[0] for i in random_tweets]

# #File-generation is commented out, as the randomness is seeded, thus Making the same "Random" file every time
# with open("../data/interim/random_tweets.txt","w", encoding="utf-8") as f:
#     [f.write(str(i[1])+"\n") for i in random_tweets]

In [None]:
# Gold sentiment labels, one integer per line, in column 0
sent_label = pd.read_csv('../data/raw/sentiment/train_labels.txt',header=None)

# Raw sentiment tweets, one per row in column 0.
# The file is read line by line instead of with pd.read_csv(sep="\n") because
#   * recent pandas versions raise a ValueError for "\n" as separator, and
#   * read_csv skips blank lines by default, which would shift the rows relative to
#     the positions stored in rtweet_index (those come from enumerating the raw lines).
with open("../data/raw/sentiment/train_text.txt", "r", encoding="utf-8") as f:
    sent_raw = pd.DataFrame([line.rstrip("\n") for line in f])

In [None]:
# display(sent_label.iloc[rtweet_index])
# display(sent_raw.iloc[rtweet_index])
len(sent_raw)

In [None]:
#creating 100 random ints from the interval [0-2], for later testing

# test_labels = random.choices([0,1,2], k=100)
# with open("../data/interim/manual_annotation/random_test.txt","w", encoding="utf-8") as f:
#     [f.write(str(i)+"\n") for i in test_labels]

### Putting the manually annotated labels into a single dataframe

In [None]:
# All manual labels, one column per annotator; the last column of the file
# (_A0_value) is not used and is dropped right away
man_labels = pd.read_csv("../data/interim/manual_annotation/all_combined.csv", delimiter=",").iloc[:,:-1]
display(man_labels) #The manually annotated labels, put into a dataframe

# A tweet counts as agreed upon when every annotator's label equals the first annotator's label
first_annotator = man_labels.iloc[:,0]
same_label = man_labels.eq(first_annotator, axis=0).all(axis=1)
display(same_label)

# Observed agreement = share of tweets on which all annotators agree
n_equal_labels = np.sum(same_label)
print("# of equal labels:",n_equal_labels)
obs_agreement = n_equal_labels/len(first_annotator)
print("observed agreement:",obs_agreement)

### Calculating Chance-corrected agreement

In [None]:
#Formatting manual label answers to calculate Scott's pi, Fleiss' kappa with nltk.agreement
# Each entry is [annotator id, tweet number, label]; annotator ids start at 1.
tweets_len = len(man_labels.iloc[:,0])
n_annotators = len(man_labels.columns)

#Formatting of only the manually annotated data
formatted_answers = [
    [annotator+1, tweet_num, man_labels.iloc[tweet_num, annotator]]
    for annotator in range(n_annotators)
    for tweet_num in range(tweets_len)
]

#Formatting of BOTH the manually annotated data AND the "True" Annotation of the data:
# the "True" labels are added as one extra annotator with the next free id
true_label_list = list(sent_label.iloc[rtweet_index][0])
all_formatted = formatted_answers + [
    [n_annotators+1, tweet_num, true_label_list[tweet_num]]
    for tweet_num in range(tweets_len)
]

print("lenght of formatted_answers:",len(formatted_answers))
print("length of all_formatted:",len(all_formatted)) #should be 100 entries longer than the above

#### Chance-corrected for just the manual labels

In [None]:
"""
Note that in the nltk.agreement documentation: https://www.nltk.org/_modules/nltk/metrics/agreement.html
the returned value is the chance-corrected agreement, not just A_e.
"""
ratingtask = agreement.AnnotationTask(data=formatted_answers)
print("Scott's pi: {:.4f}\nCohen's kappa: {:.4f}\nFleiss' kappa: {:.4f}".format(ratingtask.pi(),ratingtask.kappa(),ratingtask.multi_kappa()))


#### Chance-corrected for both the manual labels AND the "true" labels

In [None]:
all_label_rating = agreement.AnnotationTask(data=all_formatted)
print("Scott's pi: {:.4f}\nCohen's kappa: {:.4f}\nFleiss' kappa: {:.4f}".format(all_label_rating.pi(),all_label_rating.kappa(),all_label_rating.multi_kappa()))


### Showing the tweets with agreeing/disagreeing manual labels for later discussion, saved to file

In [None]:
same_label[same_label==False]
manual_tweets = sent_raw.loc[rtweet_index]

#tweets labels disagree on
annotation_disagree = manual_tweets.iloc[np.where(same_label==False)]
display(annotation_disagree[:10])#Showing the 10 first tweets with disagreeing manual annotation

#tweets labels agree on
annotation_agree = manual_tweets.iloc[np.where(same_label==True)]

# File creation commented out
# annotation_disagree.to_csv("../data/interim/man_anno_disagree.txt", header=None, index=False)
# annotation_agree.to_csv("../data/interim/man_anno_agree.txt", header=None, index=False)

man_labels[same_label==True]["anno_1"][:10]

### Showing the 10 pairs of inter-annotator agreements possible from 5 annotators (4 manual and the "True" labels)

In [None]:
man_and_true = man_labels.copy()
man_and_true["true"] = true_label_list
man_and_true


In [None]:
# Cohen's kappa for every unordered pair of columns (manual annotators plus the "true" labels).
n_columns = len(man_and_true.columns)
# The "true" labels are the last column; deriving its position from the frame
# keeps the output labels correct for any number of annotators.
true_column = n_columns - 1
for i in range(n_columns):
    for j in range(i+1, n_columns):
        l1 = list(man_and_true.iloc[:,i])
        l2 = list(man_and_true.iloc[:,j])
        score = cohen_kappa_score(l1,l2)
        if j==true_column:
            print("Anno_{}, True:\n".format(i+1),score,"\n")
        else:
            print("Anno_{}, Anno_{}:\n".format(i+1,j+1),score,"\n")


## Automatic Prediction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics



In [None]:
count_vec = CountVectorizer(tokenizer = lambda x: x, lowercase = False)

### First model on Offensive Language

In [None]:
# NOT USING OWN TOKENIZER, DONT USE IN FINAL REPORT
WRONG_OFF = count_vec.fit_transform(offensive_raw[:len(offensive_raw)//2])
WRONG_OFF
#

In [None]:
# Looping through tokenized data, making it lower-case
for index, tweet in enumerate(token_tweets):
    token_tweets[index] = str(tweet).lower()
    
#Running Count_vectorizor (Pipeline for the coming commands)
off_train = count_vec.fit_transform(token_tweets)
off_train

In [None]:
#Running tf_idf on off_train to "balance" tweets
tf_idf_transformer = TfidfTransformer(use_idf = False)
tf_off_train = tf_idf_transformer.fit_transform(off_train)
tf_off_train

In [None]:
classifier = SGDClassifier(loss="log")

In [None]:
off_validation_tokens = tokenizer(off_validation_data)
for index, tweet in enumerate(off_validation_tokens):
    off_validation_tokens[index] = str(tweet).lower()

In [None]:
# Training the model on the term-frequency features of the training tweets
off_clf = classifier.fit(tf_off_train, off_training_labels)

# Preparing validation data: transform() (not fit_transform()) is used so the
# validation tweets go through the vectorizer and tf transformer fitted on the training tweets
off_pred = count_vec.transform(off_validation_tokens)
tf_off_pred = tf_idf_transformer.transform(off_pred)

# Predicting labels for the validation data with the trained model
off_predicted = off_clf.predict(tf_off_pred)

# Fraction of validation tweets whose predicted label equals the gold label (accuracy)
sum(off_predicted == off_validation_labels) / len(off_validation_labels)

In [None]:
print(metrics.classification_report(off_validation_labels, off_predicted, target_names=["Not offensive","Offensive"]))

In [None]:
print(metrics.confusion_matrix(off_validation_labels, off_predicted))

As seen above, the recall is very close to 1 for non-offensive tweets and very close to 0 for offensive tweets.<br>
This means that the model predicts that most of the tweets are not offensive, and the only reason for our relatively high accuracy is that the training data is unbalanced.

In [None]:
# Comparing the class distribution of the predictions with that of the gold validation labels
np_list_off = np.array(off_predicted)
np_off_validation = np.array(off_validation_labels)

# count_nonzero over an equality mask gives the number of entries with that label
predicted_non_offensive = np.count_nonzero(np_list_off == 0)
predicted_offensive = np.count_nonzero(np_list_off == 1)
print("Number of predicted non-offensive tweets: {}\nNumber of predicted offensive tweets: {}\n".format(
    predicted_non_offensive, predicted_offensive))

actual_non_offensive = np.count_nonzero(np_off_validation == 0)
actual_offensive = np.count_nonzero(np_off_validation == 1)
print("Number of actual non-offensive tweets: {}\nNumber of actual offensive tweets: {}".format(
    actual_non_offensive, actual_offensive))

<i>"For your final systems, I suggest you report overall Accuracy as well as Precision, Recall and F-score for all classes.
"</i> - Lecturer