In [None]:
import nltk
import re
import difflib
from nltk import agreement
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random #only used for generating 100 random tweets for manual labelling
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV

from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, classification_report, roc_auc_score, roc_curve, recall_score, precision_score, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
from sklearn.metrics import cohen_kappa_score

from nltk.corpus import stopwords

## All functions for the notebook

During tokenization, punctuation, emojis, and other uninformative strings and characters are removed.

In [None]:
# Required for stopwords
# nltk.download("stopwords")

In [None]:
# English stop words from NLTK (conjunctions, articles and so on), extended with
# 'th' and 'st' -- presumably the suffixes left over from ordinals such as "5th"
# or "1st" once the tokenizer has skipped the digits.
stop_words = stopwords.words('english')
stop_words.extend(['th', 'st'])
print("|".join(stop_words))

In [None]:
def tokenizer(tweets, stopword_list=None):
    """
    Function that takes a list of strings and returns the tokenized version of each string.

    Every tweet is lower-cased and then consumed from left to right. At each
    position the start of the remaining text is handled in this order:

    1. skipped, if it is a run of whitespace/digits, the literal ``@user``,
       a word with a digit in its interior, or a stop word;
    2. emitted as a token, if it is a run of word characters (``\\w+``);
    3. otherwise discarded up to the next position where rule 1 or 2 applies
       (this is what removes punctuation, emojis and similar characters).

    Parameters
    ----------
    tweets : iterable of str
        The raw tweets, one string per tweet.
    stopword_list : iterable of str, optional
        Words to drop. They are matched against the lower-cased text, so they
        should be lower case. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    list of list of str
        One list of tokens per input tweet, in input order.
    """
    if stopword_list is None:
        # Fall back to the list built at the top of the notebook.
        stopword_list = stop_words

    # Compile patterns once for speedup
    token_pat = re.compile(r'\w+')
    skippable_parts = [r'[\s\d]+', r'@user', r'(?:\w+\d\w+)']
    # Escape the words so regex metacharacters in a stop word are taken literally,
    # and drop empty strings: an empty alternative would match zero characters
    # and the loop below would never advance.
    escaped_stopwords = [re.escape(word) for word in stopword_list if word]
    if escaped_stopwords:
        skippable_parts.append(r'\b(?:%s)\b' % '|'.join(escaped_stopwords))
    skippable_pat = re.compile('|'.join(skippable_parts))

    tokenlist = []
    for tweet in tweets:
        line = tweet.lower()
        tokens = []

        while line:
            skippable_match = skippable_pat.match(line)
            if skippable_match:
                line = line[skippable_match.end():]
                continue

            token_match = token_pat.match(line)
            if token_match:
                tokens.append(token_match.group())
                line = line[token_match.end():]
                continue

            # Nothing matches at position 0: throw away everything up to the
            # nearest place where a skippable span or a token begins (or the
            # rest of the line if there is no such place).
            next_starts = [
                found.start()
                for found in (skippable_pat.search(line), token_pat.search(line))
                if found
            ]
            line = line[min(next_starts, default=len(line)):]

        tokenlist.append(tokens)
    return tokenlist


def compare_tokenizers(bool):
    """
    Print NLTK's TweetTokenizer output next to our own tokenizer's output.

    For every tweet in the notebook-level ``training_data`` the raw tweet, the
    TweetTokenizer tokens and the matching entry of the notebook-level
    ``token_tweets`` are printed. Both globals must already exist, and
    ``token_tweets[k]`` is expected to be the tokenization of ``training_data[k]``.

    Parameters
    ----------
    bool : bool
        Nothing is printed unless this flag is truthy.
    """
    if not bool:
        return

    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    for index, tweet in enumerate(training_data):
        print(tweet, "tknzr:", tknzr.tokenize(tweet), "\ntokenlist:", token_tweets[index], "\n")

            
def import_(classification_task, file_name, data_dir="../data/raw"):
    """
    Read a file with one integer per line and return the integers as a list.

    Parameters
    ----------
    classification_task : str
        Name of the sub-directory of ``data_dir`` that holds the file.
    file_name : str
        Name of the file inside that sub-directory.
    data_dir : str, optional
        Base directory of the raw data. Defaults to ``"../data/raw"``.

    Returns
    -------
    list of int
        The parsed value of every line, in file order.

    Raises
    ------
    ValueError
        If a line (including a blank one) cannot be parsed as an integer.
    """
    path = "{}/{}/{}".format(data_dir, classification_task, file_name)
    with open(path, "r", encoding="utf-8") as f:
        # int() ignores surrounding whitespace, including the trailing newline.
        return [int(line) for line in f]


def import_and_tokenize(classification_task, file_name, data_dir="../data/raw"):
    """
    Read a text file line by line and return the tokenized version of every line.

    Parameters
    ----------
    classification_task : str
        Name of the sub-directory of ``data_dir`` that holds the file.
    file_name : str
        Name of the file inside that sub-directory.
    data_dir : str, optional
        Base directory of the raw data. Defaults to ``"../data/raw"``.

    Returns
    -------
    list of list of str
        The result of ``tokenizer`` applied to the list of lines.
    """
    path = "{}/{}/{}".format(data_dir, classification_task, file_name)
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    return tokenizer(lines)


def report_clf_stats(predicted, test, classification_task):
    """
    Print accuracy, a per-class classification report and the confusion matrix.

    Parameters
    ----------
    predicted : sequence of int
        Labels predicted by the classifier.
    test : sequence of int
        The true labels that the predictions are compared against.
    classification_task : str
        ``"offensive"`` or ``"sentiment"``; selects the class names shown in
        the classification report.

    Raises
    ------
    KeyError
        If ``classification_task`` is not one of the two known tasks.
    """
    name_dict = {"offensive": ["Not offensive","Offensive"], "sentiment": ["Negative", "Neutral", "Positive"]}
    print(metrics.accuracy_score(test, predicted))
    # scikit-learn expects (y_true, y_pred); passing them the other way round
    # swaps precision and recall in the report.
    print(metrics.classification_report(test, predicted, target_names=name_dict[classification_task]),"\n")
    print(metrics.confusion_matrix(test, predicted))

## Reading data
### The Offensive Training Data

In [None]:
# Read every offensive training tweet. The context manager closes the file even
# when reading raises, which the manual open()/close() pair did not guarantee.
with open("../data/raw/offensive/train_text.txt", 'r', encoding = "utf-8") as f:
    inputlist = [line for line in f]

# First half of the tweets is used as training data, second half as validation data
training_data, validation_data = inputlist[:len(inputlist)//2], inputlist[len(inputlist)//2:]

In [None]:
token_tweets = tokenizer(training_data)
# Preview only the first few tokenized tweets; printing the whole list floods
# the notebook output with thousands of token lists.
print(token_tweets[:5])

###  Comparing our own tokenizer with TweetTokenizer from nltk library
<b>Set <code>see_output = True</code> in the cell below to run the comparison <i>(it will run for a while)</i></b>


In [None]:
# Comparing our own tokenizer with TweetTokenizer from nltk library.
# The comparison prints one entry per training tweet and takes a while, so it is
# off by default; set 'see_output' = True to run it.
see_output = False
compare_tokenizers(see_output)

### Corpus size of Offensive and sentiment training sets respectively:

In [None]:
%%bash
# wc without options prints the line, word and byte counts of each file
wc ../data/raw/offensive/train_text.txt
wc ../data/raw/sentiment/train_text.txt

<b>Offensive:</b> 11916 lines/tweets, 262370 words <br>
<b>Sentiment:</b> 45615 lines/tweets, 877516 words

### Running tokenizer function on offensive and sentiment training data to get token count right

In [None]:
# Load the raw lines (one tweet per line, newline included) of both training sets
with open("../data/raw/offensive/train_text.txt", mode="r", encoding="utf-8") as f:
    offensive_raw = f.readlines()

with open("../data/raw/sentiment/train_text.txt", mode="r", encoding="utf-8") as f:
    sentiment_raw = f.readlines()


<i>The cell below takes some time to run.</i>

In [None]:
# Tokenize both raw corpora with our own tokenizer (offensive first, then sentiment)
offensive_tokens, sentiment_tokens = map(tokenizer, (offensive_raw, sentiment_raw))

## The top 10 most frequent words of each dataset

In [None]:
# Counting approach from
# https://stackoverflow.com/questions/45019607/count-occurrence-of-a-list-in-a-list-of-lists
# explode() flattens the per-tweet token lists into one long Series of tokens,
# value_counts() then counts each distinct token (most frequent first).
off_uniq = pd.Series(offensive_tokens).explode().value_counts()
sent_uniq = pd.Series(sentiment_tokens).explode().value_counts()

print("Offensive dataset, top 10 tokens:","\n",off_uniq.head(10),"\n")
print("Sentiment dataset, top 10 tokens:","\n",sent_uniq.head(10))

# Turn the count Series into two-column DataFrames ("token", "count") for ease of use later.
# Series -> DataFrame idea from
# https://stackoverflow.com/questions/40224319/pandas-series-to-dataframe-using-series-indexes-as-columns
off_uniq = off_uniq.rename_axis("token").reset_index(name="count")
sent_uniq = sent_uniq.rename_axis("token").reset_index(name="count")

### type/token ratio

In [None]:
# Types: number of distinct tokens in each dataset (one row per distinct token)
off_types, sent_types = len(off_uniq["token"]), len(sent_uniq["token"])
print("Offensive Types: {}\nSentiment types: {}\n".format(off_types,sent_types))

# Tokens: total number of "words" in each dataset (sum of all per-type counts)
off_token_amount, sent_token_amount = off_uniq["count"].sum(), sent_uniq["count"].sum()
print("Offensive tokens, amount: {}\nSentiment tokens, amount: {}\n".format(off_token_amount, sent_token_amount))

# Type/token ratio (=ttratio): distinct tokens divided by total tokens
off_ttratio, sent_ttratio = off_types/off_token_amount, sent_types/sent_token_amount
print("Offensive type/token ratio: {:.4f}\nSentiment type/token ratio: {:.4f}".format(off_ttratio, sent_ttratio))

#### Types that only occur 1, 2 or 3 times
<ul>
    <li>These rare types are mostly things like hashtags and misspelled nouns; more importantly, they make up most of the types in the vocabulary</li>
    <li>Tokens that occur only once make up ~ 50% of the types in both datasets!</li>
</ul>

In [None]:
# Share of types that occur at most three times (frequency 1, 2 or 3)
off_rare = off_uniq["count"].isin([1, 2, 3]).sum()
sent_rare = sent_uniq["count"].isin([1, 2, 3]).sum()
print("Offensive types w. freq 1, 2, or 3 divided by total types: {:.2f}%".format(off_rare/off_types*100))
print("Sentiment types w. freq 1, 2, or 3 divided by total types: {:.2f}%".format(sent_rare/sent_types*100))

print()

# Share of types that occur exactly once
off_single = off_uniq["count"].eq(1).sum()
sent_single = sent_uniq["count"].eq(1).sum()
print("Offensive types w. freq. just 1 divided by total types: {:.2f}%".format(off_single/off_types*100))
print("Sentiment types w. freq. just 1 divided by total types: {:.2f}%".format(sent_single/sent_types*100))

## Machine learning part

### Offensive dataset

In [None]:
random.seed(42)

# Load the offensive task data. NOTE: the variable names do not follow the usual
# X/y convention -- the "ox_" pair is the training split and the "oy_" pair is
# the validation split:
#   ox_train : tokenized training tweets      ox_test : training labels
#   oy_train : tokenized validation tweets    oy_test : validation labels
ox_train, oy_train = (
    import_and_tokenize("offensive", text_file)
    for text_file in ("train_text.txt", "val_text.txt")
)
ox_test, oy_test = (
    import_("offensive", label_file)
    for label_file in ("train_labels.txt", "val_labels.txt")
)


In [None]:
#Pipeline for sgdclassifier
sgd_clf = Pipeline([
     # The tweets are already lists of tokens, so the identity "tokenizer" hands
     # them through unchanged; lowercase=False because a list cannot be lower-cased.
     ('vec', CountVectorizer(tokenizer = lambda x: x, lowercase = False,
                ngram_range=(1,3), max_df = 0.7, min_df = 5, max_features = 5000)),
     ('tfidf', TfidfTransformer(use_idf=False)),
     # Fixed random_state: SGD shuffles the training data, so without a seed the
     # reported scores change from run to run (random.seed does not affect sklearn).
     ('clf', SGDClassifier(loss="hinge", random_state=42)),
])

# ox_train / ox_test hold the training tweets and training labels,
# oy_train / oy_test the validation tweets and validation labels.
sgd_clf.fit(ox_train, ox_test)
sgd_predicted2 = sgd_clf.predict(oy_train)

report_clf_stats(sgd_predicted2, oy_test, "offensive")

<i> Highest Achieved accuracy score for SGDClassifier: 78.6% </i>

In [None]:
# MultinomialNB: bag-of-words counts -> term-frequency scaling (no idf) -> multinomial naive Bayes
multinb_clf = Pipeline(steps=[
    ('vec', CountVectorizer(lowercase=False, tokenizer=lambda x: x)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB()),
])

# Fit on the training tweets/labels, then predict the validation tweets
multinb_predict = multinb_clf.fit(ox_train, ox_test).predict(oy_train)

report_clf_stats(multinb_predict, oy_test, "offensive")

In [None]:
# ComplementNB: bag-of-words counts -> term-frequency scaling (no idf) -> complement naive Bayes
complement_clf = Pipeline(steps=[
    ('vec', CountVectorizer(lowercase=False, tokenizer=lambda x: x)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', ComplementNB()),
])

# Fit on the training tweets/labels, then predict the validation tweets
complement_predict = complement_clf.fit(ox_train, ox_test).predict(oy_train)

report_clf_stats(complement_predict, oy_test, "offensive")

The SVC classifier takes a while to run.

In [None]:
# SVC: bag-of-words counts -> term-frequency scaling (no idf) -> SVM with a cubic polynomial kernel
SVC_clf = Pipeline(steps=[
    ('vec', CountVectorizer(lowercase=False, tokenizer=lambda x: x)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', SVC(degree=3, kernel='poly')),
])

# Fit on the training tweets/labels, then predict the validation tweets
SVC_predict = SVC_clf.fit(ox_train, ox_test).predict(oy_train)

report_clf_stats(SVC_predict, oy_test, "offensive")
