In [1]:
import pandas as pd
import numpy as np

In [2]:
#put the path to the training and test directories on your device here
TRAINING_PATH = "mediaeval-2015-trainingset.txt"
TESTING_PATH = "mediaeval-2015-testset.txt"

In [3]:
original_training = pd.read_csv(TRAINING_PATH, delimiter = "\t")
original_testing = pd.read_csv(TESTING_PATH, delimiter = "\t")

In [4]:
#drop all columns apart from the text and the label as none of the other data appears to be useful
original_training = original_training.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)
#Do the same for the testing set
original_testing = original_testing.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

In [5]:
#add a column to store the language, initially empty before langdetect populates it
original_training["lang"] = np.nan
original_testing["lang"] = np.nan

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from langdetect import detect

In [7]:
#Responsible for parsing tweets
class TweetHandler:
    
    def __init__(self):
        snowball_langs = list(SnowballStemmer.languages)
        #some languages are supported by stemming but NOT supported by language specific tokenizing,
        #only the tokens that are in this set are supported by language specific tokenizing
        self.tokenizer_langs = {"da", "nl", "en", "fi", "fr", "de", "it", "pt", "ru", "es", "sv"}
        langdetect_langs = ["ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "illegal", "pt", "ro", "ru", "es", "sv"]
        #a dictionary to map the corresponding snowball and langdetect properties
        self.lang_dict = dict(zip(langdetect_langs, snowball_langs))
        #declare some custom stop words
        self.custom_stops = ["http","nhttp","https"]

    #takes a tweet, detects its language, removes any stop words in the language, tokenizes and stems
    #specific to the detected language and returns the simplified tokens paired with the language
    def parse_tweet(self, tweet):
        
        try:
            lang_prediction = detect(tweet)
            #the nltk name for the predicted language
            nltkprop = self.lang_dict[lang_prediction]
        except:
            #assume english stopwords and stemming if the language cannot be detected
            lang_prediction = "unknown"
            nltkprop = "english"
            
        # if the language is not supported by the tokenizer (including unkown) then assume tokenizing in English, however stemming
        # and stopwords may still be supported in the language that does not support language specific tokenization
        # e.g. arabic, hungarian, romanian so tokenize with the english
        # version of the algorithm if this is the case and use the stemming and stopwords specific to 
        # the language if this is available even if the tokenization algorithm isnt
        # use a python ternary expression to do this
        tokens = word_tokenize(tweet, language = nltkprop if lang_prediction in self.tokenizer_langs else "english")
        
        #stop words specific to the language
        stop_words = set(stopwords.words(nltkprop))
        
        #stemming algorithm specific to the language detected
        stemmer = SnowballStemmer(nltkprop)
        
        # store all tokens to be output as a concatenated string here so that this string
        # can later be fed to a CountVectorizer or TfIDFVectorizer , filter out any unwanted tokens 
        # and don't add them 
        filtered_tokens = ""
        
        for tok in tokens:
            
            #remove any hashtags
            if tok[0] == '#':
                tok = tok[1:]
                
            #discard non alphanumeric strings containing symbols or pure digits, or stop words
            if (not tok.isalnum()) or tok.isdigit() or (tok in stop_words) or tok in self.custom_stops:
                continue;
            
            #carry out stemming specific to the language detected
            filtered_tokens += " " + stemmer.stem(tok)
        
        return filtered_tokens, lang_prediction

In [8]:
from copy import deepcopy

In [9]:
# Transform the dataset from a dataset of tweets into a dataset of labelled tokens in concatenated
# string form, along with the detected language

def transform_data(arg):

    #copy the argument given so we don't change the original instance and can keep it in memory and reuse it 
    #if necessary
    dataset = deepcopy(arg)
    
    th = TweetHandler()
    num_rows = dataset.label.size
    
    #the tweet text will be transformed into tokens so rename the column appropriately
    dataset = dataset.rename(columns = {"tweetText" : "tokens"})
    
    for i in range(num_rows):

        tweet = dataset.tokens[i]
        label = dataset.label[i]

        #disregard the humour information for now, map humor and fake to a single class
        if ("humor" in label) or ("fake" in label):
            label = 1
        else:
            label = 0
        
        tokens, lang = th.parse_tweet(tweet)
        
        #replace the row with the simplified tokens, the mapped labels and the detected language
        dataset.loc[i] = tokens, label, lang
    
    #make sure the label column is converted into a column of integers and not objects
    dataset.label = dataset.label.astype("int")
    return dataset

In [10]:
#transform the data and populate language column
simplified_training = transform_data(original_training)
simplified_testing = transform_data(original_testing)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [19]:
sgd_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,7), max_features = 23000)),
    ("sgd", SGDClassifier(alpha = 0.0001, l1_ratio = 0.6, penalty = "elasticnet", random_state = 1, n_jobs = -1))
])

bnb_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,2), max_features = 5000)),
    ("bnb", BernoulliNB(alpha = 2))
])

rf_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 18000)),
    ("rf", RandomForestClassifier(n_estimators = 800, max_depth = None, max_features = "auto", random_state = 1, n_jobs = -1))
])

mnb_pipe = Pipeline([
    ("tf", TfidfVectorizer(ngram_range = (1,9))),
    ("mnb", MultinomialNB(alpha = 0.25))
])

vcf = VotingClassifier([
    #("sgd", sgd_pipe),
    ("bnb", bnb_pipe),
    ("mnb", mnb_pipe),
    ("rf", rf_pipe)
], voting = "hard", n_jobs = -1)

vcf.fit(simplified_training.tokens, simplified_training.label)

VotingClassifier(estimators=[('bnb',
                              Pipeline(steps=[('cv',
                                               CountVectorizer(max_features=5000,
                                                               ngram_range=(1,
                                                                            2))),
                                              ('bnb', BernoulliNB(alpha=2))])),
                             ('mnb',
                              Pipeline(steps=[('tf',
                                               TfidfVectorizer(ngram_range=(1,
                                                                            9))),
                                              ('mnb',
                                               MultinomialNB(alpha=0.25))])),
                             ('rf',
                              Pipeline(steps=[('cv',
                                               CountVectorizer(max_features=18000)),
                              

In [20]:
true = simplified_testing.label
predictions = vcf.predict(simplified_testing.tokens)

In [21]:
def report(true, predictions):
    
    print("-------------------- REPORT --------------------\n")
    
    #Format the scores to 2 decimal places
    print("F1 score:", "%0.2f" % f1_score(true, predictions))
    print("\nPrecision score:", "%0.2f" % precision_score(true, predictions))
    print("\nRecall score:", "%0.2f" % recall_score(true, predictions))
        
    print("\nConfusion matrix:\n\n", confusion_matrix(true,predictions))

In [22]:
report(true, predictions)

-------------------- REPORT --------------------

F1 score: 0.90

Precision score: 0.87

Recall score: 0.94

Confusion matrix:

 [[ 850  359]
 [ 164 2382]]
