In [1]:
import pandas as pd
import numpy as np

In [2]:
#put the path to the training and test directories on your device here
TRAINING_PATH = "mediaeval-2015-trainingset.txt"
TESTING_PATH = "mediaeval-2015-testset.txt"

In [3]:
original_training = pd.read_csv(TRAINING_PATH, delimiter = "\t")
original_testing = pd.read_csv(TESTING_PATH, delimiter = "\t")

In [4]:
#drop all columns apart from the text and the label as none of the other data appears to be useful
original_training = original_training.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)
#Do the same for the testing set
original_testing = original_testing.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

In [5]:
#add a column to store the language, initially empty before langdetect populates it
original_training["lang"] = np.nan
original_testing["lang"] = np.nan

In [None]:
original_training = original_training.sample(frac = 1)
original_testing = original_testing

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from langdetect import detect

In [7]:
#Responsible for parsing tweets
class TweetHandler:
    
    def __init__(self):
        snowball_langs = list(SnowballStemmer.languages)
        #some languages are supported by stemming but NOT supported by language specific tokenizing,
        #only the tokens that are in this set are supported by language specific tokenizing
        self.tokenizer_langs = {"da", "nl", "en", "fi", "fr", "de", "it", "pt", "ru", "es", "sv"}
        langdetect_langs = ["ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "illegal", "pt", "ro", "ru", "es", "sv"]
        #a dictionary to map the corresponding snowball and langdetect properties
        self.lang_dict = dict(zip(langdetect_langs, snowball_langs))
        #declare some custom stop words
        self.custom_stops = ["http","nhttp","https"]

    #takes a tweet, detects its language, removes any stop words in the language, tokenizes and stems
    #specific to the detected language and returns the simplified tokens paired with the language
    def parse_tweet(self, tweet):
        
        try:
            lang_prediction = detect(tweet)
            #the nltk name for the predicted language
            nltkprop = self.lang_dict[lang_prediction]
        except:
            #assume english stopwords and stemming if the language cannot be detected
            lang_prediction = "unknown"
            nltkprop = "english"
            
        # if the language is not supported by the tokenizer (including unkown) then assume tokenizing in English, however stemming
        # and stopwords may still be supported in the language that does not support language specific tokenization
        # e.g. arabic, hungarian, romanian so tokenize with the english
        # version of the algorithm if this is the case and use the stemming and stopwords specific to 
        # the language if this is available even if the tokenization algorithm isnt
        # use a python ternary expression to do this
        tokens = word_tokenize(tweet, language = nltkprop if lang_prediction in self.tokenizer_langs else "english")
        
        #stop words specific to the language
        stop_words = set(stopwords.words(nltkprop))
        
        #stemming algorithm specific to the language detected
        stemmer = SnowballStemmer(nltkprop)
        
        # store all tokens to be output as a concatenated string here so that this string
        # can later be fed to a CountVectorizer or TfIDFVectorizer , filter out any unwanted tokens 
        # and don't add them 
        filtered_tokens = ""
        
        for tok in tokens:
            
            #remove any hashtags
            if tok[0] == '#':
                tok = tok[1:]
                
            #discard non alphanumeric strings containing symbols or pure digits, or stop words
            if (not tok.isalnum()) or tok.isdigit() or (tok in stop_words) or tok in self.custom_stops:
                continue;
            
            #carry out stemming specific to the language detected
            filtered_tokens += " " + stemmer.stem(tok)
        
        return filtered_tokens, lang_prediction

In [8]:
from copy import deepcopy

In [9]:
# Transform the dataset from a dataset of tweets into a dataset of labelled tokens in concatenated
# string form, along with the detected language

def transform_data(arg):

    #copy the argument given so we don't change the original instance and can keep it in memory and reuse it 
    #if necessary
    dataset = deepcopy(arg)
    
    th = TweetHandler()
    num_rows = dataset.label.size
    
    #the tweet text will be transformed into tokens so rename the column appropriately
    dataset = dataset.rename(columns = {"tweetText" : "tokens"})
    
    for i in range(num_rows):

        tweet = dataset.tokens[i]
        label = dataset.label[i]

        #disregard the humour information for now, map humor and fake to a single class
        if ("humor" in label) or ("fake" in label):
            label = 1
        else:
            label = 0
        
        tokens, lang = th.parse_tweet(tweet)
        
        #replace the row with the simplified tokens, the mapped labels and the detected language
        dataset.loc[i] = tokens, label, lang
    
    #make sure the label column is converted into a column of integers and not objects
    dataset.label = dataset.label.astype("int")
    return dataset

In [10]:
#transform the data and populate language column
simplified_training = transform_data(original_training)
simplified_testing = transform_data(original_testing)

In [161]:
simplified_training = simplified_training.sample(frac = 1)
simplified_training

Unnamed: 0,tokens,label,lang
3446,cafeinoman acojona sandy,1,pt
13756,gambar ini bukan mh370 ini adalah gambar dari...,1,unknown
1583,better pic porch fish shark sandi,1,en
13647,sochi serv all food in ass olymp how putin ass,1,en
7200,dope hurricanesandi,1,en
...,...,...,...
6450,y asi pas tiburon call wildwood pas huracan s...,1,es
6484,huracan sandy new york dios cuid tod person,1,es
6411,sandi,1,en
11302,rememb presid bush vacat hurrican katrina thi...,0,en


In [162]:
simplified_testing = simplified_testing.sample(frac = 1)
simplified_testing

Unnamed: 0,tokens,label,lang
3314,syria syrian hero boy rescu girl shootout,1,en
1716,nepal histor dharahara tower collaps massiv e...,0,en
2398,syria syrian hero boy rescu girl shootout see...,1,unknown
1303,nepal amp histor dharahara tower collaps mass...,0,en
1636,nepal histor dharahara tower collaps massiv e...,0,en
...,...,...,...
3197,syria syrian hero boy rescu girl shootout مجه...,1,en
2583,syrian hero boy rescu girl while under sniper...,1,en
2713,syria syrian hero boy rescu girl shootout الط...,1,en
2957,awwww rt syrian boy appear brave sniper fire ...,1,en


In [163]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# TESTING STUFF
from sklearn.neighbors import KNeighborsClassifier

In [164]:
from keras.wrappers.scikit_learn import KerasClassifier

In [173]:
sgd_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,7), max_features = 23000)),
    ("sgd", SGDClassifier(alpha = 0.0001, l1_ratio = 0.6, penalty = "elasticnet", random_state = 10, n_jobs = -1))
])

svc_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 3000)),
    ("svc", LinearSVC(C = 1)),
])

mnb_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range = (1,9))),
    ("mnb", MultinomialNB(alpha = 0.25))
])

rf_pipe = Pipeline([
    ("cv", CountVectorizer(ngram_range = (1,1), max_features = 18000)),
    ("rf", RandomForestClassifier(n_estimators = 800, max_depth = None, max_features = "auto", random_state = 10, n_jobs = -1))
])

vcf = VotingClassifier([
    ("sgd", sgd_pipe),
    ("svc", svc_pipe),
    ("mnb", mnb_pipe),
    ("rf", rf_pipe)
], voting = "hard", n_jobs = -1)

vcf.fit(simplified_training.tokens, simplified_training.label)

VotingClassifier(estimators=[('sgd',
                              Pipeline(steps=[('cv',
                                               CountVectorizer(max_features=23000,
                                                               ngram_range=(1,
                                                                            7))),
                                              ('sgd',
                                               SGDClassifier(l1_ratio=0.6,
                                                             n_jobs=-1,
                                                             penalty='elasticnet',
                                                             random_state=10))])),
                             ('svc',
                              Pipeline(steps=[('cv',
                                               CountVectorizer(max_features=3000)),
                                              ('svc', LinearSVC(C=1))])),
                             ('mnb',
              

In [174]:
true = simplified_testing.label
predictions = vcf.predict(simplified_testing.tokens)

In [175]:
def report(true, predictions):
    
    print("-------------------- REPORT --------------------\n")
    
    #Format the scores to 2 decimal places
    print("F1 score:", "%0.2f" % f1_score(true, predictions))
    print("\nPrecision score:", "%0.2f" % precision_score(true, predictions))
    print("\nRecall score:", "%0.2f" % recall_score(true, predictions))
        
    print("\nConfusion matrix:\n\n", confusion_matrix(true,predictions))

In [176]:
print(classification_report(true, predictions))

              precision    recall  f1-score   support

           0       0.69      0.78      0.73      1209
           1       0.89      0.83      0.86      2546

    accuracy                           0.82      3755
   macro avg       0.79      0.81      0.80      3755
weighted avg       0.82      0.82      0.82      3755



In [29]:
knn_pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("knn", KNeighborsClassifier())
])

knn_pipe.fit(simplified_training.tokens, simplified_training.label)

true = simplified_testing.label
predictions = knn_pipe.predict(simplified_testing.tokens)

report(true, predictions)

-------------------- REPORT --------------------

F1 score: 0.80

Precision score: 0.68

Recall score: 0.97

Confusion matrix:

 [[  35 1174]
 [  85 2461]]


In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from random import shuffle

In [132]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(simplified_training.tokens)

raw_train = tokenizer.texts_to_sequences(simplified_training.tokens)
raw_test = tokenizer.texts_to_sequences(simplified_testing.tokens)

padded_train = pad_sequences(raw_train, padding = "post", maxlen = 24)
padded_test = pad_sequences(raw_train, padding = "post", maxlen = 24)

#pair each instance with it's label
unsplitwlabels = list(zip(padded_train, simplified_training.label))
testwlabels = list(zip(padded_test, simplified_testing.label))

#shuffle the training data before splitting it into a validation set
shuffle(unsplitwlabels)

In [133]:
#split into training and validation

#take the first n elements of the list
trainwlabels = unsplitwlabels[:12000]

#take the last n elements of the list
validationwlabels = unsplitwlabels[12000:]

In [134]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

11836

In [156]:

model = Sequential()

#word2vec library
model.add(layers.Embedding(input_dim = vocab_size, output_dim = 5, input_length = 24))

model.add(layers.AveragePooling1D(24))
model.add(layers.Flatten())
model.add(layers.Dense(5, activation = "relu"))
model.add(layers.Dense(1, activation = "sigmoid"))
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 24, 5)             59180     
_________________________________________________________________
average_pooling1d_9 (Average (None, 1, 5)              0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 5)                 0         
_________________________________________________________________
dense_22 (Dense)             (None, 5)                 30        
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 6         
Total params: 59,216
Trainable params: 59,216
Non-trainable params: 0
_________________________________________________________________


In [136]:
train_data = np.array(list(np.array(x[0]).astype(int) for x in trainwlabels))
train_label = np.array(list(np.array(x[1]).astype(int) for x in trainwlabels))
validation_data = np.array(list(np.array(x[0]).astype(int) for x in validationwlabels))
validation_label = np.array(list(np.array(x[1]).astype(int) for x in validationwlabels))
test_data = np.array(list(np.array(x[0]).astype(int) for x in testwlabels))
test_label = np.array(list(np.array(x[1]).astype(int) for x in testwlabels))

In [137]:
model.fit(
    train_data, train_label,
    epochs = 100, 
    verbose = False, #True, 
    validation_data = (validation_data, validation_label)
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [159]:
#trains the model and returns it for use with sklearn
def forskl():
    
    tokenizer = Tokenizer(num_words = 5000)
    tokenizer.fit_on_texts(simplified_training.tokens)

    raw_train = tokenizer.texts_to_sequences(simplified_training.tokens)
    raw_test = tokenizer.texts_to_sequences(simplified_testing.tokens)

    padded_train = pad_sequences(raw_train, padding = "post", maxlen = 24)
    padded_test = pad_sequences(raw_train, padding = "post", maxlen = 24)

    #pair each instance with it's label
    unsplitwlabels = list(zip(padded_train, simplified_training.label))
    testwlabels = list(zip(padded_test, simplified_testing.label))

    #shuffle the training data before splitting it into a validation set
    shuffle(unsplitwlabels)
    
    train_data = np.array(list(np.array(x[0]).astype(int) for x in trainwlabels))
    train_label = np.array(list(np.array(x[1]).astype(int) for x in trainwlabels))
    validation_data = np.array(list(np.array(x[0]).astype(int) for x in validationwlabels))
    validation_label = np.array(list(np.array(x[1]).astype(int) for x in validationwlabels))
    test_data = np.array(list(np.array(x[0]).astype(int) for x in testwlabels))
    test_label = np.array(list(np.array(x[1]).astype(int) for x in testwlabels))
    
    model = build_model()
    model.fit(
        train_data, train_label,
        epochs = 100, 
        verbose = False, #True, 
        validation_data = (validation_data, validation_label)
    )
    
    return model

In [138]:
model.predict(test_data).shape

(3755, 1)

In [101]:
len(test_data)
len(predictions)

3755

In [139]:
predictions = model.predict(train_data)
predictions = (predictions > 0.5).astype('int').reshape(-1)

In [103]:
sum(predictions > 0.5)/len(predictions)

0.66275

In [122]:
from sklearn.metrics import classification_report

In [140]:
print(classification_report(train_label, predictions))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96      4189
           1       0.96      0.99      0.98      7811

    accuracy                           0.97     12000
   macro avg       0.97      0.96      0.97     12000
weighted avg       0.97      0.97      0.97     12000



In [141]:
predictions = model.predict(test_data)
predictions = (predictions > 0.5).astype('int').reshape(-1)

In [142]:
print(classification_report(test_label, predictions))

              precision    recall  f1-score   support

           0       0.37      0.02      0.04      1209
           1       0.68      0.98      0.80      2546

    accuracy                           0.67      3755
   macro avg       0.52      0.50      0.42      3755
weighted avg       0.58      0.67      0.56      3755

