In [1]:
import pandas as pd
import numpy as np

In [2]:
#put the path to the training and test directories on your device here
TRAINING_PATH = "mediaeval-2015-trainingset.txt"
TESTING_PATH = "mediaeval-2015-testset.txt"

In [3]:
original_training = pd.read_csv(TRAINING_PATH, delimiter = "\t")
original_testing = pd.read_csv(TESTING_PATH, delimiter = "\t")

In [4]:
#drop all columns apart from the text and the label as none of the other data appears to be useful
original_training = original_training.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)
#Do the same for the testing set
original_testing = original_testing.drop(["tweetId", "userId", "imageId(s)", "username", "timestamp"], axis = 1)

In [5]:
#add a column to store the language, initially empty before langdetect populates it
original_training["lang"] = np.nan
original_testing["lang"] = np.nan

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from langdetect import detect

In [7]:
#Responsible for parsing tweets
class TweetHandler:
    
    def __init__(self):
        snowball_langs = list(SnowballStemmer.languages)
        #some languages are supported by stemming but NOT supported by language specific tokenizing,
        #only the tokens that are in this set are supported by language specific tokenizing
        self.tokenizer_langs = {"da", "nl", "en", "fi", "fr", "de", "it", "pt", "ru", "es", "sv"}
        langdetect_langs = ["ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "illegal", "pt", "ro", "ru", "es", "sv"]
        #a dictionary to map the corresponding snowball and langdetect properties
        self.lang_dict = dict(zip(langdetect_langs, snowball_langs))
        #declare some custom stop words
        self.custom_stops = ["http","nhttp","https"]

    #takes a tweet, detects its language, removes any stop words in the language, tokenizes and stems
    #specific to the detected language and returns the simplified tokens paired with the language
    def parse_tweet(self, tweet):
        
        try:
            lang_prediction = detect(tweet)
            #the nltk name for the predicted language
            nltkprop = self.lang_dict[lang_prediction]
        except:
            #assume english stopwords and stemming if the language cannot be detected
            lang_prediction = "unknown"
            nltkprop = "english"
            
        # if the language is not supported by the tokenizer (including unkown) then assume tokenizing in English, however stemming
        # and stopwords may still be supported in the language that does not support language specific tokenization
        # e.g. arabic, hungarian, romanian so tokenize with the english
        # version of the algorithm if this is the case and use the stemming and stopwords specific to 
        # the language if this is available even if the tokenization algorithm isnt
        # use a python ternary expression to do this
        tokens = word_tokenize(tweet, language = nltkprop if lang_prediction in self.tokenizer_langs else "english")
        
        #stop words specific to the language
        stop_words = set(stopwords.words(nltkprop))
        
        #stemming algorithm specific to the language detected
        stemmer = SnowballStemmer(nltkprop)
        
        # store all tokens to be output as a concatenated string here so that this string
        # can later be fed to a CountVectorizer or TfIDFVectorizer , filter out any unwanted tokens 
        # and don't add them 
        filtered_tokens = ""
        
        for tok in tokens:
            
            #remove any hashtags
            if tok[0] == '#':
                tok = tok[1:]
                
            #discard non alphanumeric strings containing symbols or pure digits, or stop words
            if (not tok.isalnum()) or tok.isdigit() or (tok in stop_words) or tok in self.custom_stops:
                continue;
            
            #carry out stemming specific to the language detected
            filtered_tokens += " " + stemmer.stem(tok)
        
        return filtered_tokens, lang_prediction

In [8]:
from copy import deepcopy

In [9]:
# Transform the dataset from a dataset of tweets into a dataset of labelled tokens in concatenated
# string form, along with the detected language

def transform_data(arg):

    #copy the argument given so we don't change the original instance and can keep it in memory and reuse it 
    #if necessary
    dataset = deepcopy(arg)
    
    th = TweetHandler()
    num_rows = dataset.label.size
    
    #the tweet text will be transformed into tokens so rename the column appropriately
    dataset = dataset.rename(columns = {"tweetText" : "tokens"})
    
    for i in range(num_rows):

        tweet = dataset.tokens[i]
        label = dataset.label[i]

        #disregard the humour information for now, map humor and fake to a single class
        if ("humor" in label) or ("fake" in label):
            label = 1
        else:
            label = 0
        
        tokens, lang = th.parse_tweet(tweet)
        
        #replace the row with the simplified tokens, the mapped labels and the detected language
        dataset.loc[i] = tokens, label, lang
    
    #make sure the label column is converted into a column of integers and not objects
    dataset.label = dataset.label.astype("int")
    return dataset

In [10]:
#transform the data and populate language column
simplified_training = transform_data(original_training)
simplified_testing = transform_data(original_testing)

In [42]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tensorflow.keras as kers

In [30]:
#Dense matrices are required by keras
cv = CountVectorizer(max_features = 10000)
cv.fit(simplified_training.tokens)
train_data = cv.transform(simplified_training.tokens).todense()
train_labels = simplified_training.label 

cv2 = CountVectorizer(max_features = 10000)
cv2.fit(simplified_testing.tokens)
test_data = cv.transform(simplified_testing.tokens).todense()
test_labels = simplified_testing.label

In [31]:
#convert the matrix made by the CountVectorizer and the labels into a tensorflow dataset
dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))

In [24]:
model = keras.Sequential()

#Tries to group words that are similar e.g. great and good, does kind of "clustering" of words with similar meanings
model.add(keras.layers.Embedding(10000, 16))


model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation = "relu"))

#try adding some more layers

#model.add(keras.layers.Dense(16, activation = "relu"))
#model.add(keras.layers.Dense(16, activation = "relu"))
model.add(keras.layers.Dense(1, activation = "sigmoid"))

model.summary()
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

NameError: name 'keras' is not defined

In [28]:
train_data = dataset.take(13000)
validation_data = dataset.skip(13000)
train_data

<TakeDataset shapes: ((10000,), ()), types: (tf.int64, tf.int32)>

In [32]:
dataset.__dict__

{'_tensors': [<tf.Tensor: shape=(14277, 10000), dtype=int64, numpy=
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int64)>,
  <tf.Tensor: shape=(14277,), dtype=int32, numpy=array([1, 1, 1, ..., 1, 1, 1])>],
 '_structure': (TensorSpec(shape=(10000,), dtype=tf.int64, name=None),
  TensorSpec(shape=(), dtype=tf.int32, name=None)),
 '_variant_tensor_attr': <tf.Tensor: shape=(), dtype=variant, numpy=<unprintable>>,
 '_self_setattr_tracking': True,
 '_self_unconditional_checkpoint_dependencies': [TrackableReference(name='_variant_tracker', ref=<tensorflow.python.data.ops.dataset_ops._VariantTracker object at 0x000002B4B6591520>)],
 '_self_unconditional_dependency_names': {'_variant_tracker': <tensorflow.python.data.ops.dataset_ops._VariantTracker at 0x2b4b6591520>},
 '_self_unconditional_deferred_dependencies': {},
 

In [48]:
model = kers.Sequential([
    kers.layers.Dense(10000),
    kers.layers.Dense(10000, activation = tf.nn.sigmoid)
])

In [49]:
model.compile(optimizer = "sgd", loss = "categorical_crossentropy", metrics = ["accuracy"])

In [50]:
model.fit(train_data, train_labels, epochs = 10)

Epoch 1/10


ValueError: in user code:

    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:755 train_step
        loss = self.compiled_loss(
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:203 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\losses.py:152 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\losses.py:256 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\losses.py:1537 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\keras\backend.py:4833 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\George\anaconda3\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1134 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 10000) are incompatible


In [None]:
pd.set_option('display.max_colwidth', None)

In [129]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from random import shuffle

In [144]:
tokenizer = Tokenizer(num_words = 10000)
tokenizer.fit_on_texts(simplified_training.tokens)

raw_train = tokenizer.texts_to_sequences(simplified_training.tokens)
raw_test = tokenizer.texts_to_sequences(simplified_testing.tokens)

padded_train = pad_sequences(raw_train, padding = "post", maxlen = 24)
padded_test = pad_sequences(raw_train, padding = "post", maxlen = 24)

#pair each instance with it's label
unsplitwlabels = list(zip(padded_train, simplified_training.label))
testwlabels = list(zip(padded_test, simplified_testing.label))

#shuffle the training data before splitting it into a validation set
shuffle(unsplitwlabels)

In [145]:
#take the first n elements of the list
trainwlabels = trainwlabels[:12000]

#take the last n elements of the list
validationwlabels = trainwlabels[12000:]

In [146]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

11814

In [184]:
model = Sequential()
model.add(layers.Embedding(input_dim = vocab_size, output_dim = 50, input_length = 24))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation = "relu"))
model.add(layers.Dense(1, activation = "sigmoid"))
model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 24, 50)            590700    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1200)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 10)                12010     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 11        
Total params: 602,721
Trainable params: 602,721
Non-trainable params: 0
_________________________________________________________________


In [185]:
train_data = np.array(list(np.array(x[0]).astype(int) for x in trainwlabels))
train_label = np.array(list(np.array(x[1]).astype(int) for x in trainwlabels))
validation_data = np.array(list(np.array(x[0]).astype(int) for x in validationwlabels))
validation_label = np.array(list(np.array(x[1]).astype(int) for x in validationwlabels))
test_data = np.array(list(np.array(x[0]).astype(int) for x in testwlabels))
test_label = np.array(list(np.array(x[1]).astype(int) for x in testwlabels))

  train_data = np.array(list(np.array(x[0]).astype(int) for x in trainwlabels))


In [None]:
train_data = np.array(list(np.array(x[0]).astype(int) for x in trainwlabels))
train_label = np.array(list(np.array(x[1]).astype(int) for x in trainwlabels))
validation_data = np.array(list(np.array(x[0]).astype(int) for x in validationwlabels))
validation_label = np.array(list(np.array(x[1]).astype(int) for x in validationwlabels))
test_data = np.array(list(np.array(x[0]).astype(int) for x in testwlabels))
test_label = np.array(list(np.array(x[1]).astype(int) for x in testwlabels))

In [189]:
history = model.fit(
    raw_train, simplified_training.label,
    epochs = 20, 
    verbose = True, 
    validation_data = (raw_test, simplified_testing.label)
)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"(<class 'list'> containing values of types set())", '(<class \'list\'> containing values of types {"<class \'int\'>"})'}), <class 'pandas.core.series.Series'>

In [72]:
X_train.sort(key = len)
X_train.reverse()
X_train

[[8072,
  2,
  1,
  1548,
  7,
  8,
  52,
  167,
  478,
  148,
  105,
  310,
  98,
  59,
  2,
  1,
  1224,
  2,
  6,
  1,
  65,
  30,
  339,
  771,
  1463,
  729,
  8073,
  310,
  98,
  59,
  1258,
  1,
  2,
  8074,
  998,
  98,
  59,
  2,
  1,
  1099,
  468,
  311,
  49,
  11,
  25,
  2407,
  1475,
  125,
  620,
  2931,
  8075,
  310,
  98,
  59,
  2,
  1,
  597,
  4233,
  439,
  11,
  25,
  614,
  148,
  321,
  8076,
  310,
  98,
  59,
  204,
  226,
  1259,
  2,
  1,
  82,
  8077,
  8078,
  236,
  98,
  59,
  38,
  350,
  25,
  100,
  2,
  1,
  1099,
  183,
  1012,
  464,
  8079,
  310,
  98,
  59,
  2,
  1,
  14,
  251,
  912,
  100,
  77,
  104,
  63,
  310,
  98,
  59,
  2,
  1,
  144,
  7,
  8,
  52,
  24,
  734,
  361,
  8080,
  236,
  98,
  59,
  16,
  1734,
  76,
  34,
  2,
  1,
  8081,
  310,
  98,
  59,
  314,
  124,
  2,
  1,
  544,
  346,
  8082,
  884,
  8083,
  236,
  98,
  59,
  57,
  311,
  109,
  2,
  1,
  656,
  432,
  113,
  76,
  2,
  1,
  92,
  8084,
  310,
  98,


In [86]:
mapped = list(map (lambda x: len(x), X_train))

In [87]:
mapped.reverse()
mapped

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
