In [None]:
from tensorflow.keras.utils import to_categorical


In [None]:
import pandas as pandas
import numpy as numpy
import tensorflow as tf

from keras_preprocessing.text import Tokenizer
from gensim.models import KeyedVectors
from keras_preprocessing.sequence import pad_sequences
from keras import Sequential, Model
from keras.layers import Conv1D, Dropout, Dense, Embedding, MaxPooling1D, Concatenate, Flatten, Input, LSTM
from keras.layers.merge import concatenate
from sklearn.utils import resample

In [None]:
# Directory holding the Fake News Challenge CSV files read by the data loaders.
PATH = "../input/fake-news-challenge/"


In [None]:
# Seed the NumPy and TensorFlow random number generators with one fixed value.
# With a fixed seed the same pseudo-random numbers are drawn on every run, which
# makes results reproducible; without it every invocation yields different numbers.
RANDOM_SEED = 42
numpy.random.seed(seed=RANDOM_SEED)
tf.random.set_seed(seed=RANDOM_SEED)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# load the data set from the train csv files
def load_train_data():
    """Load, merge, label-encode and class-balance the training set.

    Reads ``train_bodies.csv`` and ``train_stances.csv`` from ``PATH``, joins
    them on ``Body ID`` so headline, body and stance share one row, maps the
    textual stances to integers (agree=0, disagree=1, discuss=2, unrelated=3)
    and resamples every class to the same number of rows.

    Returns:
        pandas.DataFrame: the balanced frame, shuffled with ``RANDOM_SEED``.
    """
    bodies = pandas.read_csv(PATH + "train_bodies.csv", encoding='utf-8')
    headlines = pandas.read_csv(PATH + "train_stances.csv", encoding='utf-8')

    train_data_set = pandas.merge(bodies, headlines, how='left', on='Body ID')
    stances = {
        'Stance': {
          'agree': 0,
          'disagree': 1,
          'discuss': 2,
          'unrelated': 3,
        }
    }
    train_data_set = train_data_set.replace(stances)
    print("original here")
    print(train_data_set)

    # Class sizes noted by the authors:
    # unrelated - 36545, discuss - 8909, agree - 3678, disagree - 840.
    # Every class is resampled to the size of the "discuss" class.
    data_length = 8909

    # (stance code, sample with replacement?) in the original concatenation order.
    # The minority classes (agree, disagree) are oversampled with replacement so
    # the model sees more of them; the majority class (unrelated) is undersampled
    # so the model is not skewed towards it.
    resampling_plan = [(3, False), (2, False), (0, True), (1, True)]
    all_resampled = [
        resample(
            train_data_set.loc[train_data_set['Stance'] == stance_code],
            replace=with_replacement,
            n_samples=data_length,
            random_state=RANDOM_SEED,
        )
        for stance_code, with_replacement in resampling_plan
    ]

    # Shuffle with an explicit seed so the row order does not depend on how much
    # of the global NumPy random state earlier cells have consumed.
    result = pandas.concat(all_resampled).sample(frac=1, random_state=RANDOM_SEED)

    print(result['Stance'].value_counts())
    print("result here")
    print(result)

    return result

In [None]:
def load_test_data():
    """Load the competition test set and encode its stances as integers.

    Reads the test bodies from ``PATH`` and the test stances from the
    ``competition-test-stances`` input directory, joins them on ``Body ID`` and
    replaces the textual stance with its integer code
    (agree=0, disagree=1, discuss=2, unrelated=3).

    Returns:
        pandas.DataFrame: the merged test frame (no resampling is applied).
    """
    stance_codes = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}

    bodies_frame = pandas.read_csv(PATH + "competition_test_bodies.csv", encoding='utf-8')
    stances_frame = pandas.read_csv(
        '../input/competition-test-stances/competition_test_stances.csv',
        encoding='utf-8',
    )

    # Left join keeps one row per (body, headline) pair found for each body.
    test_data_set = pandas.merge(bodies_frame, stances_frame, how='left', on='Body ID')
    test_data_set.replace({'Stance': stance_codes}, inplace=True)

    # Show the class distribution of the test set.
    print(test_data_set['Stance'].value_counts())

    return test_data_set


In [None]:
import pickle as pkl

def _load_saved_tokenizers():
    """Return the (bodies, headlines) tokenizers pickled by an earlier fit, or None."""
    try:
        # These pickles are written by prepare_data itself; never point this at
        # files from an untrusted source, since unpickling can execute code.
        with open('bodies_tokenizer.pkl', 'rb') as handle:
            bodies_tokenizer = pkl.load(handle)
        with open('headlines_tokenizer.pkl', 'rb') as handle:
            headlines_tokenizer = pkl.load(handle)
    except FileNotFoundError:
        return None
    return bodies_tokenizer, headlines_tokenizer


def prepare_data(data_set, length=None, tokenizers=None):
    """Turn the text columns of ``data_set`` into padded integer sequences.

    Args:
        data_set: DataFrame with ``articleBody``, ``Headline`` and ``Stance`` columns.
        length: optional ``[bodies_max_length, headlines_max_length]``. When
            omitted, the longest body/headline (in whitespace-separated words)
            of ``data_set`` is used and the call is treated as the training fit.
            When given, the data is treated as evaluation/inference data that
            must share the training vocabulary.
        tokenizers: optional ``(bodies_tokenizer, headlines_tokenizer)`` pair of
            already fitted tokenizers to reuse instead of fitting new ones.

    Returns:
        tuple: ``(bodies_sequences, headlines_sequences, bodies_word_index,
        headlines_word_index, stances)``.

    Tokenizers are fitted (and pickled to ``bodies_tokenizer.pkl`` /
    ``headlines_tokenizer.pkl``) only when none are supplied and none can be
    reused. Fitting a new tokenizer on test or single-sample data would assign
    word indices unrelated to the training vocabulary, and would overwrite the
    saved training tokenizers.
    """
    if length:
        bodies_max_length, headlines_max_length = length[0], length[1]
    else:
        # Longest text of each column, so nothing is truncated when padding.
        bodies_max_length = data_set['articleBody'].map(lambda x: len(x.split())).max()
        headlines_max_length = data_set['Headline'].map(lambda x: len(x.split())).max()

    # A caller that passes `length` is reusing the training dimensions, so it
    # should reuse the training vocabulary as well, if it has been saved.
    if tokenizers is None and length:
        tokenizers = _load_saved_tokenizers()

    if tokenizers is not None:
        bodies_tokenizer, headlines_tokenizer = tokenizers
    else:
        # Fitting builds the vocabulary index by word frequency: the lower the
        # index, the more frequent the word.
        bodies_tokenizer, headlines_tokenizer = (Tokenizer(), Tokenizer())
        bodies_tokenizer.fit_on_texts(data_set['articleBody'])
        headlines_tokenizer.fit_on_texts(data_set['Headline'])

        with open('bodies_tokenizer.pkl', 'wb') as handle:
            pkl.dump(bodies_tokenizer, handle, protocol=pkl.HIGHEST_PROTOCOL)
        with open('headlines_tokenizer.pkl', 'wb') as handle:
            pkl.dump(headlines_tokenizer, handle, protocol=pkl.HIGHEST_PROTOCOL)

    # Replace every known word with its integer index from the vocabulary.
    bodies_sequences = bodies_tokenizer.texts_to_sequences(data_set['articleBody'])
    headlines_sequences = headlines_tokenizer.texts_to_sequences(data_set['Headline'])

    # Pad/truncate at the end so every row has the same length.
    bodies_sequences = pad_sequences(bodies_sequences, maxlen=bodies_max_length, padding='post', truncating='post')
    headlines_sequences = pad_sequences(headlines_sequences, maxlen=headlines_max_length, padding='post', truncating='post')

    return bodies_sequences, headlines_sequences, bodies_tokenizer.word_index, headlines_tokenizer.word_index, data_set['Stance']


def create_embeddings(bodies_word_index, headlines_word_index):
    """Build word2vec embedding matrices for the body and headline vocabularies.

    Args:
        bodies_word_index: mapping of word -> integer index for article bodies.
        headlines_word_index: mapping of word -> integer index for headlines.

    Returns:
        tuple: ``(bodies_embeddings_matrix, headlines_embeddings_matrix)``, each
        of shape ``(len(word_index) + 1, 300)``. Row ``i`` holds the word2vec
        vector of the word with index ``i``; words missing from the word2vec
        model keep an all-zero row.
    """
    word2vec_model = KeyedVectors.load_word2vec_format(
        '../input/googlenewsvectors/' + "GoogleNews-vectors-negative300.bin",
        binary=True,
    )

    def build_matrix(word_index):
        # One extra row because the matrix is indexed directly by word index
        # and must also contain row 0.
        matrix = numpy.zeros((len(word_index) + 1, 300))
        for word, i in word_index.items():
            if word in word2vec_model:
                matrix[i] = word2vec_model[word]
        return matrix

    return build_matrix(bodies_word_index), build_matrix(headlines_word_index)

if __name__ == '__main__':
    # Load both data sets first: the class-balanced training frame and the
    # competition test frame.
    train_data = load_train_data()
    test_data = load_test_data()

    # Training data defines the padded sequence lengths.
    (bodies_sequences,
     headlines_sequences,
     bodies_word_index,
     headlines_word_index,
     stances) = prepare_data(train_data)

    # Test data is padded/truncated to the training lengths so it fits the model.
    (test_bodies_sequences,
     test_headlines_sequences,
     test_bodies_word_index,
     test_headlines_word_index,
     test_stances) = prepare_data(
        test_data,
        length=[bodies_sequences.shape[1], headlines_sequences.shape[1]],
    )

    # Pretrained word vectors for the two training vocabularies.
    bodies_embeddings_matrix, headlines_embeddings_matrix = create_embeddings(
        bodies_word_index, headlines_word_index
    )

    bodies_vocab_size = len(bodies_word_index)
    headlines_vocab_size = len(headlines_word_index)


In [None]:
def _add_conv_lstm_stage(model, filters, kernel_size):
    """Append one Conv1D -> Dropout -> MaxPooling1D -> LSTM stage to ``model``."""
    # Convolution: slides filters over the sequence to build a feature map;
    # relu zeroes out negative activations.
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    # Dropout: randomly zeroes inputs during training to reduce overfitting,
    # i.e. fitting the *training* set so closely that test accuracy suffers.
    model.add(Dropout(0.5))
    # Pooling: summarises neighbouring positions, shrinking the feature map.
    model.add(MaxPooling1D(pool_size=2, padding="same"))
    model.add(LSTM(100, activation='tanh', return_sequences=True))


def create_model(embedding_matrix, vocab_size, input_length):
    """Build one text branch: frozen embeddings followed by Conv/LSTM stages.

    Args:
        embedding_matrix: 2-D array of pretrained vectors with
            ``vocab_size + 1`` rows; its second dimension sets the embedding width.
        vocab_size: number of words in the vocabulary.
        input_length: length of the padded input sequences.

    Returns:
        keras.Sequential: the branch model, ending in a ``Flatten`` layer.
    """
    model = Sequential()
    # Use the pretrained vectors as fixed (non-trainable) word embeddings.
    model.add(Embedding(vocab_size + 1, embedding_matrix.shape[1], weights=[embedding_matrix],
                        trainable=False, input_length=input_length))

    # (filters, kernel_size) of each stage, in order.
    stages = [(256, 3), (256, 3), (512, 3), (512, 3)]
    if input_length >= 512:
        # Only long inputs get a fifth kernel-size-3 stage.
        stages.append((512, 3))
    stages.append((768, 1))

    for filters, kernel_size in stages:
        _add_conv_lstm_stage(model, filters, kernel_size)

    model.add(Flatten())

    return model

In [None]:
from keras.utils.vis_utils import plot_model

# One branch per text input: article bodies and headlines.
bodies_model = create_model(embedding_matrix=bodies_embeddings_matrix, vocab_size=bodies_vocab_size, input_length=bodies_sequences.shape[1])

headlines_model = create_model(embedding_matrix=headlines_embeddings_matrix, vocab_size=headlines_vocab_size, input_length=headlines_sequences.shape[1])

print(bodies_vocab_size)
print(headlines_vocab_size)

print(bodies_model.input)
print(headlines_model.input)
print(bodies_model.output)
print(headlines_model.output)

# Join the two branch outputs and classify with a dense head.
finalModel = Concatenate()([bodies_model.output, headlines_model.output])
finalModel = Flatten()(finalModel)
finalModel = Dense(1024, activation='relu')(finalModel)
finalModel = Dense(1024, activation='relu')(finalModel)
finalModel = Dense(1024, activation='relu')(finalModel)
# Four-way softmax over the one-hot encoded stance codes:
# 0 (agree):     [1,0,0,0]
# 1 (disagree):  [0,1,0,0]
# 2 (discuss):   [0,0,1,0]
# 3 (unrelated): [0,0,0,1]
finalModel = Dense(4, activation='softmax')(finalModel)

model = Model(inputs=[bodies_model.input, headlines_model.input], outputs=finalModel)

opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

plot_model(model, to_file='./' + 'model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
import keras
from IPython.display import clear_output
from matplotlib import pyplot as plt
class PlotLearning(keras.callbacks.Callback):
    """Keras callback that redraws loss and accuracy curves after every epoch.

    The left panel shows training/validation loss on a log scale; the right
    panel shows training/validation accuracy.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of training."""
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        self.acc = []
        self.val_acc = []
        self.logs = []

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's metrics and redraw both panels."""
        logs = logs or {}

        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        # The metric is logged under the name given to compile()
        # ('accuracy'); fall back to the older 'acc' spelling.
        self.acc.append(logs.get('accuracy', logs.get('acc')))
        self.val_acc.append(logs.get('val_accuracy', logs.get('val_acc')))
        self.i += 1
        f, (ax1, ax2) = plt.subplots(1, 2, sharex=True)

        # Replace the previous epoch's plot instead of stacking outputs.
        clear_output(wait=True)

        ax1.set_yscale('log')
        ax1.plot(self.x, self.losses, label="loss")
        ax1.plot(self.x, self.val_losses, label="val_loss")
        ax1.legend()

        ax2.plot(self.x, self.acc, label="accuracy")
        ax2.plot(self.x, self.val_acc, label="validation accuracy")
        ax2.legend()

        plt.show()
        # Release the figure so one figure per epoch does not accumulate in memory.
        plt.close(f)

# NOTE(review): this instance is not added to `callbacks_list` in the training
# cell, so as written it is never invoked by model.fit.
plot = PlotLearning()

In [None]:
# File that ModelCheckpoint writes the best weights to during training.
filepath = "./" + "test.best.hdf5"

In [None]:
 # print(headlines_sequences[4].size)
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

print(bodies_sequences.shape)
print(headlines_sequences.shape)
print(stances)
# Fix the width to the 4 stance classes so the targets always match the 4-unit
# softmax output, even if a class happened to be absent from `stances`.
onehot_stances = to_categorical(stances, num_classes=4)
print(onehot_stances)


# Inverse-frequency weight per stance code (0 agree, 1 disagree, 2 discuss, 3 unrelated).
stances_counts = train_data['Stance'].value_counts()
count_total = stances_counts.sum()
class_weights = {
    stance_code: 1 / (stances_counts[stance_code]) * (count_total) / 2.0
    for stance_code in (0, 1, 2, 3)
}
# NOTE(review): class_weights is computed but not passed to model.fit below.
# load_train_data resamples every class to the same size, so these weights come
# out equal; confirm whether class_weight=class_weights was intended.

# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# NOTE(review): the minority classes were oversampled with replacement before
# this split, so the validation rows may duplicate training rows and
# val_accuracy may be optimistic; confirm.
model.fit([bodies_sequences, headlines_sequences],
              onehot_stances, batch_size=16,
              epochs=50, callbacks=callbacks_list,
              validation_split=0.05,
              shuffle=True,
              )


model.save('./' + 'model')
    

In [None]:
# Restore the best checkpoint written by ModelCheckpoint during training.
model.load_weights(filepath)

from tensorflow.keras.utils import to_categorical
# Fix the width to the 4 stance classes so the targets match the model output
# even if a class is missing from the test labels.
test_onehot_stances = to_categorical(test_stances, num_classes=4)
print(len(bodies_sequences))
print(len(test_bodies_sequences), test_headlines_sequences[0], test_onehot_stances[0])
# Returns [loss, accuracy] on the competition test set.
model.evaluate([test_bodies_sequences, test_headlines_sequences], test_onehot_stances)

In [None]:
import pandas as pd
import numpy as np
def test(headline, body):
  """Predict and print the stance of ``headline`` towards ``body``.

  Args:
      headline: headline text.
      body: article body text.

  Prints the raw softmax output followed by the predicted stance label.
  """
  data = {'Headline': [headline], 'articleBody': [body], 'Stance': [None]}
  df = pd.DataFrame.from_dict(data)

  # Pad to the lengths the model was built with (the padded training
  # sequences) instead of hard-coded numbers that could drift out of sync.
  input_lengths = [bodies_sequences.shape[1], headlines_sequences.shape[1]]
  # Distinct local names so the notebook-level training arrays are not shadowed.
  body_input, headline_input, _body_index, _headline_index, _stance = prepare_data(df, input_lengths)

  stance_labels = {
      0: "agree",
      1: "disagree",
      2: "discuss",
      3: "unrelated"
  }
  prediction = model.predict([body_input, headline_input])
  print(prediction)
  # prediction has one row per input sample; only one sample is passed here.
  print(stance_labels[int(np.argmax(prediction[0]))])
test("Pope Francis loves Donald Trump", '''Pope Francis hates Donlad Trump''')