### Please fill in the following paths.

In [None]:
# Folder holding the test dataset files ("digi.json" and "libertatea.json").
# Every entry of this folder is listed and read, so it should contain only those files.
TEST_PATH = ""

# Folder holding the train dataset files ("protv.json", "cancan.json" and "wowbiz.json").
# Every entry of this folder is listed and read, so it should contain only those files.
TRAIN_PATH = ""

# Path where the checkpoint with the best validation accuracy is saved during training
# (and reloaded from afterwards).
CHECKPOINT_PATH =  ""

# Path to the fastText embeddings file "cc.ro.300.vec" (Romanian, 300 dimensions).
# Download page given by the original authors: https://fasttext.cc/docs/en/pretrained-vectors.html
# NOTE(review): the "cc.*.300.vec" files appear to be listed on
# https://fasttext.cc/docs/en/crawl-vectors.html instead - confirm the link.
fasttext_embeddings_path = "" 

# Folder in which the tokenizers and the trained model are saved.
# File names are appended to this value with plain string concatenation,
# so it must end with a path separator (e.g. "/content/output/").
FOLDER_PATH = ""

### Reading test and train datasets

In [None]:
! pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! pip install keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow
from keras.layers import GlobalMaxPooling1D
from keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Concatenate, Input, Flatten, Dropout, Bidirectional
from keras.models import Model
from keras.callbacks import ModelCheckpoint

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

import itertools
from sklearn.model_selection import train_test_split
import fasttext
from tensorflow.keras.utils import to_categorical
import os
import json
import pandas as pd
import pickle

In [None]:
def read_file(path, name):
  """Read one JSON news file and return its articles with a numeric label.

  Args:
    path: Folder that contains the file.
    name: File name inside ``path``.

  Returns:
    A list of dicts with the keys "title", "content" and "category", where
    "category" is 0 for "nonclickbait" and 1 for any other value.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
    KeyError: If an element lacks "title", "content" or "category".
  """
  file_path = os.path.join(path, name)

  # Decode explicitly as UTF-8 (the platform default may differ and would
  # corrupt Romanian diacritics) and close the handle deterministically.
  with open(file_path, encoding="utf-8") as reader:
    json_array = json.load(reader)

  # nonclickbait = 0
  # clickbait = 1
  return [
      {
          "title": element["title"],
          "content": element["content"],
          "category": 0 if element["category"] == "nonclickbait" else 1,
      }
      for element in json_array
  ]

In [None]:
def read_raw_data(folder_path):
  """Read every ``.json`` file of a folder and concatenate their articles.

  Files are processed in sorted name order and each file name is printed
  before it is read. Entries that are not regular ``.json`` files (for
  example sub-folders such as ``.ipynb_checkpoints`` or stray text files)
  are skipped instead of being passed to the JSON parser.

  Args:
    folder_path: Folder containing the dataset files.

  Returns:
    A single list with the articles of all files, as produced by
    ``read_file``.
  """
  filenames = sorted(
      entry
      for entry in os.listdir(folder_path)
      if entry.lower().endswith(".json")
      and os.path.isfile(os.path.join(folder_path, entry))
  )

  raw_data = []
  for filename in filenames:
    print(filename)
    raw_data.extend(read_file(folder_path, filename))

  return raw_data

In [None]:
# Load every file found in the test folder; read_raw_data prints each
# file name as it reads it, so the names appear under the heading below.
print('Test files:')
test_raw_data  = read_raw_data(TEST_PATH)
print("---------------------")
# Same for the training folder.
print('Train files:')
train_raw_data = read_raw_data(TRAIN_PATH)
print("---------------------")

In [None]:
# One row per article with the columns "title", "content" and "category"
# (0 = nonclickbait, 1 = clickbait), as produced by read_file.
df_train = pd.DataFrame(train_raw_data)
df_test = pd.DataFrame(test_raw_data)

### Classification.

In [None]:
def load_fasttext_embeddings(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a fastText ``.vec`` file.

    The file is streamed line by line and only vectors of words present in
    ``word_index`` are parsed, so the full pretrained vocabulary is never
    held in memory.

    Args:
        filepath: Path to a text embeddings file with one
            ``word v1 v2 ... vN`` entry per line (space separated). An
            optional first line of the form ``<count> <dim>`` is ignored.
        word_index: Mapping ``word -> row index``. Every index must be
            smaller than ``len(word_index) + 1``.
        embedding_dim: Number of components per vector.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Row ``i`` holds the
        vector of the word mapped to ``i``; rows of words missing from the
        file (and row 0) stay all zeros.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            values = line.rstrip().split(' ')

            # The .vec header ("<vocabulary size> <dimension>") is not a word
            # vector; treating it as one would broadcast a single number
            # into a whole row if that number is also a vocabulary token.
            if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                continue

            # Ignore lines that do not carry exactly embedding_dim components.
            if len(values) != embedding_dim + 1:
                continue

            row = word_index.get(values[0])
            if row is not None:
                embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

In [None]:
# Sequence lengths (in tokens) that titles / contents are padded or truncated to.
max_length_title = 49
max_length_content = 9401 
# Number of components of each word vector (matches the 300-dimensional fastText file).
embedding_dim = 300
# Vocabulary caps passed as num_words to the title / content tokenizers.
max_words_title = 12000 
max_words_content = 25000 
# NOTE(review): the three *_shape values below are not referenced by any other
# cell visible in this notebook.
input_shape_title = (max_length_title,)
input_shape_content = (max_length_content,)
output_shape = 2
# Number of target classes, used for one-hot encoding the labels.
num_classes = 2

In [None]:
# Fit one tokenizer per text field, using the training articles only.
tokenizer_title = Tokenizer(num_words=max_words_title, lower=True)
tokenizer_content = Tokenizer(num_words=max_words_content, lower=True)
tokenizer_title.fit_on_texts(df_train['title'])
tokenizer_content.fit_on_texts(df_train['content'])

# Effective vocabulary sizes: the configured cap, or the fitted vocabulary
# (+1 for the padding index) when that is smaller.
num_words_title = min(len(tokenizer_title.word_index) + 1, max_words_title)
num_words_content = min(len(tokenizer_content.word_index) + 1, max_words_content)

# Encode the training texts as integer sequences and bring them to a fixed
# length, padding and truncating at the end of each sequence.
X_title = pad_sequences(
    tokenizer_title.texts_to_sequences(df_train['title']),
    maxlen=max_length_title,
    padding='post',
    truncating='post',
)
X_content = pad_sequences(
    tokenizer_content.texts_to_sequences(df_train['content']),
    maxlen=max_length_content,
    padding='post',
    truncating='post',
)

In [None]:
# Persist the fitted tokenizers so that inference code can reuse the exact
# same vocabularies. FOLDER_PATH is concatenated as-is, so it has to end
# with a path separator.
tokenizer_title_path = FOLDER_PATH + "tokenizer_title.pickle"
tokenizer_content_path = FOLDER_PATH + "tokenizer_content.pickle"

with open(tokenizer_title_path, 'wb') as handle:
    pickle.dump(tokenizer_title, handle, protocol=pickle.HIGHEST_PROTOCOL)

# The content file must hold the content tokenizer; previously the title
# tokenizer was written here a second time.
with open(tokenizer_content_path, 'wb') as handle:
    pickle.dump(tokenizer_content, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Keep only the first num_words_* entries of each tokenizer vocabulary, in
# the iteration order of word_index (presumably most frequent first - confirm
# against the Keras Tokenizer documentation).
title_vocab_items = list(tokenizer_title.word_index.items())
content_vocab_items = list(tokenizer_content.word_index.items())

title_words = dict(title_vocab_items[:num_words_title])
content_words = dict(content_vocab_items[:num_words_content])

In [None]:
# Build one pretrained embedding matrix per vocabulary (titles and contents).
# Each call opens and parses the embeddings file again, so the file is read twice here.
embedding_matrix_title = load_fasttext_embeddings(fasttext_embeddings_path, title_words, embedding_dim)
embedding_matrix_content = load_fasttext_embeddings(fasttext_embeddings_path, content_words, embedding_dim)

In [None]:
Y = df_train["category"].values

# Hold out 20% of the training articles for validation. The shuffle is seeded
# so that re-running the notebook yields the same train/validation split.
X_title_train, X_title_val, X_content_train, X_content_val, y_train, y_val = train_test_split(
    X_title, X_content, Y, test_size=0.2, shuffle=True, random_state=42
)

# One-hot encode the labels for the softmax / categorical cross-entropy head.
y_train = to_categorical(y_train, num_classes)
y_val = to_categorical(y_val, num_classes)

In [None]:
# Two-branch BiLSTM classifier: one branch reads the title, the other the
# content; both are max-pooled over time, concatenated and fed to a dense head.

num_epochs = 10
batch_size = 32

# input layers

input_title = Input(shape=(max_length_title,))
input_content = Input(shape=(max_length_content,))

#---------------------------------------------------------------------------------------------------------------------------

# embedding layers

# input_dim is taken from the pretrained matrix itself. Deriving it from
# num_words_* + 1 only matches the matrix when the fitted vocabulary is at
# least as large as the configured cap; with a smaller vocabulary the weight
# shape and the layer shape would disagree.
embedding_title = Embedding(input_dim=embedding_matrix_title.shape[0], output_dim=embedding_dim, weights=[embedding_matrix_title], input_length=max_length_title, trainable=False)(input_title)
embedding_content = Embedding(input_dim=embedding_matrix_content.shape[0], output_dim=embedding_dim, weights=[embedding_matrix_content], input_length=max_length_content, trainable=False)(input_content)

#---------------------------------------------------------------------------------------------------------------------------

# lstm layers

# NOTE(review): lstm_title_2 and lstm_content_2 are built but never consumed
# below - the pooling layers read the first BiLSTM outputs - so they are not
# part of the trained model. Confirm whether stacking was intended.
lstm_title = Bidirectional(LSTM(32, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))(embedding_title)
lstm_title_2 = Bidirectional(LSTM(32))(lstm_title)

lstm_content = Bidirectional(LSTM(64, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))(embedding_content)
lstm_content_2 = Bidirectional(LSTM(64))(lstm_content)

#---------------------------------------------------------------------------------------------------------------------------

# pooling layers

pooled_title = GlobalMaxPooling1D()(lstm_title)
pooled_content = GlobalMaxPooling1D()(lstm_content)

#---------------------------------------------------------------------------------------------------------------------------

# concatenation layer

concatenated = Concatenate()([pooled_title, pooled_content])

#---------------------------------------------------------------------------------------------------------------------------

# fully connected and dropout layers

fc1 = Dense(128, activation='relu', kernel_regularizer=l2(0.001))(concatenated)
dropout1 = Dropout(0.6)(fc1)

fc2 = Dense(64, activation='relu')(dropout1)
dropout2 = Dropout(0.6)(fc2)

# Output width follows num_classes so it stays in sync with the one-hot labels.
output = Dense(num_classes, activation='softmax')(dropout2)

#---------------------------------------------------------------------------------------------------------------------------

In [None]:
# Assemble and compile the two-input classifier.
model = Model(inputs=[input_title, input_content], outputs=output)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Keep only the weights of the epoch with the highest validation accuracy.
model_bestAccuracyCheckpoint = ModelCheckpoint(
            filepath=CHECKPOINT_PATH,
            monitor='val_accuracy',
            mode='max',
            save_weights_only=True,
            save_best_only=True
        )

# `callbacks` is documented as a list of callbacks, so the checkpoint is
# wrapped in a list instead of being passed bare.
model.fit(
    [X_title_train, X_content_train],
    y_train,
    validation_data=([X_title_val, X_content_val], y_val),
    epochs=num_epochs,
    batch_size=batch_size,
    callbacks=[model_bestAccuracyCheckpoint],
)

In [None]:
# Restore the weights saved by the ModelCheckpoint callback (the epoch with the
# best val_accuracy) so that the model saved and evaluated below is that one,
# not the last epoch.
model.load_weights(CHECKPOINT_PATH)

In [None]:
# Save the complete model under FOLDER_PATH. The name is appended with plain
# string concatenation, so FOLDER_PATH has to end with a path separator.
bilstm_model_path = FOLDER_PATH + "bilstm_model"
model.save(bilstm_model_path)

In [None]:
def predict_article(title, content, model, title_tokenizer, content_tokenizer,
                    max_length_title=49, max_length_content=9401):
  """Predict the class of a single article.

  Args:
    title: Article title (string).
    content: Article body (string).
    model: Model whose ``predict`` accepts ``[titles, contents]`` and returns
      one row of class probabilities per article.
    title_tokenizer: Fitted tokenizer used to encode the title.
    content_tokenizer: Fitted tokenizer used to encode the content.
    max_length_title: Length the encoded title is padded/truncated to.
      Defaults to the length used for training in this notebook.
    max_length_content: Length the encoded content is padded/truncated to.
      Defaults to the length used for training in this notebook.

  Returns:
    The index of the most probable class, as returned by ``np.argmax``
    (per the label encoding of this notebook: 0 = nonclickbait,
    1 = clickbait).
  """
  encoded_title = title_tokenizer.texts_to_sequences([title])
  encoded_text = content_tokenizer.texts_to_sequences([content])

  # Truncate at the end, exactly as the training sequences were prepared.
  # Without truncating='post' over-long inputs are cut from the front (the
  # pad_sequences default), so the model would see a different part of the
  # article than it was trained on.
  padded_title = pad_sequences(encoded_title, maxlen=max_length_title, padding='post', truncating='post')
  padded_text = pad_sequences(encoded_text, maxlen=max_length_content, padding='post', truncating='post')

  # predict returns a batch; take the probabilities of the only article.
  class_probabilities = model.predict([padded_title, padded_text])[0]

  return np.argmax(class_probabilities)

In [None]:
def classify(df):
  """Predict a label for every article of ``df``.

  Uses the notebook-level ``model``, ``tokenizer_title`` and
  ``tokenizer_content``; each article is predicted individually through
  ``predict_article``.

  Args:
    df: DataFrame with the columns "title", "content" and "category".

  Returns:
    A tuple ``(predictions, true_labels)`` of two lists in row order.
  """
  true_labels = list(df["category"])
  predictions = [
      predict_article(title, content, model, tokenizer_title, tokenizer_content)
      for title, content in zip(df["title"], df["content"])
  ]

  return predictions, true_labels

In [None]:
# Run the trained model over every test article; preds and trues are parallel
# lists holding the predicted and the true class of each row of df_test.
preds, trues = classify(df_test)

In [None]:
# Per-class precision / recall / F1 on the test set (0 = nonclickbait, 1 = clickbait).
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(classification_report(trues, preds))

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the test predictions, drawn as an annotated heatmap
# (rows: true labels, columns: predicted labels).
cm = confusion_matrix(trues, preds)
sns.set(font_scale=1.4)
ax = sns.heatmap(cm, annot=True, cmap='Blues', cbar=False, fmt='g')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix')
plt.show()