### Please fill in the following paths.

In [None]:
# Folder with the test dataset, containing "digi.json" and "libertatea.json".
# The files in this folder are loaded as JSON by read_raw_data; no trailing "/" is needed.
TEST_PATH = ""

# Folder with the saved Random Forest artifacts read by load_rfc:
# "tfidf.pkl", "cv_pos.pkl", "sc_pos.pkl", "sc_val.pkl" and "rfc_model.pkl".
# Write the model folders with a trailing "/" (e.g. "models/rfc/"); the loaders append file names to them.
RFC_PATH = ""

# Folder with the saved LSTM model; load_lstm_model looks inside its "bilstm_22mai/" sub-folder
# for "bilstm_model", "tokenizer_title.pickle" and "tokenizer_content.pickle".
LSTM_PATH = ""

# Folder with the saved BERT model; load_bert_model reads "trainer" and "custom_classifier.pth" from it.
BERT_PATH =  ""

# Folder with the saved Contrastive Learning model; load_contrastive_model reads
# "model_contrastive_learning.pt" from it.
CONTRASTIVE_PATH = ""

### Random Forest

In [None]:
! pip install stanza

In [None]:
import pandas as pd
from string import punctuation
import re
import nltk
from nltk.tokenize import sent_tokenize

from scipy import sparse
import pickle
import sklearn

import stanza
stanza.download('ro')

import nltk
from nltk.tokenize import sent_tokenize

# Shared Stanza pipeline for Romanian ('tokenize' and 'pos' processors); it is called by
# extract_data_from_pos_tags and get_pos_title below.
# NOTE(review): tokenize_no_ssplit=True presumably disables Stanza's own sentence splitting — confirm in the Stanza docs.
nlp = stanza.Pipeline('ro', processors='tokenize,pos', tokenize_no_ssplit=True)
# NOTE(review): 'punkt' is presumably the resource nltk's sent_tokenize (used by compute_RIX / compute_LIX) needs — confirm.
nltk.download('punkt')


def preprocess(text):
    """Normalize raw text before feature extraction.

    Steps, in order:
      1. drop '/' characters and newlines (no replacement character is inserted);
      2. replace every run of ASCII digits with the token 'număr';
      3. collapse a word character repeated three or more times into one;
      4. drop any remaining word that still contains a digit;
      5. lowercase;
      6. strip ASCII punctuation plus the Romanian quotes „ and ”;
      7. collapse runs of spaces and trim both ends.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    result = text.replace('/', "").replace('\n', '')
    result = re.sub(r'[0-9]+', 'număr', result)
    result = re.sub(r'(\w)(\1{2,})', r'\1', result)
    result = re.sub(r'(?x)\b(?=\w*\d)\w+\s*', '', result)
    result = result.lower()
    punctuations = punctuation + "„”"
    result = "".join(char for char in result if char not in punctuations)
    # Fix: the original computed this substitution but threw the result away,
    # so repeated spaces and leading/trailing blanks were never removed.
    result = re.sub(r' +', ' ', result).strip()
    return result


# Romanian interrogative words; a title whose first token is one of these is treated as a question.
question_words = ["ce", "cine", "cui", "care", "căruia", "căreia", "cărora", "căruia", "cât", "cît", "câți", "câtă",
                  "câte", "câtor", "cum", "oare"]


def isquestion(text):
    """Return 1 when the first whitespace-separated token of ``text`` is a question word, else 0.

    The comparison is case-insensitive. Punctuation is not stripped, so a first
    token such as "ce?" does not match.

    Args:
        text: the title to inspect.

    Returns:
        1 if the title starts with an entry of ``question_words``, otherwise 0.
        Empty or whitespace-only titles return 0.
    """
    tokens = text.lower().split()
    # Fix: an empty / whitespace-only title used to raise IndexError on tokens[0].
    if not tokens:
        return 0
    return 1 if tokens[0] in question_words else 0


# The stopword list is taken from https://countwordsfree.com/stopwords/romanian
# Words used in questions were meant to be removed from this list; note that several of them
# (e.g. 'ce', 'cine', 'care', 'cât', 'cum') are still present below.
romanian_stopwords = ['acea', 'aceasta', 'această', 'aceea', 'acei', 'aceia', 'acel', 'acela', 'acele', 'acelea', 'acest', 'acesta', 'aceste', 'acestea', 'aceşti', 'aceştia', 'acolo', 'acord', 'acum', 'ai', 'aia', 'aibă', 'aici', 'al', 'ăla', 'ale', 'alea', 'ălea', 'altceva', 'altcineva', 'am', 'ar', 'are', 'aş', 'aşadar', 'asemenea', 'asta', 'ăsta', 'astăzi', 'astea', 'ăstea', 'ăştia', 'asupra', 'aţi', 'au', 'avea', 'avem', 'aveţi', 'azi', 'bine', 'bucur', 'bună', 'ca', 'că', 'căci', 'când', 'care', 'cărei', 'căror', 'cărui', 'cât', 'câte', 'câţi', 'către', 'câtva', 'caut', 'ce', 'cel', 'ceva', 'chiar', 'cinci', 'cînd', 'cine', 'cineva', 'cît', 'cîte', 'cîţi', 'cîtva', 'contra', 'cu', 'cum', 'cumva', 'curând', 'curînd', 'da', 'dă', 'dacă', 'dar', 'dată', 'datorită', 'dau', 'de', 'deci', 'deja', 'deoarece', 'departe', 'deşi', 'din', 'dinaintea', 'dintr-', 'dintre', 'doi', 'doilea', 'două', 'drept', 'după', 'ea', 'ei', 'el', 'ele', 'eram', 'este', 'eşti', 'eu', 'face', 'fără', 'fata', 'fi', 'fie', 'fiecare', 'fii', 'fim', 'fiţi', 'fiu', 'frumos', 'graţie', 'halbă', 'iar', 'ieri', 'îi', 'îl', 'îmi', 'împotriva', 'în', 'înainte', 'înaintea', 'încât', 'încît', 'încotro', 'între', 'întrun', 'întruna', 'întrucât', 'întrucît', 'îţi', 'la', 'lângă', 'le', 'li', 'lîngă', 'lor', 'lui', 'mă', 'mai', 'mâine', 'mea', 'mei', 'mele', 'mereu', 'meu', 'mi', 'mie', 'mîine', 'mine', 'mult', 'multă', 'mulţi', 'mulţumesc', 'ne', 'nevoie', 'nicăieri', 'nici', 'nimeni', 'nimeri', 'nimic', 'nişte', 'noastră', 'noastre', 'noi', 'noroc', 'noştri', 'nostru', 'nouă', 'nu', 'opt', 'ori', 'oricând', 'oricare', 'oricât', 'orice', 'oricînd', 'oricine', 'oricît', 'oricum', 'oriunde', 'până', 'patra', 'patru', 'patrulea', 'pe', 'pentru', 'peste', 'pic', 'pînă', 'poate', 'pot', 'prea', 'prima', 'primul', 'prin', 'puţin', 'puţina', 'puţină', 'rog', 'sa', 'să', 'săi', 'sale', 'şapte', 'şase', 'sau', 'său', 'se', 'şi', 'sînt', 'sîntem', 'sînteţi', 'spate', 'spre', 'ştiu', 'sub', 'sunt', 'suntem', 
'sunteţi', 'sută', 'ta', 'tăi', 'tale', 'tău', 'te', 'ţi', 'ţie', 'timp', 'tine', 'toată', 'toate', 'tot', 'toţi', 'totuşi', 'trei', 'treia', 'treilea', 'tu', 'un', 'una', 'unde', 'undeva', 'unei', 'uneia', 'unele', 'uneori', 'unii', 'unor', 'unora', 'unu', 'unui', 'unuia', 'unul', 'vă', 'vi', 'voastră', 'voastre', 'voi', 'voştri', 'vostru', 'vouă', 'vreme', 'vreo', 'vreun', 'zece', 'zero', 'zi', 'zice']


def count_num_stopwords(text):
    """Count the tokens of the preprocessed text that are absent from ``romanian_stopwords``.

    NOTE(review): despite the function name, the membership test is ``not in``,
    so the returned value is the number of NON-stopword tokens. The saved model
    was presumably trained on this exact feature — confirm against the training
    code before changing it.

    Args:
        text: raw text; it is passed through ``preprocess`` first.

    Returns:
        The number of whitespace-separated tokens not found in ``romanian_stopwords``.
    """
    tokens = preprocess(text).split()
    return sum(1 for token in tokens if token not in romanian_stopwords)


def extract_data(df):
    """Run ``extract_data_from_pos_tags`` on every row and return the results column-wise.

    Args:
        df: DataFrame with 'title' and 'content' columns.

    Returns:
        Five lists, one entry per row, in this order: commons, propers,
        proper_words_title, f_measures, cls_scores.
    """
    per_row = [
        extract_data_from_pos_tags(row['title'], row['content'])
        for _, row in df.iterrows()
    ]

    # Each per-row result is (count_common, count_proper, num_proper_words_title, f_measure, cls_score).
    commons = [values[0] for values in per_row]
    propers = [values[1] for values in per_row]
    proper_words_title = [values[2] for values in per_row]
    f_measures = [values[3] for values in per_row]
    cls_scores = [values[4] for values in per_row]

    return commons, propers, proper_words_title, f_measures, cls_scores


def create_data_frame(df):
    """Build the hand-crafted feature frame for a DataFrame of articles.

    Columns are inserted in a fixed order; ``rfc_predict_article`` turns the
    frame into a positional array, so that order must not change.

    Args:
        df: DataFrame with 'title' and 'content' columns.

    Returns:
        A new DataFrame with the title features, the title/content POS features
        and the content readability features.
    """
    titles = df['title']
    features = pd.DataFrame()

    # Features computed from the title alone, in column order.
    title_extractors = (
        ("processed_title", preprocess),
        ("is_question", isquestion),
        ("num_words", lambda title: len(title.split())),
        ("rix_title", compute_RIX),
        ("lix_title", compute_LIX),
        ("num_stopwords", count_num_stopwords),
        ("punct_patterns", punctuation_patterns),
    )
    for column, extractor in title_extractors:
        features[column] = titles.apply(extractor)
    features["stop_word_ratio"] = features['num_stopwords'] / features['num_words']

    # Features that need both the title and the content.
    commons, propers, proper_words_title, f_measures, cls_scores = extract_data(df)
    joint_columns = {
        "num_proper_words": proper_words_title,
        "fmeasure_title": f_measures,
        "clscore": cls_scores,
        "commons": commons,
        "propers": propers,
    }
    for column, values in joint_columns.items():
        features[column] = values

    # Readability of the article body.
    contents = df['content']
    features["rix_content"] = contents.apply(compute_RIX)
    features["lix_content"] = contents.apply(compute_LIX)

    return features


# RIX = num_long_words / num_sentences

def compute_RIX(text):
    """Compute the RIX score: long words divided by sentences.

    Sentences are counted with ``sent_tokenize`` on the raw text; words come
    from the preprocessed text, and a word is "long" when it has more than 7
    characters.

    Args:
        text: raw text.

    Returns:
        long_words / sentences, or 0 when no sentence is found.
    """
    sentence_count = len(sent_tokenize(text))
    long_word_count = sum(
        1 for token in preprocess(text).split() if len(token.lower()) > 7
    )

    if sentence_count == 0:
        return 0
    return long_word_count / float(sentence_count)


# LIX = num_words / num_sentences + (100 * num_long_words) / num_words
def compute_LIX(text):
    """Compute the LIX score: words per sentence plus the percentage of long words.

    Sentences are counted with ``sent_tokenize`` on the raw text; words come
    from the preprocessed text, and a word is "long" when it has more than 7
    characters. A term whose denominator is zero contributes 0.

    Args:
        text: raw text.

    Returns:
        words / sentences + 100 * long_words / words.
    """
    sentence_count = len(sent_tokenize(text))
    tokens = [token.lower() for token in preprocess(text).split()]
    word_count = len(tokens)
    long_word_count = sum(1 for token in tokens if len(token) > 7)

    words_per_sentence = word_count / float(sentence_count) if sentence_count != 0 else 0
    long_word_percentage = (100 * long_word_count) / float(word_count) if word_count != 0 else 0

    return words_per_sentence + long_word_percentage


def extract_data_from_pos_tags(title, content):
    """Extract POS-based features from one article using the shared Stanza pipeline ``nlp``.

    Args:
        title: article title (raw text).
        content: article body (raw text).

    Returns:
        A tuple ``(count_common, count_proper, num_proper_words_title, f_measure, cls_score)``:
          * count_common: content words tagged NOUN whose text also appears as a NOUN in the title;
          * count_proper: content words tagged PROPN whose text also appears as a PROPN in the title;
          * num_proper_words_title: number of PROPN words in the title;
          * f_measure: score built from the title's part-of-speech counts (see the formula below);
          * cls_score: score built from the content's letter, word and sentence counts.
    """
    title_doc = nlp(title)
    text_doc = nlp(content)

    # Distinct surface forms of common / proper nouns seen in the title.
    commons = set()
    propers = set()

    # Per-category counters for the title, used by f_measure.
    noun_freq = 0
    adj_freq = 0
    prep_freq = 0
    article_freq = 0
    pronoun_freq = 0
    verb_freq = 0
    adv_freq = 0
    interj_freq = 0

    num_proper_words_title = 0

    for sent in title_doc.sentences:
        for word in sent.words:
            if word.upos == 'PROPN':
                propers.add(word.text)
                num_proper_words_title += 1
            elif word.upos == 'NOUN':
                commons.add(word.text)

            # Each word increments at most one counter; the first matching branch wins,
            # so the order of the checks below matters.
            pos = word.upos
            if word.text.lower() in ['oh', 'wow', 'hmm', 'uh', 'um']:
                interj_freq += 1
            elif "NOUN" in pos:
                noun_freq += 1
            elif "ADJ" in pos:
                adj_freq += 1
            elif "ADP" in pos:
                prep_freq += 1
            elif word.text.lower() in ['un', 'o', 'niște', 'acest', 'această', 'acești', 'aceste', 'al', 'ai', 'ale']:
                article_freq += 1
            elif 'PRON' in pos and not 'PUNCT' in pos:
                pronoun_freq += 1
            elif 'VERB' in pos:
                verb_freq += 1
            elif 'ADV' in pos:
                adv_freq += 1

    count_common = 0
    count_proper = 0

    num_words = 0
    num_sentences = 0
    num_long_words = 0
    len_words = 0
    for sent in text_doc.sentences:
        num_sentences += 1
        for word in sent.words:
            num_words += 1
            len_words += len(word.text)
            # Title/content overlap: the match is on exact (case-sensitive) text plus the POS tag.
            if word.text in propers and word.upos == 'PROPN':
                count_proper += 1
            elif word.text in commons and word.upos == 'NOUN':
                count_common += 1

            # NOTE(review): num_long_words is accumulated but never used afterwards.
            if len(word.text) > 7:
                num_long_words += 1

    # NOTE(review): this evaluates to (formal - contextual - 100) / 2 on raw counts. If the
    # intent is Heylighen's F-measure, that is usually defined with "+ 100" and with
    # frequencies in percent — confirm against the training code before changing, since the
    # saved model presumably expects this exact value.
    f_measure = (noun_freq + adj_freq + prep_freq + article_freq) / 2 - (
                pronoun_freq + verb_freq + adv_freq + interj_freq + 100) / 2

    if num_words != 0:
        avg_letters_per_100_words = len_words / num_words * 100
        avg_sentences_per_100_words = num_sentences / num_words * 100
    else:
        avg_letters_per_100_words = 0
        avg_sentences_per_100_words = 0

    # NOTE(review): avg_sentences_per_100_words is computed above but the formula uses the raw
    # num_sentences. The coefficients look like the Coleman-Liau index, which takes sentences
    # per 100 words — confirm against the training code before changing.
    cls_score = 0.0588 * avg_letters_per_100_words - 0.296 * num_sentences - 15.8

    return count_common, count_proper, num_proper_words_title, f_measure, cls_score


def punctuation_patterns(title):
    """Tell whether the title contains any of a fixed set of punctuation markers.

    Args:
        title: the title string to scan.

    Returns:
        True if at least one marker occurs as a substring of ``title``, else False.
    """
    markers = ('!?', '...', '***', '!!!', '???', '(', ')', '$')
    return any(marker in title for marker in markers)


def get_pos_title(title):
    """Return the title's universal POS tags as one space-separated string.

    Args:
        title: the title to tag with the shared Stanza pipeline ``nlp``.

    Returns:
        The ``upos`` tag of every word, in order, joined by single spaces.
    """
    parsed = nlp(title)
    return " ".join(
        word.upos
        for sentence in parsed.sentences
        for word in sentence.words
    )


def rfc_predict_article(title, content, rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model):
    """Classify one article with the Random Forest pipeline.

    Args:
        title: article title.
        content: article body.
        rfc_tfidf: fitted vectorizer applied to the preprocessed title.
        rfc_cv_pos: fitted vectorizer applied to the title's POS-tag string.
        rfc_sc_pos: fitted scaler applied to the POS-tag vectors.
        rfc_sc_val: fitted scaler applied to the hand-crafted numeric features.
        rfc_model: fitted classifier exposing ``predict``.

    Returns:
        The predicted label for the article, as a plain Python scalar.
    """
    article = pd.DataFrame({"title": title, "content": content}, index=[1])

    features = create_data_frame(article)
    features["pos_title"] = article['title'].apply(get_pos_title)

    # Three feature groups, stacked horizontally in this fixed order.
    headline_matrix = rfc_tfidf.transform(features['processed_title'])
    pos_matrix = rfc_sc_pos.transform(rfc_cv_pos.transform(features['pos_title']))
    numeric_values = features.drop(columns=['processed_title', 'pos_title']).values
    numeric_matrix = rfc_sc_val.transform(numeric_values)

    stacked = sparse.hstack([headline_matrix, pos_matrix, numeric_matrix]).tocsr()

    return rfc_model.predict(stacked)[0].item()


def load_rfc(folder_path):
    """Load the five pickled Random Forest artifacts from ``folder_path``.

    The file names are joined to the folder with ``os.path.join``, so the
    folder may be given with or without a trailing path separator.

    Args:
        folder_path: directory containing "tfidf.pkl", "cv_pos.pkl",
            "sc_pos.pkl", "sc_val.pkl" and "rfc_model.pkl".

    Returns:
        ``(rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model)``.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    import os  # local import: this cell does not import os itself

    def _load(file_name):
        # SECURITY: pickle.load can execute arbitrary code; only point this at trusted files.
        with open(os.path.join(folder_path, file_name), 'rb') as handle:
            return pickle.load(handle)

    rfc_tfidf = _load("tfidf.pkl")
    rfc_cv_pos = _load("cv_pos.pkl")
    rfc_sc_pos = _load("sc_pos.pkl")
    rfc_sc_val = _load("sc_val.pkl")
    rfc_model = _load("rfc_model.pkl")

    return rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model


### LSTM

In [None]:
from keras.models import load_model
import pickle
from keras.utils import pad_sequences
import numpy as np

def load_lstm_model(folder_path):
    """Load the saved BiLSTM model and its two tokenizers.

    Everything is read from the "bilstm_22mai" sub-folder of ``folder_path``.
    Paths are built with ``os.path.join``, so the folder may be given with or
    without a trailing path separator.

    Args:
        folder_path: directory containing the "bilstm_22mai" sub-folder.

    Returns:
        ``(lstm_model, loaded_tokenizer_title, loaded_tokenizer_content)``.
    """
    import os  # local import: this cell does not import os itself

    model_dir = os.path.join(folder_path, "bilstm_22mai")

    lstm_model = load_model(os.path.join(model_dir, "bilstm_model"))

    # SECURITY: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(os.path.join(model_dir, "tokenizer_title.pickle"), 'rb') as handle:
        loaded_tokenizer_title = pickle.load(handle)

    with open(os.path.join(model_dir, "tokenizer_content.pickle"), 'rb') as handle:
        loaded_tokenizer_content = pickle.load(handle)

    return lstm_model, loaded_tokenizer_title, loaded_tokenizer_content

def lstm_predict_article(title, text, lstm_model, loaded_tokenizer_title, loaded_tokenizer_content,
                         max_length_title=49, max_length_content=9401):
    """Classify one article with the two-input LSTM model.

    Args:
        title: article title.
        text: article body.
        lstm_model: model whose ``predict`` takes ``[title_batch, content_batch]``.
        loaded_tokenizer_title: tokenizer used to encode the title.
        loaded_tokenizer_content: tokenizer used to encode the body.
        max_length_title: length the title sequence is padded/truncated to.
        max_length_content: length the body sequence is padded/truncated to.
            NOTE(review): both lengths presumably have to match the input sizes
            the saved model was trained with — confirm before overriding.

    Returns:
        Index of the highest-scoring class (a NumPy integer).
    """
    encoded_title = loaded_tokenizer_title.texts_to_sequences([title])
    encoded_text = loaded_tokenizer_content.texts_to_sequences([text])

    padded_title = pad_sequences(encoded_title, maxlen=max_length_title, padding='post')
    padded_text = pad_sequences(encoded_text, maxlen=max_length_content, padding='post')

    # The batch holds a single article, so take row 0 of the prediction.
    class_probabilities = lstm_model.predict([padded_title, padded_text])[0]

    return np.argmax(class_probabilities)

### BERT

In [None]:
! pip install transformers==4.28.0
! pip install datasets

In [None]:
import torch
from torch.utils.data import DataLoader
from torch import nn
from transformers import BertTokenizerFast, BertForSequenceClassification
import pandas as pd
from datasets import Dataset


class CustomClassifier(nn.Module):
    """Classification head on top of a pretrained encoder.

    The hidden state of the first token goes through dropout, a 128-unit dense
    layer with ReLU, and a final linear layer producing ``num_classes`` logits.
    """

    def __init__(self, pretrained_model, num_classes):
        """
        Args:
            pretrained_model: encoder exposing ``config.hidden_size`` and returning
                an object with ``last_hidden_state`` when called.
            num_classes: number of output logits.
        """
        super().__init__()
        # The attribute names become state-dict keys, so they must stay as they are
        # for saved weights to load.
        self.base_model = pretrained_model
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(self.base_model.config.hidden_size, 128)
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits, one row per input sequence."""
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 of every sequence as its summary vector.
        first_token = encoded.last_hidden_state[:, 0, :]
        hidden = torch.relu(self.dense(self.dropout(first_token)))
        return self.classifier(hidden)

    def predict(self, input_ids, attention_mask):
        """Return the index of the largest logit for each input sequence."""
        logits = self.forward(input_ids=input_ids, attention_mask=attention_mask)
        return torch.argmax(logits, dim=1)


def load_bert_model(folder_path):
    """Load the fine-tuned BERT classifier and its tokenizer.

    The encoder is read from the "trainer" entry of ``folder_path`` and the
    classifier weights from "custom_classifier.pth". Paths are built with
    ``os.path.join``, so the folder may be given with or without a trailing
    path separator.

    Args:
        folder_path: directory containing "trainer" and "custom_classifier.pth".

    Returns:
        ``(classifier_model, tokenizer)``. The weights are mapped to the CPU on load.
    """
    import os  # local import: this cell does not import os itself

    bert_model_path = os.path.join(folder_path, "trainer")
    bert_clasifier_path = os.path.join(folder_path, "custom_classifier.pth")

    bert_model = BertForSequenceClassification.from_pretrained(bert_model_path)
    # Only the encoder part of the sequence-classification model is reused; the head is CustomClassifier.
    classifier_model = CustomClassifier(bert_model.base_model, 2)
    # map_location keeps the checkpoint loadable on machines without a GPU.
    classifier_model.load_state_dict(torch.load(bert_clasifier_path, map_location=torch.device('cpu')))

    model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
    tokenizer = BertTokenizerFast.from_pretrained(model_name)

    return classifier_model, tokenizer


def predict_labels(model, dataset, device):
    """Predict a label for every example in ``dataset``.

    Args:
        model: module exposing ``predict(input_ids=..., attention_mask=...)``.
        dataset: dataset whose items provide "input_ids" and "attention_mask" tensors.
        device: torch device on which inference runs.

    Returns:
        A list with one predicted label (NumPy scalar) per example, in dataset order.
    """
    # Fix: the batches are moved to `device` below, so the model has to live there too;
    # otherwise inference fails with a device mismatch whenever `device` is a GPU.
    model.to(device)
    model.eval()
    dataloader = DataLoader(dataset, batch_size=8)
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model.predict(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(outputs.cpu().numpy())

    return predictions


def preprocess_dataset(dataset, tokenizer):
    """Tokenize a dataset of articles for the BERT classifier.

    Each example's title and content are merged into one string, separated by
    the tokenizer's separator token, then encoded to fixed-length sequences of
    512 tokens.

    Args:
        dataset: dataset with 'title', 'content' and 'category' columns,
            supporting ``map`` and ``set_format``.
        tokenizer: tokenizer exposing ``sep_token`` and ``batch_encode_plus``.

    Returns:
        The mapped dataset, formatted to yield torch tensors for
        'input_ids', 'attention_mask' and 'labels'.
    """
    def _encode_batch(batch):
        merged_texts = []
        for title, article in zip(batch['title'], batch['content']):
            merged_texts.append(f"{title} {tokenizer.sep_token} {article}")

        encoding = tokenizer.batch_encode_plus(
            merged_texts,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
        )
        return {
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
            "labels": batch["category"]
        }

    tokenized = dataset.map(_encode_batch, batched=True)
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return tokenized


def bert_predict_article(title, text, classifier_model, tokenizer):
    """Classify one article with the BERT classifier.

    Args:
        title: article title.
        text: article body.
        classifier_model: model passed on to ``predict_labels``.
        tokenizer: tokenizer passed on to ``preprocess_dataset``.

    Returns:
        The predicted label of the article.
    """
    # 'category' is a placeholder value; preprocess_dataset expects the column to exist.
    single_row = pd.DataFrame({'title': [title], 'content': [text], 'category': [0]})
    encoded = preprocess_dataset(Dataset.from_pandas(single_row), tokenizer)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    return predict_labels(classifier_model, encoded, device)[0]



### Contrastive Learning model

In [None]:
from transformers import BertTokenizerFast
from torch.nn.functional import cosine_similarity
from transformers import BertModel
import re
from string import punctuation
import torch
from torch import nn


def preprocess(text):
    """Normalize raw text before it is tokenized for the contrastive model.

    Note: this re-declares ``preprocess`` and therefore shadows the same-named
    helper defined in the Random Forest section once this cell has run.

    Steps, in order:
      1. drop '/' characters and newlines (no replacement character is inserted);
      2. replace every run of ASCII digits with the token 'număr';
      3. collapse a word character repeated three or more times into one;
      4. drop any remaining word that still contains a digit;
      5. lowercase;
      6. strip ASCII punctuation plus the Romanian quotes „ and ”;
      7. collapse runs of spaces and trim both ends.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    result = text.replace('/', "").replace('\n', '')
    result = re.sub(r'[0-9]+', 'număr', result)
    result = re.sub(r'(\w)(\1{2,})', r'\1', result)
    result = re.sub(r'(?x)\b(?=\w*\d)\w+\s*', '', result)
    result = result.lower()
    punctuations = punctuation + "„”"
    result = "".join(char for char in result if char not in punctuations)
    # Fix: the original computed this substitution but threw the result away,
    # so repeated spaces and leading/trailing blanks were never removed.
    result = re.sub(r' +', ' ', result).strip()
    return result


class SiameseNetwork(nn.Module):
    """BERT encoder that returns one mean-pooled embedding per input sequence."""

    def __init__(self):
        super().__init__()
        # The attribute name becomes the state-dict key prefix, so it must stay as it is
        # for saved weights to load.
        self.text_encoder = BertModel.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        """Encode the inputs and average the token states over the attention mask.

        Returns:
            One embedding per sequence: the mean of the last hidden states of
            the tokens whose attention-mask value is non-zero.
        """
        encoded = self.text_encoder(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    token_type_ids=token_type_ids)
        token_states = encoded.last_hidden_state

        # Broadcast the mask over the hidden dimension so padded positions contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_sum = (token_states * mask).sum(dim=1)
        # Clamp so a fully masked sequence does not divide by zero.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        return masked_sum / token_counts


# Tokenizers already loaded by contrastive_predict_article, keyed by model name.
_CONTRASTIVE_TOKENIZER_CACHE = {}


def _get_contrastive_tokenizer(model_name):
    """Load the tokenizer for ``model_name`` once and reuse it on later calls."""
    if model_name not in _CONTRASTIVE_TOKENIZER_CACHE:
        _CONTRASTIVE_TOKENIZER_CACHE[model_name] = BertTokenizerFast.from_pretrained(model_name)
    return _CONTRASTIVE_TOKENIZER_CACHE[model_name]


def contrastive_predict_article(title, text, model, threshold=0.25):
    """Classify one article by comparing its title and body embeddings.

    Both texts are preprocessed, tokenized (truncated to 256 tokens) and
    embedded with ``model``. The cosine distance ``1 - cos_sim`` between the
    two embeddings is then compared with ``threshold``.

    Args:
        title: article title.
        text: article body.
        model: embedding model; it is switched to eval mode here.
        threshold: distances below this value count as "title matches body".

    Returns:
        0 when the distance is below ``threshold``, otherwise 1 (clickbait).
    """
    model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
    # The original re-loaded the tokenizer on every call; it is now loaded once and cached.
    tokenizer = _get_contrastive_tokenizer(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    title = preprocess(title)
    text = preprocess(text)

    inputs_title = tokenizer(title, padding=True, truncation=True, max_length=256, return_tensors='pt').to(device)
    inputs_text = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors='pt').to(device)

    # Inference only: no gradients are needed, so no autograd graph is built.
    model.eval()
    with torch.no_grad():
        output1 = model(**inputs_title)
        output2 = model(**inputs_text)

    cos_sim = cosine_similarity(output1, output2)

    # A single title/body pair was embedded, so the comparison yields one boolean.
    is_similar = ((1.0 - cos_sim) < threshold).item()

    return 0 if is_similar else 1


def load_contrastive_model(folder_path):
    """Load the contrastive-learning Siamese model for inference.

    The weights are read from "model_contrastive_learning.pt" inside
    ``folder_path``. The path is built with ``os.path.join``, so the folder may
    be given with or without a trailing path separator.

    Args:
        folder_path: directory containing "model_contrastive_learning.pt".

    Returns:
        A ``SiameseNetwork`` with the saved weights, placed on the GPU when one
        is available (CPU otherwise) and switched to eval mode.
    """
    import os  # local import: this cell does not import os itself

    model_path = os.path.join(folder_path, "model_contrastive_learning.pt")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = SiameseNetwork()
    model = model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    # The model is only used for prediction in this notebook.
    model.eval()

    return model


### Ensemble Prediction

In [None]:
def load_all_models():
    """Load every model of the ensemble from the paths configured at the top of the notebook.

    Reads the module-level constants BERT_PATH, LSTM_PATH, RFC_PATH and
    CONTRASTIVE_PATH.

    Returns:
        An 11-tuple: ``(classifier_model, tokenizer, lstm_model,
        loaded_tokenizer_title, loaded_tokenizer_content, rfc_tfidf, rfc_cv_pos,
        rfc_sc_pos, rfc_sc_val, rfc_model, contrastive_model)``.
    """
    # (classifier_model, tokenizer)
    bert_parts = load_bert_model(BERT_PATH)

    # (lstm_model, loaded_tokenizer_title, loaded_tokenizer_content)
    lstm_parts = load_lstm_model(LSTM_PATH)

    # (rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model)
    rfc_parts = load_rfc(RFC_PATH)

    contrastive_model = load_contrastive_model(CONTRASTIVE_PATH)

    return (*bert_parts, *lstm_parts, *rfc_parts, contrastive_model)


def combine_predict_article(title, content, classifier_model, tokenizer, lstm_model, loaded_tokenizer_title,
                            loaded_tokenizer_content, rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model,
                            contrastive_model):
    """Classify one article with all four models and combine their votes.

    Each model's label is weighted (LSTM 0.27, Random Forest 0.24, BERT 0.24,
    contrastive 0.25); the article is labelled 1 when the weighted sum is at
    least 0.49.

    Returns:
        ``(predicted, bert_label, lstm_label, rfc_label, contrastive_label)``,
        where ``predicted`` is the ensemble label (0 or 1).
    """
    lstm_label = lstm_predict_article(title, content, lstm_model, loaded_tokenizer_title, loaded_tokenizer_content)
    rfc_label = rfc_predict_article(title, content, rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model)
    bert_label = bert_predict_article(title, content, classifier_model, tokenizer)
    contrastive_label = contrastive_predict_article(title, content, contrastive_model)

    # Keep the terms in this order: the sum is compared against a boundary value,
    # so floating-point rounding must stay the same.
    weighted_vote = 0.27 * lstm_label + 0.24 * rfc_label + 0.24 * bert_label + 0.25 * contrastive_label

    predicted = 1 if weighted_vote >= 0.49 else 0
    return predicted, bert_label, lstm_label, rfc_label, contrastive_label

In [None]:
import json
import os
import pandas as pd

def read_file(path, name):
    """Read one JSON file of articles and encode the category as an integer.

    Args:
        path: folder containing the file.
        name: file name inside ``path``.

    Returns:
        A list of dicts with keys "title", "content" and "category", where
        category is 0 for "nonclickbait" and 1 for anything else.
    """
    file_path = os.path.join(path, name)

    # Fix: the original never closed the file and relied on the platform's default
    # encoding; the data is Romanian text, so read it explicitly as UTF-8.
    with open(file_path, encoding="utf-8") as reader:
        json_array = json.load(reader)

    # nonclickbait = 0
    # clickbait = 1
    return [
        {
            "title": element["title"],
            "content": element["content"],
            "category": 0 if element["category"] == "nonclickbait" else 1,
        }
        for element in json_array
    ]

def read_raw_data(folder_path):
    """Read every data file in ``folder_path`` and concatenate their articles.

    Files are processed in sorted name order and each processed file name is
    printed. Sub-directories and hidden entries (names starting with ".", such
    as ".ipynb_checkpoints") are skipped; the original tried to parse them and
    crashed.

    Args:
        folder_path: folder containing the JSON data files.

    Returns:
        A single list with the article dicts of all files.
    """
    raw_data = []
    for filename in sorted(os.listdir(folder_path)):
        is_regular_file = os.path.isfile(os.path.join(folder_path, filename))
        if filename.startswith(".") or not is_regular_file:
            continue
        print(filename)
        raw_data.extend(read_file(folder_path, filename))

    return raw_data

In [None]:
# Load every test article; read_raw_data prints each file name as it is read.
print('Test files:')
test_raw_data  = read_raw_data(TEST_PATH)
print("---------------------")
# One row per article, with "title", "content" and "category" (0/1) columns.
df_test = pd.DataFrame(test_raw_data)

Test files:
digi.json
libertatea.json
---------------------


In [None]:
# Load all four models once; the names below are passed to combine_predict_article for every article.
classifier_model, tokenizer, lstm_model, loaded_tokenizer_title, loaded_tokenizer_content, rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model, contrastive_model = load_all_models()

In [None]:
# Run the ensemble on every test article, collecting the gold labels and the predictions.
true_labels = []
pred_labels = []

for index, row in df_test.iterrows():
    title = row['title']
    content = row['content']
    label = row["category"]
    true_labels.append(label)

    # NOTE(review): combine_predict_article returns a 5-tuple
    # (predicted, bert_label, lstm_label, rfc_label, contrastive_label), so each entry of
    # pred_labels is that whole tuple, not a single label. Take element 0 before comparing
    # with true_labels.
    predicted = combine_predict_article(title, content, classifier_model, tokenizer, lstm_model, loaded_tokenizer_title,
                            loaded_tokenizer_content, rfc_tfidf, rfc_cv_pos, rfc_sc_pos, rfc_sc_val, rfc_model,
                            contrastive_model)
    
    pred_labels.append(predicted)