### Please fill in the following paths.

In [None]:
# Path to the folder holding the test dataset, containing "digi.json" and "libertatea.json"
TEST_PATH = ""

# Path to the folder holding the train dataset, containing "protv.json", "cancan.json" and "wowbiz.json"
TRAIN_PATH = ""

# Path to the folder in which the model and other necessary tools
# (the fitted vectorizers and scalers) are saved as pickle files
FOLDER_PATH = ""

### Imports

In [None]:
! pip install stanza

In [None]:
import json
import numpy as np
import os
import pandas as pd
from copy import deepcopy
import re
from string import punctuation
import seaborn as sns
from tqdm import tqdm,tqdm_notebook
import matplotlib.pyplot as plt
tqdm.pandas()

from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
import pickle
from sklearn.svm import SVC
from sklearn import svm

In [None]:
import stanza
stanza.download('ro')

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Reading test and train datasets

In [None]:
def read_file(path, name):
  """Load one JSON news file and convert it into a list of labelled records.

  Args:
    path: Folder that contains the file.
    name: Name of the file inside ``path``.

  Returns:
    A list of dicts with the keys ``"title"``, ``"content"`` and ``"category"``.
    ``category`` is 0 when the source entry is labelled ``"nonclickbait"`` and
    1 for every other label (clickbait).
  """
  file_path = path + "/" + name

  # The context manager closes the handle even when parsing fails. The encoding
  # is explicit so that Romanian diacritics are decoded the same way on every
  # platform instead of depending on the locale default.
  with open(file_path, encoding="utf-8") as reader:
    json_array = json.load(reader)

  # nonclickbait = 0
  # clickbait = 1
  return [
      {
          "title": element["title"],
          "content": element["content"],
          "category": 0 if element["category"] == "nonclickbait" else 1,
      }
      for element in json_array
  ]

In [None]:
def read_raw_data(folder_path):
  """Read every JSON news file of a folder into one list of records.

  Files are processed in sorted name order and each processed file name is
  printed. Entries whose name does not end in ``.json`` (for example a
  ``.ipynb_checkpoints`` folder or ``.DS_Store``) are skipped instead of being
  handed to the JSON parser.

  Args:
    folder_path: Folder that contains the dataset files.

  Returns:
    The concatenation of the record lists returned by ``read_file``.
  """
  raw_data = []
  for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(".json"):
      continue
    print(filename)
    raw_data.extend(read_file(folder_path, filename))

  return raw_data

In [None]:
# Load both splits; read_raw_data prints the name of every file it reads,
# so the file lists appear between the headers and the separators below.
print('Test files:')
test_raw_data  = read_raw_data(TEST_PATH)
print("---------------------")
print('Train files:')
train_raw_data = read_raw_data(TRAIN_PATH)
print("---------------------")

In [None]:
# Wrap the raw record lists in DataFrames (columns: title, content, category).
df_train = pd.DataFrame(train_raw_data)
df_test = pd.DataFrame(test_raw_data)

### Feature extraction

In [None]:
# The stopword list is taken from https://countwordsfree.com/stopwords/romanian,
# with the words used in questions removed.
# NOTE(review): interrogatives such as 'ce', 'cine', 'care' and 'cum' are still in the list below — confirm the intent.
romanian_stopwords = ['acea', 'aceasta', 'această', 'aceea', 'acei', 'aceia', 'acel', 'acela', 'acele', 'acelea', 'acest', 'acesta', 'aceste', 'acestea', 'aceşti', 'aceştia', 'acolo', 'acord', 'acum', 'ai', 'aia', 'aibă', 'aici', 'al', 'ăla', 'ale', 'alea', 'ălea', 'altceva', 'altcineva', 'am', 'ar', 'are', 'aş', 'aşadar', 'asemenea', 'asta', 'ăsta', 'astăzi', 'astea', 'ăstea', 'ăştia', 'asupra', 'aţi', 'au', 'avea', 'avem', 'aveţi', 'azi', 'bine', 'bucur', 'bună', 'ca', 'că', 'căci', 'când', 'care', 'cărei', 'căror', 'cărui', 'cât', 'câte', 'câţi', 'către', 'câtva', 'caut', 'ce', 'cel', 'ceva', 'chiar', 'cinci', 'cînd', 'cine', 'cineva', 'cît', 'cîte', 'cîţi', 'cîtva', 'contra', 'cu', 'cum', 'cumva', 'curând', 'curînd', 'da', 'dă', 'dacă', 'dar', 'dată', 'datorită', 'dau', 'de', 'deci', 'deja', 'deoarece', 'departe', 'deşi', 'din', 'dinaintea', 'dintr-', 'dintre', 'doi', 'doilea', 'două', 'drept', 'după', 'ea', 'ei', 'el', 'ele', 'eram', 'este', 'eşti', 'eu', 'face', 'fără', 'fata', 'fi', 'fie', 'fiecare', 'fii', 'fim', 'fiţi', 'fiu', 'frumos', 'graţie', 'halbă', 'iar', 'ieri', 'îi', 'îl', 'îmi', 'împotriva', 'în', 'înainte', 'înaintea', 'încât', 'încît', 'încotro', 'între', 'întrun', 'întruna', 'întrucât', 'întrucît', 'îţi', 'la', 'lângă', 'le', 'li', 'lîngă', 'lor', 'lui', 'mă', 'mai', 'mâine', 'mea', 'mei', 'mele', 'mereu', 'meu', 'mi', 'mie', 'mîine', 'mine', 'mult', 'multă', 'mulţi', 'mulţumesc', 'ne', 'nevoie', 'nicăieri', 'nici', 'nimeni', 'nimeri', 'nimic', 'nişte', 'noastră', 'noastre', 'noi', 'noroc', 'noştri', 'nostru', 'nouă', 'nu', 'opt', 'ori', 'oricând', 'oricare', 'oricât', 'orice', 'oricînd', 'oricine', 'oricît', 'oricum', 'oriunde', 'până', 'patra', 'patru', 'patrulea', 'pe', 'pentru', 'peste', 'pic', 'pînă', 'poate', 'pot', 'prea', 'prima', 'primul', 'prin', 'puţin', 'puţina', 'puţină', 'rog', 'sa', 'să', 'săi', 'sale', 'şapte', 'şase', 'sau', 'său', 'se', 'şi', 'sînt', 'sîntem', 'sînteţi', 'spate', 'spre', 'ştiu', 'sub', 'sunt', 'suntem', 
'sunteţi', 'sută', 'ta', 'tăi', 'tale', 'tău', 'te', 'ţi', 'ţie', 'timp', 'tine', 'toată', 'toate', 'tot', 'toţi', 'totuşi', 'trei', 'treia', 'treilea', 'tu', 'un', 'una', 'unde', 'undeva', 'unei', 'uneia', 'unele', 'uneori', 'unii', 'unor', 'unora', 'unu', 'unui', 'unuia', 'unul', 'vă', 'vi', 'voastră', 'voastre', 'voi', 'voştri', 'vostru', 'vouă', 'vreme', 'vreo', 'vreun', 'zece', 'zero', 'zi', 'zice']

In [None]:
def preprocess(text):
  """Normalise a piece of text before feature extraction.

  Steps, in order:
    1. drop '/' and newline characters;
    2. replace every run of ASCII digits with the token 'număr';
    3. collapse a word character repeated three or more times to one occurrence;
    4. remove any remaining token that contains a digit;
    5. lower-case and strip punctuation (including the quotes „ and ”);
    6. collapse runs of spaces and trim both ends.

  Args:
    text: Raw title or content string.

  Returns:
    The normalised string.
  """
  result = text.replace('/', "").replace('\n', '')
  result = re.sub(r'[0-9]+', 'număr', result)
  result = re.sub(r'(\w)(\1{2,})', r'\1', result)
  result = re.sub(r'(?x)\b(?=\w*\d)\w+\s*', '', result)
  result = result.lower()
  punctuations = punctuation + "„”"
  result = "".join(char for char in result if char not in punctuations)
  # re.sub returns a new string, so the whitespace clean-up only takes effect
  # when it is assigned back (removing punctuation can leave double spaces).
  result = re.sub(r' +', ' ', result).strip()
  return result

In [None]:
question_words = ["ce", "cine", "cui", "care", "căruia", "căreia", "cărora", "căruia", "cât", "cît", "câți", "câtă", "câte", "câtor", "cum", "oare"]

def isquestion(text):
    """Tell whether a text starts with a Romanian question word.

    Args:
        text: Title to inspect.

    Returns:
        1 if the first whitespace-separated token, lower-cased, is listed in
        ``question_words``; 0 otherwise, including for empty or
        whitespace-only text.
    """
    tokens = text.lower().split()
    # An empty or whitespace-only title has no first token to look at.
    if tokens and tokens[0] in question_words:
        return 1
    return 0

In [None]:
def count_num_stopwords(text):
  """Count the stopwords contained in a text.

  The text is normalised with ``preprocess`` and split on whitespace; each
  resulting token found in ``romanian_stopwords`` counts once per occurrence.

  Args:
    text: Raw title or content string.

  Returns:
    The number of tokens that are stopwords.
  """
  words = preprocess(text).split()
  # Positive membership test: the previous `not in` counted the content words,
  # i.e. the opposite of what the function name and the "num_stopwords"
  # feature built from it describe.
  return sum(1 for word in words if word in romanian_stopwords)

In [None]:
# RIX = num_long_words / num_sentences

def compute_RIX(text):
  """Compute the RIX readability index: long words per sentence.

  Sentences are counted with ``sent_tokenize`` on the raw text; words come from
  the ``preprocess``-ed text, and a word is "long" when it has more than 7
  characters.

  Args:
    text: Raw title or content string.

  Returns:
    ``num_long_words / num_sentences``, or 0 when no sentence is found.
  """
  sentence_count = len(sent_tokenize(text))
  long_word_count = sum(
      1 for token in preprocess(text).split() if len(token.lower()) > 7
  )

  if sentence_count == 0:
    return 0
  return long_word_count / float(sentence_count)


# LIX = num_words / num_sentences + (100 * num_long_words) / num_words
def compute_LIX(text):
  """Compute the LIX readability index.

  LIX = words per sentence + percentage of long words, where sentences are
  counted with ``sent_tokenize`` on the raw text, words come from the
  ``preprocess``-ed text and a word is "long" when it has more than 7
  characters. Each term contributes 0 when its denominator is 0.

  Args:
    text: Raw title or content string.

  Returns:
    The LIX value of the text.
  """
  sentence_count = len(sent_tokenize(text))
  tokens = [token.lower() for token in preprocess(text).split()]
  total_words = len(tokens)
  total_long_words = sum(1 for token in tokens if len(token) > 7)

  words_per_sentence = total_words / float(sentence_count) if sentence_count != 0 else 0
  long_word_percentage = (100 * total_long_words) / float(total_words) if total_words != 0 else 0

  return words_per_sentence + long_word_percentage

In [None]:
# Shared Romanian Stanza pipeline (tokenizer + POS tagger) used by all POS-based features below.
# NOTE(review): per Stanza's tokenizer options, tokenize_no_ssplit=True turns off sentence
# segmentation (input is only split on blank lines), so doc.sentences will usually hold a single
# sentence per text — confirm this is intended for the sentence counts computed further down.
nlp = stanza.Pipeline('ro', processors='tokenize,pos', tokenize_no_ssplit=True)

def extract_pos_tags(text):
  """Tag a text with the shared pipeline.

  Args:
    text: String to analyse.

  Returns:
    A list of ``(word_text, upos)`` tuples, in document order.
  """
  return [
      (token.text, token.upos)
      for sentence in nlp(text).sentences
      for token in sentence.words
  ]


def compute_proper_nouns_number(pos_tags):
  """Count the proper nouns in a tagged text.

  Args:
    pos_tags: Iterable of ``(word, pos)`` pairs.

  Returns:
    The number of pairs whose tag equals ``'PROPN'``.
  """
  return sum(1 for _, tag in pos_tags if tag == 'PROPN')


def compute_fmeasure(pos_tags):
  """Compute a formality score from part-of-speech counts.

  Every ``(word, pos)`` pair is put into at most one category, checked in this
  order: interjection (by word), noun, adjective, adposition, article (by
  word), pronoun, verb, adverb. The score is

      (noun + adj + prep + article) / 2 - (pronoun + verb + adv + interj + 100) / 2

  NOTE(review): the formality F-score is usually quoted with ``+ 100`` added
  before halving and with frequencies rather than raw counts; here the constant
  ends up subtracted. Confirm which variant is intended.

  Args:
    pos_tags: Iterable of ``(word, pos)`` pairs.

  Returns:
    The score as a float.
  """
  interjections = ['oh', 'wow', 'hmm', 'uh', 'um']
  articles = ['un', 'o', 'niște', 'acest', 'această', 'acești', 'aceste', 'al', 'ai', 'ale']
  tally = dict.fromkeys(
      ('noun', 'adj', 'prep', 'article', 'pronoun', 'verb', 'adv', 'interj'), 0
  )

  for token, tag in pos_tags:
    lowered = token.lower()
    if lowered in interjections:
      bucket = 'interj'
    elif "NOUN" in tag:
      bucket = 'noun'
    elif "ADJ" in tag:
      bucket = 'adj'
    elif "ADP" in tag:
      bucket = 'prep'
    elif lowered in articles:
      bucket = 'article'
    elif 'PRON' in tag and 'PUNCT' not in tag:
      bucket = 'pronoun'
    elif 'VERB' in tag:
      bucket = 'verb'
    elif 'ADV' in tag:
      bucket = 'adv'
    else:
      # Tags outside the eight categories do not influence the score.
      continue
    tally[bucket] += 1

  formal_total = tally['noun'] + tally['adj'] + tally['prep'] + tally['article']
  deictic_total = tally['pronoun'] + tally['verb'] + tally['adv'] + tally['interj']

  return formal_total / 2 - (deictic_total + 100) / 2


def compute_cls_score_ro(text):
    """Compute the Coleman-Liau style readability score of a text.

    The score is ``0.0588 * L - 0.296 * S - 15.8`` where ``L`` is the average
    number of characters per 100 words and ``S`` the average number of
    sentences per 100 words, both taken from the shared pipeline's output.

    Args:
        text: String to analyse.

    Returns:
        The score as a float. When the pipeline yields no words, both averages
        are taken as 0 (the same convention as ``extract_data_from_pos_tags``),
        so the result is -15.8 instead of a ZeroDivisionError.
    """
    doc = nlp(text)

    num_sentences = 0
    word_lengths = []
    for sent in doc.sentences:
        num_sentences += 1
        word_lengths.extend(len(word.text) for word in sent.words)
    num_words = len(word_lengths)

    if num_words != 0:
        avg_letters_per_100_words = sum(word_lengths) / num_words * 100
        avg_sentences_per_100_words = num_sentences / num_words * 100
    else:
        # Empty input: avoid dividing by zero.
        avg_letters_per_100_words = 0
        avg_sentences_per_100_words = 0

    cls_score = 0.0588 * avg_letters_per_100_words - 0.296 * avg_sentences_per_100_words - 15.8

    return cls_score



def count_common_nouns(title, text):
  """Count how often the title's common nouns reappear in the text.

  Args:
    title: Article headline.
    text: Article body.

  Returns:
    The number of tokens in ``text`` tagged ``NOUN`` whose exact text also
    occurs as a ``NOUN`` in ``title``.
  """
  title_doc = nlp(title)
  text_doc = nlp(text)

  title_nouns = set()
  for sentence in title_doc.sentences:
    for token in sentence.words:
      if token.upos == 'NOUN':
        title_nouns.add(token.text)

  matches = 0
  for sentence in text_doc.sentences:
    for token in sentence.words:
      if token.text in title_nouns and token.upos == 'NOUN':
        matches += 1

  return matches


def count_proper_nouns(title, text):
  """Count how often the title's proper nouns reappear in the text.

  Args:
    title: Article headline.
    text: Article body.

  Returns:
    The number of tokens in ``text`` tagged ``PROPN`` whose exact text also
    occurs as a ``PROPN`` in ``title``.
  """
  title_doc = nlp(title)
  text_doc = nlp(text)

  title_propers = set()
  for sentence in title_doc.sentences:
    for token in sentence.words:
      if token.upos == 'PROPN':
        title_propers.add(token.text)

  matches = 0
  for sentence in text_doc.sentences:
    for token in sentence.words:
      if token.text in title_propers and token.upos == 'PROPN':
        matches += 1

  return matches



def count_propers_and_common(title, text):
  """Count title nouns that reappear in the text, common and proper at once.

  Args:
    title: Article headline.
    text: Article body.

  Returns:
    A tuple ``(count_common, count_proper)``: the numbers of tokens in
    ``text`` tagged ``NOUN`` / ``PROPN`` whose exact text also occurs with the
    same tag in ``title``.
  """
  title_doc = nlp(title)
  text_doc = nlp(text)

  title_tokens = [token for sentence in title_doc.sentences for token in sentence.words]
  propers = {token.text for token in title_tokens if token.upos == 'PROPN'}
  commons = {token.text for token in title_tokens if token.upos == 'NOUN'}

  text_tokens = [token for sentence in text_doc.sentences for token in sentence.words]
  count_proper = sum(
      1 for token in text_tokens if token.text in propers and token.upos == 'PROPN'
  )
  count_common = sum(
      1 for token in text_tokens if token.text in commons and token.upos == 'NOUN'
  )

  return count_common, count_proper


def get_pos_title(title):
  """Turn a title into its sequence of universal POS tags.

  Args:
    title: Article headline.

  Returns:
    The ``upos`` tags of all words, in order, joined by single spaces.
  """
  return " ".join(
      token.upos
      for sentence in nlp(title).sentences
      for token in sentence.words
  )

In [None]:
def punctuation_patterns(title):
    """Tell whether a title contains one of the attention-seeking punctuation marks.

    Args:
        title: Article headline.

    Returns:
        True if any of '!?', '...', '***', '!!!', '???', '(', ')' or '$'
        occurs in ``title``, False otherwise.
    """
    markers = ('!?', '...', '***', '!!!', '???', '(', ')', '$')
    return any(marker in title for marker in markers)

In [None]:
def extract_data_from_pos_tags(title, content):
  """Compute all POS-based features of one article, tagging each text once.

  The title is tagged to collect its common/proper nouns and the
  part-of-speech counts behind the formality score. The content is tagged to
  count how often those title nouns reappear and to compute the Coleman-Liau
  style score.

  Args:
    title: Article headline.
    content: Article body.

  Returns:
    A tuple ``(count_common, count_proper, num_proper_words_title, f_measure,
    cls_score)`` where
      * ``count_common`` / ``count_proper`` are the numbers of content tokens
        tagged NOUN / PROPN whose text also occurs with that tag in the title;
      * ``num_proper_words_title`` is the number of PROPN tokens in the title;
      * ``f_measure`` is the formality score of the title;
      * ``cls_score`` is ``0.0588 * L - 0.296 * S - 15.8`` for the content,
        with ``L`` / ``S`` the characters / sentences per 100 words (both 0
        when the content has no words).
  """
  interjections = ['oh', 'wow', 'hmm', 'uh', 'um']
  articles = ['un', 'o', 'niște', 'acest', 'această', 'acești', 'aceste', 'al', 'ai', 'ale']

  title_doc = nlp(title)
  text_doc = nlp(content)

  commons = set()
  propers = set()

  noun_freq = 0
  adj_freq = 0
  prep_freq = 0
  article_freq = 0
  pronoun_freq = 0
  verb_freq = 0
  adv_freq = 0
  interj_freq = 0

  num_proper_words_title = 0

  # Title pass: remember the nouns and classify every word for the formality score.
  for sent in title_doc.sentences:
    for word in sent.words:
      pos = word.upos
      lowered = word.text.lower()

      if pos == 'PROPN':
        propers.add(word.text)
        num_proper_words_title += 1
      elif pos == 'NOUN':
        commons.add(word.text)

      # Each word falls into at most one category; the order of the checks matters.
      if lowered in interjections:
        interj_freq += 1
      elif "NOUN" in pos:
        noun_freq += 1
      elif "ADJ" in pos:
        adj_freq += 1
      elif "ADP" in pos:
        prep_freq += 1
      elif lowered in articles:
        article_freq += 1
      elif 'PRON' in pos and 'PUNCT' not in pos:
        pronoun_freq += 1
      elif 'VERB' in pos:
        verb_freq += 1
      elif 'ADV' in pos:
        adv_freq += 1

  count_common = 0
  count_proper = 0

  num_words = 0
  num_sentences = 0
  len_words = 0

  # Content pass: overlap with the title nouns plus the raw counts for the CLS score.
  for sent in text_doc.sentences:
    num_sentences += 1
    for word in sent.words:
      num_words += 1
      len_words += len(word.text)
      if word.text in propers and word.upos == 'PROPN':
        count_proper += 1
      elif word.text in commons and word.upos == 'NOUN':
        count_common += 1

  # NOTE(review): the formality F-score is usually quoted with "+ 100" added
  # before halving; here the constant ends up subtracted. Kept as is — confirm.
  f_measure = (noun_freq + adj_freq + prep_freq + article_freq) / 2 - (pronoun_freq + verb_freq + adv_freq + interj_freq + 100) / 2

  if num_words != 0:
    avg_letters_per_100_words = len_words / num_words * 100
    avg_sentences_per_100_words = num_sentences / num_words * 100
  else:
    avg_letters_per_100_words = 0
    avg_sentences_per_100_words = 0

  # The sentence term must be the per-100-words average (as in
  # compute_cls_score_ro), not the raw sentence count.
  cls_score = 0.0588 * avg_letters_per_100_words - 0.296 * avg_sentences_per_100_words - 15.8

  return count_common, count_proper, num_proper_words_title, f_measure, cls_score

In [None]:
def extract_data(df):
  """Run ``extract_data_from_pos_tags`` on every row of a DataFrame.

  Args:
    df: DataFrame with the columns ``title`` and ``content``.

  Returns:
    Five lists with one entry per row, in row order:
    ``(commons, propers, proper_words_title, f_measures, cls_scores)``.
  """
  records = [
      extract_data_from_pos_tags(row['title'], row['content'])
      for _, row in df.iterrows()
  ]

  # Transpose the per-row tuples into one list per feature.
  commons = [record[0] for record in records]
  propers = [record[1] for record in records]
  proper_words_title = [record[2] for record in records]
  f_measures = [record[3] for record in records]
  cls_scores = [record[4] for record in records]

  return commons, propers, proper_words_title, f_measures, cls_scores

In [None]:
def create_data_frame(df):
  """Build the feature table for a raw news DataFrame.

  Args:
    df: DataFrame with the columns ``title``, ``content`` and ``category``.

  Returns:
    A new DataFrame with, in this column order: the title-only features, the
    POS-based title/content features, the content readability features and
    finally the ``category`` label copied from ``df``.
  """
  titles = df['title']
  contents = df['content']
  features = pd.DataFrame()

  def count_tokens(x):
    return len(x.split())

  # independent title features (the column order below is the feature order)
  title_extractors = [
      ("processed_title", preprocess),
      ("is_question", isquestion),
      ("num_words", count_tokens),
      ("rix_title", compute_RIX),
      ("lix_title", compute_LIX),
      ("num_stopwords", count_num_stopwords),
      ("punct_patterns", punctuation_patterns),
  ]
  for column, extractor in title_extractors:
    features[column] = titles.progress_apply(extractor)
  features["stop_word_ratio"] = features['num_stopwords'] / features['num_words']
  features["pos_title"] = titles.progress_apply(get_pos_title)

  # title + content common features
  commons, propers, proper_words_title, f_measures, cls_scores = extract_data(df)
  features["num_proper_words"] = proper_words_title
  features["fmeasure_title"] = f_measures
  features["clscore"] = cls_scores
  features["commons"] = commons
  features["propers"] = propers

  # content features
  for column, extractor in (("rix_content", compute_RIX), ("lix_content", compute_LIX)):
    features[column] = contents.progress_apply(extractor)

  features["category"] = df['category']

  return features

In [None]:
def generate_processed_dataframes(df_train, df_test):
  """Create the feature tables of both splits and separate the labels.

  Args:
    df_train: Raw training DataFrame.
    df_test: Raw test DataFrame.

  Returns:
    A tuple ``(y_test, df_processed_test, y_train, df_processed_train)`` where
    the ``y_*`` values are the ``category`` columns and the ``df_processed_*``
    values are the feature tables without that column.
  """
  processed_train = create_data_frame(df_train)
  processed_test = create_data_frame(df_test)

  def split_target(frame):
    # Label column on one side, all remaining feature columns on the other.
    return frame['category'], frame.drop('category', axis=1)

  y_test, features_test = split_target(processed_test)
  y_train, features_train = split_target(processed_train)

  return y_test, features_test, y_train, features_train

In [None]:
# Build the feature tables and the label vectors for both splits
# (the function returns the test pair first, then the train pair).
y_test, df_processed_test, y_train, df_processed_train = generate_processed_dataframes(df_train, df_test)

In [None]:
def process_title(train_titles, test_titles):
  """Vectorise the preprocessed titles with TF-IDF and save the vectorizer.

  The vectorizer (word 1- to 5-grams, ``min_df=3``, accents stripped,
  sublinear tf) is fitted on the training titles only and then applied to the
  test titles. The fitted object is pickled as ``tfidf.pkl`` inside
  ``FOLDER_PATH``.

  Args:
    train_titles: Iterable of preprocessed training titles.
    test_titles: Iterable of preprocessed test titles.

  Returns:
    A tuple ``(x_train_headline, x_test_headline)`` of TF-IDF matrices.
  """
  tfidf = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1,5),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True
    )

  x_train_headline = tfidf.fit_transform(train_titles)
  x_test_headline  = tfidf.transform(test_titles)

  # os.path.join works with or without a trailing separator on FOLDER_PATH;
  # plain concatenation would write "<folder>tfidf.pkl" next to the folder.
  tfidf_path = os.path.join(FOLDER_PATH, "tfidf.pkl")
  with open(tfidf_path, 'wb') as handle:
    pickle.dump(tfidf, handle, protocol=pickle.HIGHEST_PROTOCOL)

  return x_train_headline, x_test_headline

In [None]:
def process_pos_title(train_titles, test_titles):
  """Vectorise the POS-tag sequences of the titles and save the fitted tools.

  A ``CountVectorizer`` followed by a ``StandardScaler(with_mean=False)`` is
  fitted on the training sequences only and then applied to the test
  sequences. Both fitted objects are pickled inside ``FOLDER_PATH`` as
  ``cv_pos.pkl`` and ``sc_pos.pkl``.

  Args:
    train_titles: Iterable of space-separated POS-tag strings (training set).
    test_titles: Iterable of space-separated POS-tag strings (test set).

  Returns:
    A tuple ``(x_train_pos_sc, x_test_pos_sc)`` of scaled count matrices.
  """
  cv_pos = CountVectorizer()
  sc_pos = StandardScaler(with_mean=False)

  x_train_pos = cv_pos.fit_transform(train_titles)
  x_train_pos_sc = sc_pos.fit_transform(x_train_pos)

  x_test_pos = cv_pos.transform(test_titles)
  x_test_pos_sc = sc_pos.transform(x_test_pos)

  # os.path.join works with or without a trailing separator on FOLDER_PATH;
  # plain concatenation would write "<folder>cv_pos.pkl" next to the folder.
  artifacts = (
      (os.path.join(FOLDER_PATH, "cv_pos.pkl"), cv_pos),
      (os.path.join(FOLDER_PATH, "sc_pos.pkl"), sc_pos),
  )
  for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as handle:
      pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)

  return x_train_pos_sc, x_test_pos_sc

In [None]:
def process_numerical_values(x_train_val, x_test_val):
  """Standardise the numerical features and save the fitted scaler.

  The ``StandardScaler`` is fitted on the training values only and then
  applied to the test values. The fitted scaler is pickled as ``sc_val.pkl``
  inside ``FOLDER_PATH``.

  Args:
    x_train_val: 2-D array of training feature values.
    x_test_val: 2-D array of test feature values.

  Returns:
    A tuple ``(x_train_val_sc, x_test_val_sc)`` of standardised arrays.
  """
  sc_val = StandardScaler()
  x_train_val_sc = sc_val.fit_transform(x_train_val)
  x_test_val_sc = sc_val.transform(x_test_val)

  # os.path.join works with or without a trailing separator on FOLDER_PATH;
  # plain concatenation would write "<folder>sc_val.pkl" next to the folder.
  sc_val_path = os.path.join(FOLDER_PATH, "sc_val.pkl")

  with open(sc_val_path, 'wb') as handle:
    pickle.dump(sc_val, handle, protocol=pickle.HIGHEST_PROTOCOL)

  return x_train_val_sc, x_test_val_sc

In [None]:
# Text features: TF-IDF of the preprocessed titles and scaled counts of the title POS tags.
x_train_headline, x_test_headline = process_title(df_processed_train['processed_title'], df_processed_test['processed_title'])
x_train_pos_sc, x_test_pos_sc = process_pos_title(df_processed_train['pos_title'], df_processed_test['pos_title'])

# Every remaining column is treated as a numerical feature.
x_train_val = df_processed_train.drop(columns=['processed_title','pos_title']).values
x_test_val = df_processed_test.drop(columns=['processed_title','pos_title']).values

x_train_val_sc, x_test_val_sc = process_numerical_values(x_train_val, x_test_val)

# Final design matrices: the three feature groups stacked column-wise, stored as CSR.
x_train = sparse.hstack([x_train_headline, x_train_pos_sc, x_train_val_sc]).tocsr()
x_test  = sparse.hstack([x_test_headline,  x_test_pos_sc,  x_test_val_sc] ).tocsr()

### Training and testing

In [None]:
# Candidate models compared with k-fold cross-validation on the training set.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# no longer accepted by current scikit-learn releases.
rfc = RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='entropy', max_depth=None, max_features='sqrt',
                             max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0,
                             n_estimators=150, n_jobs=-1, oob_score=True, random_state=0, verbose=0, warm_start=False)
linear_svc = LinearSVC()
logistic_regression = LogisticRegression(random_state=0, solver="newton-cg")
svc = SVC()

models = [
    rfc,
    linear_svc,
    logistic_regression,
    svc
]

cross_validation_split = 5
# One (model_name, fold_idx, accuracy) row per model and fold.
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cross_validation_split)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Box plot of the per-fold accuracies of each model; showmeans=True also marks each model's mean.
plt.figure(figsize=(10,5))
sns.boxplot(x='model_name', y='accuracy',
            data=cv_df,
            color='lightblue',
            showmeans=True)
plt.title("MEAN ACCURACY (cross_validation_split = 5)", size=14);

In [None]:
# Show the raw cross-validation results (one row per model and fold).
cv_df

In [None]:
# Train the random forest on the full training set and evaluate it on the test set.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# no longer accepted by current scikit-learn releases.
rfc = RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='entropy', max_depth=None, max_features='sqrt',
                             max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0,
                             n_estimators=150, n_jobs=-1, oob_score=True, random_state=0, verbose=0, warm_start=False)

rfc.fit(x_train,y_train)
y_pred= rfc.predict(x_test)
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Persist the trained random forest inside FOLDER_PATH. os.path.join works with
# or without a trailing separator; plain concatenation would write
# "<folder>rfc_model.pkl" next to the folder instead of inside it.
rfc_path = os.path.join(FOLDER_PATH, "rfc_model.pkl")
with open(rfc_path, 'wb') as f:
    pickle.dump(rfc, f)

In [None]:
# Confusion matrix of the random forest predictions on the test set, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.set(font_scale=1.4)
sns.heatmap(cm, annot=True, cmap='Blues', cbar=False, fmt='g')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix - Random Forest')
plt.show()

In [None]:
# Linear-kernel SVM; probability=True is required for the predict_proba call below.
clf = svm.SVC(kernel='linear',probability=True)

clf.fit(x_train, y_train)
# Class probabilities for the test set (not used by the remaining cells shown here).
y_pred_model_svm = clf.predict_proba(x_test)
y_pred_labels_svm = clf.predict(x_test)
print(metrics.classification_report(y_test, y_pred_labels_svm, digits=5))

In [None]:
# Confusion matrix of the SVM predictions on the test set, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_labels_svm)
sns.set(font_scale=1.4)
sns.heatmap(cm, annot=True, cmap='Blues', cbar=False, fmt='g')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix - SVM')
plt.show()