# FACADE: Fake Article Classification and Decision Explanation

This notebook runs the **FACADE** system locally.

After installing the libraries listed in *requirements.txt*, just run the entire notebook and have fun! 

## Importing libraries

In [1]:
import os
import numpy as np
import pandas as pd
import requests
import time
from urllib.error import HTTPError
from boilerpy3 import extractors
import validators
import re
import contractions
import unicodedata
import ftfy
import secrets
import threading
import random
import nltk
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.wsd import lesk
import language_tool_python
import textstat
from textblob import TextBlob
import string
import concurrent.futures
import pickle
from collections import Counter, defaultdict
from operator import itemgetter
import itertools
import tensorflow as tf
import spacy
from sentence_transformers import util, SentenceTransformer
from transformers import pipeline
from gensim.parsing.preprocessing import remove_stopwords
import shap
import html
import dash_bootstrap_components as dbc
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from flask import Flask, request, render_template, jsonify

import warnings
warnings.filterwarnings("ignore")

# NLTK resources required by the cells below. Downloads are skipped when the
# package is already up to date, so re-running this cell is cheap.
nltk.download('punkt')                       # sent_tokenize / word_tokenize
nltk.download('stopwords')                   # stopwords.words('english')
nltk.download('brown')                       # TextBlob noun-phrase extraction
nltk.download('averaged_perceptron_tagger')  # nltk.pos_tag (used by ie_preprocess)
nltk.download('maxent_ne_chunker')           # nltk.ne_chunk (used by extract_names)
nltk.download('words')                       # word list required by the NE chunker
nltk.download('wordnet')                     # wordnet.synsets (custom POS embeddings)
nltk.download('omw-1.4')


The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\purifica\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\purifica\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\purifica\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\purifica\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Define helper classes and variables

In [2]:
class FileLocker:
    """Cooperative file lock based on a ``<file>.lock`` marker file.

    The marker is created with exclusive-create semantics, so checking for the
    marker and creating it is a single atomic step. (The previous version
    checked for the marker outside the lock and then created it, which allowed
    two threads to both believe they held the lock.)

    Note: a marker left behind by a crashed process is never cleaned up
    automatically; pass ``timeout`` to avoid waiting forever in that case.
    """

    def __init__(self):
        # Serialises marker creation/removal between threads of this process.
        self.lock = threading.Lock()

    def acquire_file_lock(self, file, timeout=None):
        """Block until the lock for ``file`` is obtained.

        Args:
            file: Path of the file to protect; the marker is ``file + ".lock"``.
            timeout: Optional maximum number of seconds to wait. ``None``
                (default) waits indefinitely, matching the original behaviour.

        Returns:
            True once the lock is held, False if ``timeout`` expired first.
        """
        lock_path = file + ".lock"
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            with self.lock:
                try:
                    # Mode "x" fails if the marker already exists, making
                    # "test and create" atomic.
                    with open(lock_path, "x") as f:
                        f.write("locked")
                    return True
                except FileExistsError:
                    pass
            if deadline is not None and time.monotonic() >= deadline:
                return False
            # Sleep outside the thread lock so other threads can release.
            time.sleep(0.05)

    def delete_file_lock(self, file):
        """Release the lock for ``file``; a missing marker is not an error.

        Returns:
            Always True.
        """
        with self.lock:
            try:
                os.remove(file + ".lock")
            except FileNotFoundError:
                pass
        return True

In [3]:
# Shared lock helper that guards files through "<file>.lock" marker files.
file_locker = FileLocker()
# In-memory request store; check_for_old_requests() expires entries based on
# each entry's "time" field.
stored_requests = dict()
# NOTE(review): not referenced in the visible part of the notebook --
# presumably assigned by later serving code; confirm.
running_db = None

### Load standard classifiers

In [4]:
# Hugging Face pipelines and sentence encoders shared by the feature functions.
# Used by entailment_features().
Entailment_Classifier = pipeline("text-classification", model = "roberta-large-mnli")
# Used by sentiment_feature(); no model is named, so the library default is used.
Sentiment_Classifier = pipeline("sentiment-analysis")
# Used by zero_shot_learning_topic().
zero_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Sentence encoder passed to attribution_dberta_labels() as `attr_model`.
attr_model = SentenceTransformer('all-MiniLM-L6-v2')
attr_model.max_seq_length = 512
# Sentence encoder passed to topic_feature() as `dberta_model`.
dberta_model = SentenceTransformer('microsoft/deberta-v2-xlarge-mnli')
# Placeholder only: personality_feature() loads the Keras model itself.
model_for_personality = None

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
No sentence-transformers model found with name C:\Users\purifica/.cache\torch\sentence_transformers\microsoft_deberta-v2-xlarge-mnli. Creating a new one with 

### Load pickle files

In [5]:
# Relative folder holding the pickled artefacts loaded below and the saved
# "personality_model" loaded by personality_feature().
PICKLE_FILES_FOLDER = "pickle_files"

In [6]:
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only load pickle files that ship with this project or come from a trusted source.

# Pickle files PIPELINE-1
# Low-level model and its TF-IDF vectorizer.
with open(os.path.join(PICKLE_FILES_FOLDER, 'model_final_ll.pickle'), 'rb') as handle:
        model_ll = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'tf_idf_with_words_ll.pickle'), 'rb') as handle:
        tf_idf_vectorizer_ll = pickle.load(handle)

# Pickle files PIPELINE-2
# Vectorizer whose output is fed to the personality model (see personality_feature).
with open(os.path.join(PICKLE_FILES_FOLDER, 'tfidf_vect_ngram_personality.pickle'), 'rb') as handle:
        vec_for_personality = pickle.load(handle)

# Topic model: predict()/decision_function() are called on it in topic_feature().
with open(os.path.join(PICKLE_FILES_FOLDER, 'model_topics.pickle'), 'rb') as handle:
        model_for_topic = pickle.load(handle)

# Model whose predict() labels adjective embeddings (see custom_adjective_embeddings).
with open(os.path.join(PICKLE_FILES_FOLDER, 'adjective_embeddings_labels.pickle'), 'rb') as handle:
        models_for_adj = pickle.load(handle)

# Word -> embedding lookup used with `in` and .get() in custom_adjective_embeddings().
with open(os.path.join(PICKLE_FILES_FOLDER, 'glove_dict.pickle'), 'rb') as handle:
        glove_embeddings = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'df_Real_Markov_Analysis_trained.pickle'), 'rb') as handle:
        trained_markov_real_structure = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'df_Fake_Markov_Analysis_trained.pickle'), 'rb') as handle:
        trained_markov_fake_structure = pickle.load(handle)

# Per-document sentence embeddings compared against in attribution_dberta_labels().
with open(os.path.join(PICKLE_FILES_FOLDER, 'semantic_attribution_embeddings_trained.pickle'), 'rb') as handle:
        trained_dberta_embeddings = pickle.load(handle)

# Training data; its 'News' column is read as the label (1 is treated as "Real").
with open(os.path.join(PICKLE_FILES_FOLDER, 'data_train.pickle'), 'rb') as handle:
        trained_data = pickle.load(handle)

# Row lookup used via .iloc in attribution_doc2vec_labels().
with open(os.path.join(PICKLE_FILES_FOLDER, 'Mapping_Sentences_doc2vec_new.pickle'), 'rb') as handle:
        mapping_sent_doc = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'model_doc2vec.pickle'), 'rb') as handle:
        doc2vec_model = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'all_columns_seq_high_level_wo_tone.pickle'), 'rb') as handle:
        seq_of_col = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'model_highlevel_wo_tone.pickle'), 'rb') as handle:
        model_hl = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'all_columns_seq_high_level_wo_zero.pickle'), 'rb') as handle:
        seq_of_col_wo_zeroshot = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'model_highlevel_wo_zeroshot.pickle'), 'rb') as handle:
        model_hl_wo_zeroshot = pickle.load(handle)

# Pickle files EXPLAINABILITY
with open(os.path.join(PICKLE_FILES_FOLDER, 'model_highlevel_reduced_UI_use_explain.pickle'), 'rb') as handle:
        model_explain = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'all_sentences_training_for_explain.pickle'), 'rb') as handle:
        all_sentences_training = pickle.load(handle)

with open(os.path.join(PICKLE_FILES_FOLDER, 'col_keep_for_explaination.pickle'), 'rb') as handle:
        col_explain = pickle.load(handle)

### Feature explanations

In [7]:
# Display metadata for the low-level features.
# Maps each feature column name produced by low_level_features() to a
# (short display label, one-sentence description) tuple.
explain_dict_ll = {
    'length_body' : ('Length of Main Text','The length of the main body text (i.e. number of words in the news).'),
    'Type_Token_body' : ('TTR of Main Text','The type token ratio (number of unique words compared to all words used) of the main body text.'),
    'average_wl_body' : ('Avg. Word Length of Main Text','The average word length inside the main body text.'),
    'length_headline' : ('Length of Headline','The length of the title of the news in number of words.'),
    'Type_Token_headlines' : ('TTR of Headline','The type token ratio (number of unique words compared to all words used) of the title of the news.'),
    'average_wl_headline' : ('Avg. Word Length of Headline','The average word length inside the title of the news.'),
    'sentence_count' : ('No. of Sentences','The total of sentences of the news.'),
    'Avg_Words_Sentence' : ('Avg. Words per Sentence','The average number of words per sentence.'),
    'count_names' : ('No. of Names','The total amount of person names inside the news article.'),
    'count_grammar_mistakes' : ('No. of Grammar Mistakes','The amount of grammatical mistakes inside the text.'),
    'ease_reading_score' : ('Reading Score','The reading score (denoting how easy a text is to read) for the text.'),
    'num_pronouns' : ('No. of Pronouns','The number of pronouns used in the text.'),
    'num_poss_pronouns' : ('No. of Poss. Pronouns','The number of possesive pronouns used in the text.'),
    'num_noun_phrases' : ('No. of Noun Phrases','The number of noun phrases (e.g. the news) used in the text.'),
    'pol' : ('Polarity','The polarity of the text, i.e. how positive or negative the text is written.'),
    'sub' : ('Subjectivity','The subjectivity of the text, i.e. how many opinionated words are used.'),
    'NV' : ('Ratio of Noun-Verb','The ratio of nouns compared to verbs (nouns/verbs).'),
    'NA' : ('Ratio of Noun-Adjective','The ratio of nouns compared to adjectives (nouns/adjectives).'),
    'VN' : ('Ratio of Verb-Noun','The ratio of verbs to nouns (verbs/nouns).'),
    'VA' : ('Ratio of Verb-Adjective','The ratio of verbs to adjectives (verbs/adjectives).'),
    'AN' : ('Ratio of Adjective-Noun','The ratio of adjectives to nouns (adjectives/nouns).'),
    'AV' : ('Ratio of Adjective-Verb','The ratio of adjectives to verbs (adjectives/verbs).'),
    'num_digits' : ('No. of Digits','How many digits are found inside the text.'),
    'stop_words' : ('No. of Stopwords','How many stopwords are found inside the text. Stopwords are words, that do not contain meaningful information, e.g. words like of, the, in, etc.'),
    'upper_case_words' : ('No. of Capitalized Words','The count of words that are capitalized inside the text.'),
    'num_puntuation' : ('No. of Punctuation','The amount of punctuation used in the text.'),
    'num_char' : ('No. of Characters','The amount of characters the text is made of.')
}

# Display metadata for high-level features.
# Maps a feature name to a (short display label, one-sentence description) tuple.
# NOTE(review): the keys presumably match columns of the high-level feature
# frame built later in the notebook -- confirm against that code.
explain_dict_hl = {
    'Real_weight' : ('Ratio of Real Sentences', 'The ratio of sentences, that are attributed as being real, compared to fake sentences (attributed to being wrong).'),
    'markov_pred' : ('Markov Prediction', 'Whether the markov model predicted the structure of the news article to be real or fake.'),
    'End_Real' : ('Ended on Real Sentence', 'Whether the last sentence in the news article was attributed to being real.'),
    'INTERNATIONAL POLITICAL_Norm' : ('International Politics Topic', 'The normalized (over all topics) value for the appearance of the topic "International Politics" inside the news article.')
}

## Define methods

In [8]:
# Check and handle old requests
def check_for_old_requests(max_age_seconds=600):
    """Drop entries from ``stored_requests`` that are older than the limit.

    Args:
        max_age_seconds: Maximum age of an entry, measured against its
            ``"time"`` field. Defaults to 600, the previously hard-coded value.

    The dictionary is modified in place. Iteration happens over a snapshot and
    removal uses ``pop`` with a default, so an entry removed elsewhere in the
    meantime does not raise.
    """
    now = time.time()
    expired_keys = [
        key
        for key, entry in list(stored_requests.items())
        if now - entry["time"] > max_age_seconds
    ]
    for key in expired_keys:
        stored_requests.pop(key, None)

In [9]:
# Read content from URL
def read_content_from_url(news_url, timeout=30):
    """Download an article and return its main text and title.

    The page is first fetched through boilerpy3's ``get_doc_from_url``. If that
    raises ``urllib.error.HTTPError``, the page is fetched with ``requests``
    instead, and the first line of the extracted text is used as the title.

    Args:
        news_url: URL of the article. NOTE: this is user-supplied input, so the
            server will fetch whatever address is given; restrict reachable
            hosts if the service is exposed beyond local use.
        timeout: Seconds to wait for the fallback ``requests`` call. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        Tuple ``(content, title)``.

    Raises:
        Exception: if the fallback request returns a non-OK status code.
    """
    extractor = extractors.ArticleExtractor()
    try:
        doc = extractor.get_doc_from_url(news_url)
    except HTTPError:
        resp = requests.get(news_url, timeout=timeout)
        if not resp.ok:
            raise Exception(f'Failed to get URL: {resp.status_code}')
        # First line of the extracted text is treated as the title.
        title, _, content = extractor.get_content(resp.text).partition('\n')
    else:
        content = doc.content
        # Pages without a <title> yield None; keep only the part before "|".
        title = (doc.title or '').split('|')[0]

    return content, title

In [10]:
# Prepare data
def _clean_text(raw):
    """Apply the shared normalisation pipeline to a body or headline string."""
    cleaned = re.sub(r'\S*@\S*\s?', '', raw)          # drop e-mail-like tokens
    cleaned = re.sub(r'\s+', ' ', cleaned)            # collapse whitespace (incl. newlines)
    cleaned = contractions.fix(cleaned)
    cleaned = cleaned.strip()
    cleaned = re.sub("'", "", cleaned)
    # Turns "you." directly followed by an alphanumeric into "U."
    # (presumably reverting an unwanted contraction expansion such as "U.S.").
    cleaned = re.sub(r'you\.(?=[a-zA-Z0-9])', 'U.', cleaned)
    cleaned = re.sub(r'https\S+', '', cleaned)
    cleaned = re.sub(r'http\S+', '', cleaned)
    cleaned = re.sub(r'www\S+', '', cleaned)
    # Repair mojibake, then strip everything that is not plain ASCII.
    return unicodedata.normalize('NFKD', ftfy.fix_text(cleaned)).encode('ascii', 'ignore').decode("utf-8")


def prepare_data(text, headline = None):
    """Resolve the input to (body, title) and normalise both strings.

    Args:
        text: Either the article text itself or a URL to download it from.
        headline: Title to use when ``text`` is not a URL. ``None`` is treated
            as an empty title (previously this raised ``TypeError``).

    Returns:
        ``[data, title]`` with both strings cleaned.
    """
    if validators.url(text):
        data, title = read_content_from_url(text)
    else:
        data = text
        title = headline

    # A missing headline or page title must not crash the regex pipeline.
    if title is None:
        title = ''

    return [_clean_text(data), _clean_text(title)]

In [11]:
# Divide
def divide(num1, num2):
    """Return ``num1 / num2``, or 0 when the divisor is zero."""
    return num1 / num2 if num2 != 0 else 0

In [12]:
# IE Preprocess
def ie_preprocess(document):
    """Remove stop words, then sentence-split, tokenize and POS-tag the text.

    Args:
        document: Raw text.

    Returns:
        One list per sentence, each holding the output of ``pos_tag`` for that
        sentence's tokens.
    """
    # A set makes the per-token membership test O(1) instead of scanning the
    # whole stop-word list for every token. Matching stays case-sensitive.
    stop = set(stopwords.words('english'))
    document = ' '.join(token for token in document.split() if token not in stop)
    return [pos_tag(word_tokenize(sentence)) for sentence in sent_tokenize(document)]

In [13]:
# Extract names
def extract_names(document):
    """Return the text of every chunk that ``ne_chunk`` labels as PERSON.

    The document is tagged by ``ie_preprocess``; each PERSON chunk contributes
    one entry consisting of its tokens joined by a single space.
    """
    return [
        ' '.join(leaf[0] for leaf in chunk)
        for tagged_sentence in ie_preprocess(document)
        for chunk in ne_chunk(tagged_sentence)
        if type(chunk) == Tree and chunk.label() == 'PERSON'
    ]

In [14]:
# Low-level features extraction
def low_level_features(text, vec, headline=None):
    """Compute the low-level (stylometric + TF-IDF) features of an article.

    Args:
        text: Article text or URL (resolved by ``prepare_data``).
        vec: Fitted vectorizer; ``vec.transform([data])`` must yield one value
            per entry of the TF-IDF column list below.
        headline: Optional headline for non-URL input.

    Returns:
        Tuple ``(df_low_level, data, title)``: a one-row DataFrame with the 27
        stylometric columns followed by the TF-IDF columns, plus the cleaned
        body and title.

    Changes compared to the previous version:
      * ratios use ``divide`` so an empty body or headline yields 0 instead of
        raising ``ZeroDivisionError``;
      * the LanguageTool instance is closed explicitly instead of being left
        to garbage collection;
      * the body is tokenized once instead of four times.
    """
    data, title = prepare_data(text, headline)

    tokenizer = RegexpTokenizer(r'[^\W’]+')

    # --- body / headline word statistics ---
    words = tokenizer.tokenize(data)
    length_body = len(words)
    average_wl_body = divide(sum(len(word) for word in words), length_body)
    Type_Token_body = divide(len(set(words)), length_body)

    words_headline = tokenizer.tokenize(title)
    length_headline = len(words_headline)
    average_wl_headline = divide(sum(len(word) for word in words_headline), length_headline)
    Type_Token_headlines = divide(len(set(words_headline)), length_headline)

    # --- sentence statistics (sentences with fewer than two words are ignored) ---
    sentences = [elem for elem in sent_tokenize(data) if len(re.findall(r'\w+', elem)) > 1]
    sentence_count = len(sentences)
    Avg_Words_Sentence = divide(length_body, sentence_count)

    count_names = len(extract_names(data))

    # --- grammar check; always release the LanguageTool instance ---
    tool = language_tool_python.LanguageTool('en-US')
    try:
        count_grammar_mistakes = len(tool.check(data))
    finally:
        tool.close()

    ease_reading_score = textstat.text_standard(data, float_output=True)

    # --- part-of-speech and sentiment statistics ---
    blob = TextBlob(data)
    tags = [pos for (_, pos) in blob.pos_tags]
    num_pronouns = tags.count('PRP')
    num_poss_pronouns = tags.count('PRP$')
    num_noun_phrases = len(blob.noun_phrases)
    num_nouns = sum(1 for pos in tags if pos[0] == 'N')
    num_verbs = sum(1 for pos in tags if pos[0] == 'V')
    num_adj = sum(1 for pos in tags if pos[0] == 'J')
    pol = blob.sentiment[0]
    sub = blob.sentiment[1]
    NV = divide(num_nouns, num_verbs)
    NA = divide(num_nouns, num_adj)
    VN = divide(num_verbs, num_nouns)
    VA = divide(num_verbs, num_adj)
    AN = divide(num_adj, num_nouns)
    AV = divide(num_adj, num_verbs)

    # --- surface counts ---
    stop = set(stopwords.words('english'))
    num_digits = sum(1 for word in words if word.isdigit())
    stop_words = sum(1 for word in words if word in stop)
    upper_case_words = sum(1 for word in words if word.isupper())
    num_puntuation = sum(1 for char in data if char in string.punctuation)
    num_char = len(data)

    # --- TF-IDF features; the column order must match the vectorizer's output ---
    tf_idf_features = np.array(vec.transform([data]).todense())[0]
    tf_idf_features = pd.DataFrame(tf_idf_features).T
    tf_idf_features.columns = ['212', 'video', 'funeral', 'al', 'museum', 'family', 'terrorists', 'died', 'sister', 'post', 'play', 'devoted', 'en', 'league', 'trump','company', 'please', 'earth', 'daughter', 'coach', 'gun', 'music', 'graduated', 'theater', 'survived', '2017', 'husband', 'mother', 'street', 'gold', 'art', 'la', 'loving', 'grandchildren', 'talk', 'avenue', 'father', 'percent', 'wife', 'et', 'game', 'season', 'de', 'beloved']

    # Built as a column and transposed, exactly as before, so the resulting
    # dtypes stay unchanged for the downstream model.
    low_level = pd.DataFrame([length_body, Type_Token_body, average_wl_body, length_headline, Type_Token_headlines, average_wl_headline, sentence_count, Avg_Words_Sentence, count_names, count_grammar_mistakes, ease_reading_score, num_pronouns, num_poss_pronouns, num_noun_phrases, pol, sub, NV, NA, VN, VA, AN, AV, num_digits, stop_words, upper_case_words, num_puntuation, num_char]).T
    low_level.columns = ['length_body', 'Type_Token_body', 'average_wl_body', 'length_headline', 'Type_Token_headlines', 'average_wl_headline', 'sentence_count', 'Avg_Words_Sentence', 'count_names', 'count_grammar_mistakes', 'ease_reading_score', 'num_pronouns', 'num_poss_pronouns', 'num_noun_phrases', 'pol', 'sub', 'NV', 'NA', 'VN', 'VA', 'AN', 'AV', 'num_digits', 'stop_words', 'upper_case_words', 'num_puntuation', 'num_char']
    df_low_level = pd.concat([low_level, tf_idf_features], axis=1)

    return (df_low_level, data, title)

In [15]:
# Prepare sentences for Pipeline 2
def prepare_sentences_pipeline2(text):
    """Split text into sentences and into overlapping pairs of adjacent sentences.

    Sentences with fewer than two words are discarded. Pair ``i`` is sentence
    ``i`` joined with sentence ``i + 1`` by a single space, so ``n`` sentences
    produce ``n - 1`` pairs.

    Fixes compared to the previous version:
      * empty input no longer raises ``IndexError``;
      * the trailing single-sentence "pair" is never built, instead of being
        removed by value (which deleted the wrong element whenever an earlier
        pair happened to equal the last sentence).

    Returns:
        ``[sentences, sentence_pairs]``.
    """
    sentences = [elem for elem in sent_tokenize(text) if len(re.findall(r'\w+', elem)) > 1]
    sentence_pairs = [' '.join(sentences[i:i + 2]) for i in range(len(sentences) - 1)]
    return [sentences, sentence_pairs]

In [16]:
# Chunk sentences
def chunk_sentences(sentences):
    """Group sentences into at most three space-joined chunks.

    The chunk size is ``round(len(sentences) / 3)`` with a minimum of 1. When
    this yields more than three chunks, everything from the third chunk onward
    is merged into the third one.

    Args:
        sentences: List of sentence strings.

    Returns:
        List of up to three strings; an empty list for empty input
        (previously this raised ``ValueError`` because the step was 0).
    """
    length = len(sentences)
    if length == 0:
        return []

    chunk_size = max(1, round(length / 3))
    chunked_sentences = [
        ' '.join(sentences[start:start + chunk_size])
        for start in range(0, length, chunk_size)
    ]

    if len(chunked_sentences) > 3:
        chunked_sentences[2:] = [' '.join(chunked_sentences[2:])]

    return chunked_sentences

In [17]:
# Compute Entailment features
def entailment_features(sentence_pairs, sentences):
    """Summarise the entailment classifier's predictions for an article.

    The classifier is applied to ``sentence_pairs``, or to ``sentences`` when
    there are no pairs.

    Args:
        sentence_pairs: Adjacent-sentence pairs (may be empty).
        sentences: Individual sentences, used as fallback input.

    Returns:
        ``[labels, df_share, df_score]`` where ``labels`` is the predicted
        label per input, ``df_share`` is a one-row DataFrame with the share of
        inputs per label, and ``df_score`` is a one-row DataFrame with the mean
        classifier score per label. Labels that were never predicted get 0.

    Changes compared to the previous version:
      * the classifier runs once per input instead of twice;
      * empty input returns all-zero frames instead of raising;
      * columns for labels that were never predicted are appended in sorted
        order (previously set iteration order, which varies between runs).
    """
    labels = ['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL']
    inputs = sentence_pairs if len(sentence_pairs) != 0 else sentences

    predictions = [Entailment_Classifier(elem, truncation=True)[0] for elem in inputs]
    predicted_labels = [prediction['label'] for prediction in predictions]

    counts = Counter(predicted_labels)
    total = len(predicted_labels)
    scores_by_label = defaultdict(list)
    for prediction in predictions:
        scores_by_label[prediction['label']].append(prediction['score'])

    # Predicted labels first (sorted), then the expected labels that never occurred.
    present = sorted(counts)
    column_order = present + sorted(set(labels) - set(present))

    df_entailment_type1 = pd.DataFrame(
        {label: [counts[label] / total if label in counts else 0] for label in column_order}
    )
    df_entailment_type2 = pd.DataFrame(
        {label: [np.mean(scores_by_label[label]) if label in counts else 0] for label in column_order}
    )

    return [predicted_labels, df_entailment_type1, df_entailment_type2]

In [18]:
# Compute Personality feature
def personality_feature(text, vec_for_personality, model_for_personality):
    """Predict the four binary personality traits for a text.

    Args:
        text: Cleaned article text.
        vec_for_personality: Fitted vectorizer; ``transform([text])`` must
            return an object with ``todense()``.
        model_for_personality: Model exposing ``predict``. When ``None``, the
            Keras model stored under ``PICKLE_FILES_FOLDER/personality_model``
            is loaded once and reused on later calls. (Previously the argument
            was ignored and the model was reloaded from disk on every call.)

    Returns:
        One-row DataFrame with columns
        ``['Extrovert', 'Sensing', 'Feeling', 'Perceiving']`` holding the
        rounded predictions.
    """
    if model_for_personality is None:
        cached_model = getattr(personality_feature, "_cached_model", None)
        if cached_model is None:
            cached_model = tf.keras.models.load_model(os.path.join(PICKLE_FILES_FOLDER, "personality_model"))
            # NOTE(review): the cached model is shared between calls; confirm this
            # is acceptable if requests are served from several threads.
            personality_feature._cached_model = cached_model
        model_for_personality = cached_model

    personality_temp = np.array(vec_for_personality.transform([text]).todense())
    predictions = model_for_personality.predict(personality_temp.reshape(1, -1)).round()[0]
    df_personality = pd.DataFrame(predictions).T
    df_personality.columns = ['Extrovert', 'Sensing', 'Feeling', 'Perceiving']
    return (df_personality)

In [19]:
# Compute Sentiment feature
def sentiment_feature(sentences):
    """Summarise the sentiment classifier's predictions for the sentences.

    Args:
        sentences: List of sentence strings.

    Returns:
        ``[df_share, df_score]``: two one-row DataFrames holding, per label,
        the share of sentences with that label and the mean classifier score
        of those sentences. Labels that were never predicted get 0.

    Changes compared to the previous version:
      * the classifier runs once per sentence instead of twice;
      * empty input returns all-zero frames instead of raising;
      * columns for labels that were never predicted are appended in sorted
        order (previously set iteration order, which varies between runs).
    """
    labels = ['POSITIVE', 'NEGATIVE']

    predictions = [Sentiment_Classifier(elem, truncation=True)[0] for elem in sentences]

    counts = Counter(prediction['label'] for prediction in predictions)
    total = len(predictions)
    scores_by_label = defaultdict(list)
    for prediction in predictions:
        scores_by_label[prediction['label']].append(prediction['score'])

    # Predicted labels first (sorted), then the expected labels that never occurred.
    present = sorted(counts)
    column_order = present + sorted(set(labels) - set(present))

    df_sentiment_type1 = pd.DataFrame(
        {label: [counts[label] / total if label in counts else 0] for label in column_order}
    )
    df_sentiment_type2 = pd.DataFrame(
        {label: [np.mean(scores_by_label[label]) if label in counts else 0] for label in column_order}
    )

    return [df_sentiment_type1, df_sentiment_type2]

In [20]:
# Compute Topic features
def topic_feature(text, sentences, dberta_model, model_for_topic):
    """Predict topics for the whole text and for each sentence.

    Args:
        text: Full article text.
        sentences: List of sentences of the article.
        dberta_model: Encoder exposing ``encode``.
        model_for_topic: Model exposing ``predict`` and ``decision_function``.

    Returns:
        ``[topic_raw, topic_embedding, topic_sentence]``: the raw document
        prediction, that prediction divided by its sum (left as-is when the sum
        is zero), and the argmax of the decision function per sentence.
    """
    document_vector = dberta_model.encode(text).reshape(1, -1)
    topic_raw = model_for_topic.predict(document_vector)[0]

    if np.sum(topic_raw) == 0:
        topic_embedding = topic_raw
    else:
        raw_array = np.array(topic_raw)
        topic_embedding = raw_array / np.sum(raw_array)

    sentence_vectors = dberta_model.encode(sentences)
    sentence_scores = model_for_topic.decision_function(sentence_vectors)
    topic_sentence = np.argmax(sentence_scores, axis=1)

    return [topic_raw, topic_embedding, topic_sentence]

In [21]:
# Compute Zero-shot learning topics feature
def zero_shot_learning_topic(text):
    """Score the text against fixed topics with the zero-shot classifier.

    Args:
        text: Cleaned article text.

    Returns:
        ``[df_doc_zero_topics, zero_shot_topic_sentences]``: a one-row
        DataFrame with the document-level score per topic (columns in
        alphabetical order), and the top-ranked topic for each sentence.

    Changes compared to the previous version:
      * the top label is read from the ``'labels'`` key instead of relying on
        the order of the result dictionary's values;
      * a single-sentence result returned as a bare dict is handled;
      * an article without usable sentences yields an empty sentence list
        instead of calling the classifier with no input.
    """
    candidate_labels = ['war', 'government', 'politics', 'education', 'health', 'environment', 'economy', 'business', 'entertainment', 'sports']

    # Document-level scores, ordered alphabetically by label.
    result = zero_classifier(text, candidate_labels)
    temp = pd.DataFrame({'labels': result['labels'], 'score': result['scores']})
    temp = temp.sort_values(by=['labels']).reset_index(drop=True)
    headers = sorted(candidate_labels)
    df_doc_zero_topics = pd.DataFrame(temp.T.values[1:], columns=headers)

    # Sentence-level top topic.
    sentences, _sentence_pairs = prepare_sentences_pipeline2(text)
    if not sentences:
        return [df_doc_zero_topics, []]

    sentence_results = zero_classifier(sentences, candidate_labels)
    if isinstance(sentence_results, dict):
        # Some pipeline versions unwrap a single-element batch.
        sentence_results = [sentence_results]
    zero_shot_topic_sentences = [entry['labels'][0] for entry in sentence_results]

    return [df_doc_zero_topics, zero_shot_topic_sentences]

In [22]:
# Compute POS segment
def pos_segment(sentences):
    """POS-tag the article in up to three chunks using spaCy.

    Args:
        sentences: List of sentence strings; grouped by ``chunk_sentences``.

    Returns:
        One list per chunk containing ``(token_text, pos)`` tuples.

    The spaCy pipeline is loaded on first use and cached on the function
    object. Previously it was reloaded on every call, and this function is
    called once each by the adjective, verb and noun embedding functions.
    """
    nlp = getattr(pos_segment, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        pos_segment._nlp = nlp

    return [
        [(token.text, token.pos_) for token in nlp(chunk)]
        for chunk in chunk_sentences(sentences)
    ]

In [23]:
# Compute custom adjective embeddings
def custom_adjective_embeddings(sentences, models_adj, glove_embeddings):
    """Build an 8-bin distribution over adjective clusters for the article.

    Steps:
      1. collect tokens tagged ``ADJ`` by ``pos_segment``;
      2. for each, take the first WordNet adjective synset and keep it only if
         the ``ss_type`` field of its first lemma's sense key is ``'5'``;
      3. look up an embedding for the sense key's head word, falling back to
         the lemma itself; adjectives without an embedding are dropped;
      4. let ``models_adj.predict`` assign each embedding to a bin and return
         the relative frequency per bin.

    Args:
        sentences: List of sentence strings.
        models_adj: Model whose ``predict`` returns one integer bin index
            (0-7) per embedding.
        glove_embeddings: Mapping word -> embedding vector.

    Returns:
        ``numpy.ndarray`` of length 8 (all zeros if no adjective qualifies).

    Compared to the previous version, missing embeddings are skipped directly
    instead of being filtered with ``"None".__ne__``, which relied on the
    truthiness of ``NotImplemented`` (deprecated in recent Python versions).
    """
    n_bins = 8

    adjectives = [
        token_text
        for segment in pos_segment(sentences)
        for token_text, pos in segment
        if pos == 'ADJ'
    ]

    # (head word, lemma) for every adjective whose sense key has ss_type '5'.
    categories_Adj = []
    for adjective in adjectives:
        synsets = wordnet.synsets(adjective, pos=wordnet.ADJ)
        if not synsets:
            continue
        sense_key = synsets[0].lemmas()[0].key()
        ss_type = sense_key.split("%", 1)[1].split(":")[0]
        if ss_type == '5':
            categories_Adj.append((sense_key.split(':', 4)[3], sense_key.split('%', 4)[0]))

    embeddings = []
    for head_word, lemma in categories_Adj:
        if head_word in glove_embeddings:
            embeddings.append(glove_embeddings.get(head_word))
        elif lemma in glove_embeddings:
            embeddings.append(glove_embeddings.get(lemma))

    topic_embedding_adj = np.zeros(n_bins)
    if len(embeddings) != 0:
        labels_adj = models_adj.predict(embeddings)
        num_seg = len(labels_adj)
        for index in labels_adj:
            topic_embedding_adj[index] += 1 / num_seg

    return (topic_embedding_adj)

In [24]:
# Compute custom verb embeddings
def custom_verb_embeddings(sentences):
    """Build a distribution over WordNet verb lexicographer files (29-43).

    For every token tagged ``VERB`` by ``pos_segment`` that has a WordNet verb
    synset, the second ``:``-separated field of the first lemma's sense key
    selects a bin. Each such verb contributes ``1 / n`` to its bin, where ``n``
    is the number of verbs with a synset.

    Args:
        sentences: List of sentence strings.

    Returns:
        ``numpy.ndarray`` of length 15 (all zeros if no verb has a synset).

    Fix: a category outside 29-43 used to be turned into the index ``[None]``,
    which NumPy either rejects or interprets as a new axis (adding the weight
    to every bin). Such categories are now skipped.
    """
    verb_topic_list = [str(i) for i in range(29, 44)]
    bin_of = {code: position for position, code in enumerate(verb_topic_list)}

    verbs = [
        token_text
        for segment in pos_segment(sentences)
        for token_text, pos in segment
        if pos == 'VERB'
    ]

    categories_V = []
    for verb in verbs:
        synsets = wordnet.synsets(verb, pos=wordnet.VERB)
        if synsets:
            categories_V.append(synsets[0].lemmas()[0].key().split(':', 2)[1])

    topic_embedding_verb = np.zeros(len(verb_topic_list))
    num_seg = len(categories_V)
    for category in categories_V:
        position = bin_of.get(category)
        if position is not None:
            topic_embedding_verb[position] += 1 / num_seg

    return (topic_embedding_verb)

In [25]:
# Compute custom noun embeddings
def custom_noun_embeddings(sentences):
    """Build a distribution over WordNet noun lexicographer files (03-28).

    For every token tagged ``NOUN`` by ``pos_segment`` that has a WordNet noun
    synset, the second ``:``-separated field of the first lemma's sense key
    selects a bin. Each such noun contributes ``1 / n`` to its bin, where ``n``
    is the number of nouns with a synset.

    Args:
        sentences: List of sentence strings.

    Returns:
        ``numpy.ndarray`` of length 26 (all zeros if no noun has a synset).

    Fix: a category outside 03-28 used to be turned into the index ``[None]``,
    which NumPy either rejects or interprets as a new axis (adding the weight
    to every bin). Such categories are now skipped.
    """
    noun_topic_list = ["%.2d" % i for i in range(3, 29)]
    bin_of = {code: position for position, code in enumerate(noun_topic_list)}

    nouns = [
        token_text
        for segment in pos_segment(sentences)
        for token_text, pos in segment
        if pos == 'NOUN'
    ]

    categories_N = []
    for noun in nouns:
        synsets = wordnet.synsets(noun, pos=wordnet.NOUN)
        if synsets:
            categories_N.append(synsets[0].lemmas()[0].key().split(':', 2)[1])

    topic_embedding_noun = np.zeros(len(noun_topic_list))
    num_seg = len(categories_N)
    for category in categories_N:
        position = bin_of.get(category)
        if position is not None:
            topic_embedding_noun[position] += 1 / num_seg

    return (topic_embedding_noun)

In [26]:
# Compute attribution via dberta labels
def attribution_dberta_labels(trained_dberta_embeddings, sentences,attr_model, trained_data):
    """Attribute each sentence to its most similar training sentence.

    Args:
        trained_dberta_embeddings: Sequence with one entry per training
            document; each entry is passed to ``util.cos_sim`` as the second
            argument (presumably a 2-D array of sentence embeddings -- confirm).
        sentences: Sentences of the article under analysis.
        attr_model: Encoder exposing ``encode``.
        trained_data: Indexable by ``['News'][document_index]``; a value of 1
            is labelled "Real", anything else "Fake".

    Returns:
        ``[dist_with_query, Attribution_labels, Attribution_labels_Normalised]``.
        ``dist_with_query`` is a one-element list holding, per sentence, a
        tuple ``(similarity, (document_index, sentence_index))``.
        ``Attribution_labels`` has one entry per sentence, while
        ``Attribution_labels_Normalised`` omits the "Unknown" entries.
    """
    # Stop words are stripped before encoding the query sentences.
    data_wo_sw = [remove_stopwords(elem) for elem in sentences]
    query_embeddings = attr_model.encode(data_wo_sw)
    dist_with_query = []
    temp = []
    # One cosine-similarity matrix (query sentences x training sentences) per training document.
    for j in range(len(trained_dberta_embeddings)):
        temp.append(util.cos_sim(query_embeddings, trained_dberta_embeddings[j]))
    # Per document: best similarity for each query sentence and where it occurs.
    max_dist_new = [np.amax(np.array(elem), axis=1) for elem in temp]
    max_dist_index = [np.argmax(np.array(elem), axis=1) for elem in temp]
    max_dist_per_document = []
    for k in range(len(max_dist_new)):
        max_dist_per_document.append([])
        for l in range(len(max_dist_new[k])):
            max_dist_per_document[k].append((max_dist_new[k][l], (k, max_dist_index[k][l])))
    # For each query sentence i, keep the largest (similarity, location) tuple across all documents.
    dist_with_query.append([max(max_dist_per_document, key = lambda z: z[i])[i] for i in range(len(max_dist_per_document[0]))])

    # Despite the name, `min_dist` holds the highest similarities found above.
    min_dist = dist_with_query[0]
    average_per_document = np.mean([(a_tuple[0]) for a_tuple in min_dist])
    min_distance_documents = [(a_tuple[1]) for a_tuple in min_dist]
    min_distance_labels = [trained_data['News'][elem[0]] for elem in min_distance_documents]

    Attribution_labels = []
    Attribution_labels_Normalised = []
    # A match only counts if its similarity reaches beta times the article's mean best similarity.
    beta = 0.91
    for i in range(len(min_dist)):
        if min_dist[i][0] >= beta * average_per_document:
            if min_distance_labels[i] == 1:
                Attribution_labels.append((min_dist[i][0], "Real", min_dist[i][1]))
                Attribution_labels_Normalised.append((min_dist[i][0], "Real", min_dist[i][1]))
            else:
                Attribution_labels.append((min_dist[i][0], "Fake", min_dist[i][1]))
                Attribution_labels_Normalised.append((min_dist[i][0], "Fake", min_dist[i][1]))
        else:
            # Below the threshold: recorded as Unknown and left out of the normalised list.
            Attribution_labels.append((0, "Unknown", "No Similar Source"))

    return [dist_with_query, Attribution_labels, Attribution_labels_Normalised]

In [27]:
# Compute attribution via dberta embeddings
def attribution_dberta_embeddings(Attribution_labels, Attribution_labels_Normalised):
    """Turn per-sentence attribution labels into two fixed-size vectors.

    Args:
        Attribution_labels: One ``(similarity, label, source)`` tuple per
            sentence, with label in ``{'Fake', 'Real', 'Unknown'}``.
        Attribution_labels_Normalised: The same tuples without the
            ``'Unknown'`` entries.

    Returns:
        Tuple ``(share, weighted)``:
          * ``share`` -- length-3 array ``[Fake, Real, Unknown]`` with the
            fraction of sentences per label;
          * ``weighted`` -- length-2 array ``[Fake, Real]`` with each label's
            summed similarity divided by the total similarity (zeros when the
            total is 0).

    Fix: a label outside the expected set used to be turned into the index
    ``[None]``, which NumPy either rejects or interprets as a new axis. Such
    labels are now ignored.
    """
    Count_labels = ['Fake', 'Real', 'Unknown']
    count_position = {label: position for position, label in enumerate(Count_labels)}

    Unigram_Sentence_embedding = np.zeros(len(Count_labels))
    num_seg = len(Attribution_labels)
    for entry in Attribution_labels:
        position = count_position.get(entry[1])
        if position is not None:
            Unigram_Sentence_embedding[position] += 1 / num_seg

    # Similarities grouped by label.
    similarities_by_label = defaultdict(list)
    for entry in Attribution_labels_Normalised:
        similarities_by_label[entry[1]].append(entry[0])

    labels = ['Fake', 'Real']
    weighted_position = {label: position for position, label in enumerate(labels)}
    total = sum(sum(values) for values in similarities_by_label.values())

    Unigram_Sentence_embedding_weighted = np.zeros(len(labels))
    for label, values in similarities_by_label.items():
        position = weighted_position.get(label)
        if position is not None:
            Unigram_Sentence_embedding_weighted[position] += 0 if total == 0 else sum(values) / total

    return (Unigram_Sentence_embedding, Unigram_Sentence_embedding_weighted)

In [28]:
# Compute attribution via doc2vec labels
def attribution_doc2vec_labels(sentences, mapping_sent_doc, doc2vec_model, trained_data):
    """Label each sentence Real/Fake/Unknown from its doc2vec neighbour.

    Each sentence is stripped of stop words, lower-cased, tokenized and
    reduced to alphanumeric tokens, then embedded with
    ``doc2vec_model.infer_vector``. The entry at position 1 of
    ``docvecs.most_similar`` is taken as the neighbour; its tag indexes
    ``mapping_sent_doc`` by position, and element 0 of that row's column-1
    value indexes ``trained_data['News']``.

    A sentence is attributed only when its similarity is at least 0.96 times
    the mean similarity over all sentences; a ``News`` value of 1 gives
    "Real", anything else gives "Fake".

    Returns
    -------
    tuple of list
        ``(Attribution_labels, Attribution_labels_Normalised)``. The first
        has one ``(similarity, label, document)`` tuple per sentence, with
        ``(0, "Unknown", "No Similar Source")`` for sentences under the
        threshold. The second holds only the attributed tuples.
    """
    # Pre-processing: stop-word removal -> lower-case tokens -> alphanumeric only.
    token_lists = []
    for sentence in sentences:
        tokens = word_tokenize(remove_stopwords(sentence).lower())
        token_lists.append([token for token in tokens if token.isalnum()])

    inferred_vectors = [doc2vec_model.infer_vector(tokens) for tokens in token_lists]
    neighbours = [doc2vec_model.docvecs.most_similar(positive = [vector])[1] for vector in inferred_vectors]

    # (similarity, mapped document entry) for every sentence.
    min_dist = [(hit[1], mapping_sent_doc.iloc[hit[0]][1]) for hit in neighbours]
    mean_similarity = np.mean([pair[0] for pair in min_dist])
    neighbour_news = [trained_data['News'][pair[1][0]] for pair in min_dist]

    beta = 0.96
    threshold = beta * mean_similarity

    Attribution_labels = []
    Attribution_labels_Normalised = []
    for (similarity, document), news_value in zip(min_dist, neighbour_news):
        if similarity >= threshold:
            verdict = "Real" if news_value == 1 else "Fake"
            attributed = (similarity, verdict, document)
            Attribution_labels.append(attributed)
            Attribution_labels_Normalised.append(attributed)
        else:
            Attribution_labels.append((0, "Unknown", "No Similar Source"))

    return (Attribution_labels, Attribution_labels_Normalised)

In [29]:
# Compute attribution via doc2vec embeddings
def attribution_doc2vec_embeddings(Attribution_labels, Attribution_labels_Normalised):
    """Summarise doc2vec attribution labels as two fixed-length vectors.

    Parameters
    ----------
    Attribution_labels : sequence of tuple
        Tuples whose element 1 is a label string. Labels other than
        'Fake', 'Real' and 'Unknown' are ignored.
    Attribution_labels_Normalised : sequence of tuple
        Tuples whose element 0 is a numeric score and element 1 a label
        string. Labels other than 'Fake' and 'Real' still count towards the
        score total but are not written to the output.

    Returns
    -------
    tuple of numpy.ndarray
        ``(label_share, score_share)``.
        ``label_share`` (Fake, Real, Unknown) holds the fraction of
        ``Attribution_labels`` entries carrying each label.
        ``score_share`` (Fake, Real) holds each label's summed score divided
        by the summed score of all entries (0 when that total is 0).
    """
    count_labels = ['Fake', 'Real', 'Unknown']
    label_share = np.zeros(len(count_labels))
    num_seg = len(Attribution_labels)
    for entry in Attribution_labels:
        label = entry[1]
        # Unknown labels are skipped. Indexing with [None], as the fallback
        # did before, adds to every slot on NumPy < 1.23 and raises
        # IndexError on NumPy >= 1.23.
        if label in count_labels:
            label_share[count_labels.index(label)] += 1 / num_seg

    # Group the scores by label.
    scores_by_label = defaultdict(list)
    for entry in Attribution_labels_Normalised:
        scores_by_label[entry[1]].append(entry[0])

    weighted_labels = ['Fake', 'Real']
    total = sum([sum(scores) for scores in scores_by_label.values()])
    score_share = np.zeros(len(weighted_labels))
    for label, scores in scores_by_label.items():
        if label in weighted_labels:
            share = 0 if total == 0 else sum(scores) / total
            score_share[weighted_labels.index(label)] += share

    return (label_share, score_share)

In [30]:
# Compute Markov structure
def markov_structure(sentences, markov_real, markov_fake):
    """Predict real (1) or fake (0) from POS-tag transition probabilities.

    Every sentence is POS-tagged with TextBlob, wrapped in 'Start'/'End'
    markers and split into consecutive tag bigrams. For each of the two
    transition tables, a sentence's score is the product of ``norm_prob``
    over the bigrams found in the table's ``Bigram`` column; bigrams that are
    absent are left out of the product. A sentence votes 1 when its "real"
    score is strictly greater than its "fake" score, otherwise 0.

    Returns
    -------
    The single most frequent vote, or 0 when there is no unique mode
    (a tie, or no sentences at all).
    """
    def sentence_scores(bigram_lists, transition_table):
        # One product of matched transition probabilities per sentence.
        scores = []
        for bigrams in bigram_lists:
            matched = []
            for bigram in bigrams:
                hits = transition_table[transition_table['Bigram'] == bigram]['norm_prob']
                if len(hits.values) != 0:
                    matched.append(hits.values[0])
            scores.append(np.prod(matched))
        return scores

    tag_sequences = []
    for sentence in sentences:
        tags = [tagged[1] for tagged in TextBlob(sentence).tags]
        tag_sequences.append(['Start'] + tags + ['End'])

    bi_segments = [list(zip(tags, tags[1:])) for tags in tag_sequences]

    df_markov = pd.DataFrame({
        'real': sentence_scores(bi_segments, markov_real),
        'fake': sentence_scores(bi_segments, markov_fake),
    })
    df_markov['Pred'] = np.where(df_markov['real'] > df_markov['fake'], 1,0)

    modes = df_markov['Pred'].mode()
    return modes[0] if len(modes) == 1 else 0

In [31]:
# Compute x_attribute feature
def x_attribute_feature(attr_labels, topic_labels, Topic_Attribution_Labels, Topic_Attribution_Labels_type1, topics_for_sentences, zero_topics_for_sentences, zero_topic_labels, Zero_Topic_Attribution_Labels, Zero_Topic_Attribution_Labels_type1):
    """Cross topic transitions with attribution labels and histogram them.

    For a topic sequence ``t0, t1, ...`` the bigrams ``(t_i, t_{i+1})`` plus
    a final ``(t_last, 'End')`` are paired position-wise with the attribution
    labels (element 1 of each ``attr_labels`` tuple). That gives
    ``(topic, next_topic, attribution)`` triples and "type1"
    ``(next_topic, attribution)`` pairs; entries whose attribution is
    'Unknown' are dropped. Each list is then turned into a relative-frequency
    vector over the matching vocabulary.

    Parameters
    ----------
    attr_labels : sequence of tuple
        Element 1 of each tuple is the attribution label.
    topic_labels : indexable
        Maps the entries of ``topics_for_sentences`` to topic names.
    Topic_Attribution_Labels, Topic_Attribution_Labels_type1 : sequence
        Vocabularies for the triples and pairs; one output slot per entry.
    topics_for_sentences : sequence
        Topic keys, resolved through ``topic_labels``.
    zero_topics_for_sentences : sequence
        Topic sequence used as-is (no lookup through a label table).
    zero_topic_labels
        Not read; kept so existing callers keep working.
    Zero_Topic_Attribution_Labels, Zero_Topic_Attribution_Labels_type1 : sequence
        Vocabularies for the ``zero_topics_for_sentences`` variants.

    Returns
    -------
    tuple
        ``(Topics_Attribution, Topics_Attribution_embedding,
        Topics_Attribution_type1, Topics_Attribution_embedding_type1,
        Zero_Topics_Attribution_embedding,
        Zero_Topics_Attribution_embedding_type1)``.
    """
    attribute_labels = [entry[1] for entry in attr_labels]

    def cross_with_attribution(topic_sequence):
        # Build the filtered (topic, next_topic, attribution) triples and
        # (next_topic, attribution) pairs for one topic sequence.
        bigrams = list(zip(topic_sequence, topic_sequence[1:]))
        if len(topic_sequence) > 0:  # an empty sequence has no last topic
            bigrams.append((topic_sequence[-1], 'End'))
        triples = [bigram + (label,) for bigram, label in zip(bigrams, attribute_labels)]
        pairs = [(triple[1], triple[2]) for triple in triples]
        return ([triple for triple in triples if triple[2] != 'Unknown'],
                [pair for pair in pairs if pair[1] != 'Unknown'])

    def histogram(observed, vocabulary):
        # Relative frequency of each vocabulary entry among `observed`.
        # Items missing from the vocabulary are skipped: the previous
        # fallback indexed with [None], which adds to every slot on
        # NumPy < 1.23 and raises IndexError on NumPy >= 1.23.
        # NOTE(review): confirm the downstream model does not rely on the
        # old-NumPy "add to every slot" behaviour.
        embedding = np.zeros(len(vocabulary))
        for item in observed:
            slots = [idx for idx, val in enumerate(vocabulary) if val == item]
            if slots:
                embedding[slots] += 1 / len(observed)
        return embedding

    uni_topic_sentences = [topic_labels[topic] for topic in topics_for_sentences]
    Topics_Attribution, Topics_Attribution_type1 = cross_with_attribution(uni_topic_sentences)
    Topics_Attribution_embedding = histogram(Topics_Attribution, Topic_Attribution_Labels)
    Topics_Attribution_embedding_type1 = histogram(Topics_Attribution_type1, Topic_Attribution_Labels_type1)

    Zero_Topics_Attribution, Zero_Topics_Attribution_type1 = cross_with_attribution(zero_topics_for_sentences)
    Zero_Topics_Attribution_embedding = histogram(Zero_Topics_Attribution, Zero_Topic_Attribution_Labels)
    Zero_Topics_Attribution_embedding_type1 = histogram(Zero_Topics_Attribution_type1, Zero_Topic_Attribution_Labels_type1)

    return (Topics_Attribution, Topics_Attribution_embedding, Topics_Attribution_type1, Topics_Attribution_embedding_type1, Zero_Topics_Attribution_embedding, Zero_Topics_Attribution_embedding_type1)

In [32]:
# Compute x_attribute feature without zero-shot learning topics
def x_attribute_feature_wo_zeroshot(attr_labels, topic_labels, Topic_Attribution_Labels, Topic_Attribution_Labels_type1,topics_for_sentences):
    """Cross topic transitions with attribution labels (no zero-shot topics).

    The topic keys in ``topics_for_sentences`` are resolved through
    ``topic_labels``. Consecutive topics form bigrams, closed by a final
    ``(last_topic, 'End')``, and are paired position-wise with the
    attribution labels (element 1 of each ``attr_labels`` tuple). Triples
    ``(topic, next_topic, attribution)`` and "type1" pairs
    ``(next_topic, attribution)`` whose attribution is 'Unknown' are dropped.
    Both lists are turned into relative-frequency vectors over their
    vocabularies.

    Returns
    -------
    tuple
        ``(Topics_Attribution, Topics_Attribution_embedding,
        Topics_Attribution_type1, Topics_Attribution_embedding_type1)``.
    """
    def histogram(observed, vocabulary):
        # Relative frequency of each vocabulary entry among `observed`.
        # Items missing from the vocabulary are skipped: the previous
        # fallback indexed with [None], which adds to every slot on
        # NumPy < 1.23 and raises IndexError on NumPy >= 1.23.
        # NOTE(review): confirm the downstream model does not rely on the
        # old-NumPy "add to every slot" behaviour.
        embedding = np.zeros(len(vocabulary))
        for item in observed:
            slots = [idx for idx, val in enumerate(vocabulary) if val == item]
            if slots:
                embedding[slots] += 1 / len(observed)
        return embedding

    attribute_labels = [entry[1] for entry in attr_labels]
    uni_topic_sentences = [topic_labels[topic] for topic in topics_for_sentences]

    Topics_Bigram = list(zip(uni_topic_sentences, uni_topic_sentences[1:]))
    if uni_topic_sentences:  # an empty document has no last topic to close with 'End'
        Topics_Bigram.append((uni_topic_sentences[-1], 'End'))

    triples = [bigram + (label,) for bigram, label in zip(Topics_Bigram, attribute_labels)]
    pairs = [(triple[1], triple[2]) for triple in triples]
    Topics_Attribution = [triple for triple in triples if triple[2] != 'Unknown']
    Topics_Attribution_type1 = [pair for pair in pairs if pair[1] != 'Unknown']

    Topics_Attribution_embedding = histogram(Topics_Attribution, Topic_Attribution_Labels)
    Topics_Attribution_embedding_type1 = histogram(Topics_Attribution_type1, Topic_Attribution_Labels_type1)

    return (Topics_Attribution, Topics_Attribution_embedding, Topics_Attribution_type1, Topics_Attribution_embedding_type1)

In [33]:
# Compute x_entailment feature
def x_entailment_feature(attr_labels, entailment_feature_label, Attr_Entailment_labels, topic_labels, Topic_Entailment_labels, topics_for_sentences, zero_topics_for_sentences, zero_topic_labels, Zero_Topic_Entailment_labels):
    """Cross entailment labels with attribution and topic transitions.

    Three families of triples are built by pairing bigrams position-wise
    with ``entailment_feature_label``:

    * attribution bigrams (consecutive attribution labels, element 1 of each
      ``attr_labels`` tuple); triples with 'Unknown' in either attribution
      position are dropped;
    * topic bigrams from ``topics_for_sentences`` resolved through
      ``topic_labels``, closed by ``(last_topic, 'End')``; triples whose
      entailment is 'NEUTRAL' are dropped;
    * the same from ``zero_topics_for_sentences``, used as-is.

    Each list is turned into a relative-frequency vector over its vocabulary.

    Parameters
    ----------
    zero_topic_labels
        Not read; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(Attribution_Entailment, Attr_Entailment_embedding,
        Topics_Entailment, Topics_Entailment_embedding,
        Zero_Topics_Entailment_embedding)``.
    """
    def histogram(observed, vocabulary):
        # Relative frequency of each vocabulary entry among `observed`.
        # Items missing from the vocabulary are skipped: the previous
        # fallback indexed with [None], which adds to every slot on
        # NumPy < 1.23 and raises IndexError on NumPy >= 1.23.
        # NOTE(review): confirm the downstream model does not rely on the
        # old-NumPy "add to every slot" behaviour.
        embedding = np.zeros(len(vocabulary))
        for item in observed:
            slots = [idx for idx, val in enumerate(vocabulary) if val == item]
            if slots:
                embedding[slots] += 1 / len(observed)
        return embedding

    def topic_entailment(topic_sequence):
        # (topic, next_topic, entailment) triples with NEUTRAL ones removed.
        bigrams = list(zip(topic_sequence, topic_sequence[1:]))
        if len(topic_sequence) > 0:  # an empty sequence has no last topic
            bigrams.append((topic_sequence[-1], 'End'))
        triples = [bigram + (label,) for bigram, label in zip(bigrams, entailment_feature_label)]
        return [triple for triple in triples if triple[2] != 'NEUTRAL']

    attribute_labels = [entry[1] for entry in attr_labels]
    attribute_bigram = list(zip(attribute_labels, attribute_labels[1:]))
    Attribution_Entailment = [bigram + (label,) for bigram, label in zip(attribute_bigram, entailment_feature_label)]
    Attribution_Entailment = [triple for triple in Attribution_Entailment if (triple[0] != 'Unknown' and triple[1] != 'Unknown')]
    Attr_Entailment_embedding = histogram(Attribution_Entailment, Attr_Entailment_labels)

    uni_topic_sentences = [topic_labels[topic] for topic in topics_for_sentences]
    Topics_Entailment = topic_entailment(uni_topic_sentences)
    Topics_Entailment_embedding = histogram(Topics_Entailment, Topic_Entailment_labels)

    Zero_Topics_Entailment = topic_entailment(zero_topics_for_sentences)
    Zero_Topics_Entailment_embedding = histogram(Zero_Topics_Entailment, Zero_Topic_Entailment_labels)

    return(Attribution_Entailment, Attr_Entailment_embedding, Topics_Entailment, Topics_Entailment_embedding, Zero_Topics_Entailment_embedding)

In [34]:
# Compute x_entailment feature without zero-shot learning topics
def x_entailment_feature_wo_zeroshot(attr_labels, entailment_feature_label, Attr_Entailment_labels, topic_labels, Topic_Entailment_labels, topics_for_sentences):
    """Cross entailment labels with attribution and topic transitions.

    Variant without the zero-shot topic sequence. Two families of triples
    are built by pairing bigrams position-wise with
    ``entailment_feature_label``:

    * attribution bigrams (consecutive attribution labels, element 1 of each
      ``attr_labels`` tuple); triples with 'Unknown' in either attribution
      position are dropped;
    * topic bigrams from ``topics_for_sentences`` resolved through
      ``topic_labels``, closed by ``(last_topic, 'End')``; triples whose
      entailment is 'NEUTRAL' are dropped.

    Each list is turned into a relative-frequency vector over its vocabulary.

    Returns
    -------
    tuple
        ``(Attribution_Entailment, Attr_Entailment_embedding,
        Topics_Entailment, Topics_Entailment_embedding)``.
    """
    def histogram(observed, vocabulary):
        # Relative frequency of each vocabulary entry among `observed`.
        # Items missing from the vocabulary are skipped: the previous
        # fallback indexed with [None], which adds to every slot on
        # NumPy < 1.23 and raises IndexError on NumPy >= 1.23.
        # NOTE(review): confirm the downstream model does not rely on the
        # old-NumPy "add to every slot" behaviour.
        embedding = np.zeros(len(vocabulary))
        for item in observed:
            slots = [idx for idx, val in enumerate(vocabulary) if val == item]
            if slots:
                embedding[slots] += 1 / len(observed)
        return embedding

    attribute_labels = [entry[1] for entry in attr_labels]
    attribute_bigram = list(zip(attribute_labels, attribute_labels[1:]))
    Attribution_Entailment = [bigram + (label,) for bigram, label in zip(attribute_bigram, entailment_feature_label)]
    Attribution_Entailment = [triple for triple in Attribution_Entailment if (triple[0] != 'Unknown' and triple[1] != 'Unknown')]
    Attr_Entailment_embedding = histogram(Attribution_Entailment, Attr_Entailment_labels)

    uni_topic_sentences = [topic_labels[topic] for topic in topics_for_sentences]
    Topics_Bigram = list(zip(uni_topic_sentences, uni_topic_sentences[1:]))
    if uni_topic_sentences:  # an empty document has no last topic to close with 'End'
        Topics_Bigram.append((uni_topic_sentences[-1], 'End'))
    Topics_Entailment = [bigram + (label,) for bigram, label in zip(Topics_Bigram, entailment_feature_label)]
    Topics_Entailment = [triple for triple in Topics_Entailment if triple[2] != 'NEUTRAL']
    Topics_Entailment_embedding = histogram(Topics_Entailment, Topic_Entailment_labels)

    return(Attribution_Entailment, Attr_Entailment_embedding, Topics_Entailment, Topics_Entailment_embedding)

In [35]:
# Compute hypernym and hyponym
def Hypernym_Hyponym(chunk_sentences):
    """Classify words as General/Specific/Between and summarise per chunk.

    Every chunk is tokenized and each token is disambiguated with ``lesk``
    against the chunk's own tokens. Tokens without a synset are ignored.
    For the rest:

    * "General"  - the synset's first hypernym path has length 1;
    * "Specific" - otherwise, and the synset has no hyponyms;
    * "Between"  - otherwise.

    Parameters
    ----------
    chunk_sentences : sequence of str
        Text chunks. Only the first three contribute to the data frame;
        missing chunks are filled with zeros.

    Returns
    -------
    tuple
        ``(Word_Category, df_Hypernym_Hyponym)``: the per-chunk category
        lists, and a one-row data frame with columns B1, G1, S1, B2, G2, S2,
        B3, G3, S3 holding the Between/General/Specific proportions of
        chunks 1-3.
    """
    wordnet_labels = ['Between', 'General', 'Specific']

    Word_Category = []
    for chunk in chunk_sentences:
        tokens = word_tokenize(chunk)
        categories = []
        for token in tokens:
            synset = lesk(tokens, token)
            if not synset:
                continue
            depth = len(synset.hypernym_paths()[0])
            hyponym_count = len(synset.hyponyms())
            if depth == 1:
                categories.append("General")
            elif hyponym_count == 0:
                categories.append("Specific")
            else:
                categories.append("Between")
        Word_Category.append(categories)

    # Emit proportions in the fixed Between/General/Specific order so each
    # value lands under the column named after it. Previously a missing
    # category was appended after the sorted ones, so e.g. a chunk with only
    # "Specific" words reported its proportion in the B column.
    # NOTE(review): if the downstream classifier was trained on features
    # produced with the old ordering, re-validate it against this fix.
    Word_Category_Counter = []
    for categories in Word_Category:
        counts = Counter(categories)
        total = sum(counts.values())
        Word_Category_Counter.append(
            [(label, counts[label] / total if total else 0) for label in wordnet_labels]
        )

    # Pad to three chunks. This also covers zero chunks, which used to raise
    # IndexError.
    while len(Word_Category_Counter) < 3:
        Word_Category_Counter.append([(label, 0) for label in wordnet_labels])

    proportions = [value for row in Word_Category_Counter[:3] for _, value in row]
    df_Hypernym_Hyponym = pd.DataFrame(proportions).T
    df_Hypernym_Hyponym.columns = ['B1', 'G1', 'S1', 'B2', 'G2', 'S2', 'B3', 'G3', 'S3']

    return(Word_Category, df_Hypernym_Hyponym)

In [36]:
# Entailment thread
def entailment_thread(req_id, sentences,sentence_pairs):
    """Compute entailment features and publish a progress message.

    Calls ``entailment_features(sentence_pairs, sentences)``, suffixes the
    columns of the third result with "_norm", then records a status message
    in ``stored_requests[req_id]`` and pickles that entry to
    ``tmp/<req_id>.pickle`` under the file lock.

    ``file_locker`` and ``stored_requests`` are module-level objects defined
    elsewhere in the notebook.

    NOTE(review): the visible call site in ``high_level_features`` submits
    ``(req_id, sentence_pairs, sentences)``, the opposite order to this
    signature. Confirm the intended order against ``entailment_features``.

    Returns
    -------
    tuple
        ``(entailment_feature_label, df_entailment_type1, df_entailment_type2)``.
    """
    entailment_feature_label, df_entailment_type1, df_entailment_type2 = entailment_features(sentence_pairs, sentences)
    df_entailment_type2.columns = [elem + "_norm" for elem in df_entailment_type2.columns]

    # Send message
    request_file = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(request_file)
    try:
        stored_requests[req_id]["message"] = "Wohoo! Entailment is done"
        with open(request_file, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the write fails, so a failure here does
        # not leave the request file locked.
        file_locker.delete_file_lock(request_file)

    return (entailment_feature_label, df_entailment_type1, df_entailment_type2)

In [37]:
# Personality thread
def personality_thread(req_id, data, vec_for_personality, model_for_personality, sentences):
    """Compute personality and sentiment features and publish a progress message.

    Calls ``personality_feature`` on the document and ``sentiment_feature``
    on the sentences, suffixes the columns of the second sentiment frame with
    "_norm", then records a status message in ``stored_requests[req_id]`` and
    pickles that entry to ``tmp/<req_id>.pickle`` under the file lock.

    ``file_locker`` and ``stored_requests`` are module-level objects defined
    elsewhere in the notebook.

    Returns
    -------
    tuple
        ``(df_personality, df_sentiment_type1, df_sentiment_type2)``.
    """
    df_personality = personality_feature(data, vec_for_personality, model_for_personality)
    df_sentiment_type1, df_sentiment_type2 = sentiment_feature(sentences)
    df_sentiment_type2.columns = [elem + "_norm" for elem in df_sentiment_type2.columns]

    # Send message
    request_file = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(request_file)
    try:
        stored_requests[req_id]["message"] = "Wohoo! We figured out the sentiments and personality"
        with open(request_file, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the write fails, so a failure here does
        # not leave the request file locked.
        file_locker.delete_file_lock(request_file)

    return (df_personality, df_sentiment_type1, df_sentiment_type2)


In [38]:
# Topic Features thread
def topic_features_thread(req_id, data, sentences, dberta_model, model_for_topic):
    """Compute document-topic features and publish a progress message.

    Calls ``topic_feature`` and wraps its raw and normalised embeddings in
    one-row data frames labelled with the 24 topic names (the normalised
    frame uses the same names suffixed with "_Norm"). It then records a
    status message in ``stored_requests[req_id]`` and pickles that entry to
    ``tmp/<req_id>.pickle`` under the file lock.

    ``file_locker`` and ``stored_requests`` are module-level objects defined
    elsewhere in the notebook.

    Returns
    -------
    tuple
        ``(df_raw_doctopic, df_normalised_doctopic, topics_for_sentences)``.
    """
    topic_columns = ['CRIME', 'DISSENT', 'DOMESTIC ECONOMIC', 'DOMESTIC POLITICAL', 'ENERGY', 'ENVIRONMENT', 'HEALTH', 'HUMAN RIGHTS', 'INTERNATIONAL ECONOMIC', 'INTERNATIONAL POLITICAL', 'LEADER', 'MEDIA', 'MIGRATION', 'MILITARY', 'NARCOTICS', 'OTHER', 'PEACEKEEPING', 'PROLIFERATION', 'TECHNOLOGY', 'TELECOM', 'TERRORISM', 'TERRORISM.911USA', 'TERRORISM.911WEB', 'URGENT']

    raw_doctopic_embedding, normalised_doctopic_embedding, topics_for_sentences = topic_feature(data, sentences, dberta_model, model_for_topic)
    df_raw_doctopic = pd.DataFrame(raw_doctopic_embedding).T
    df_normalised_doctopic = pd.DataFrame(normalised_doctopic_embedding).T
    df_raw_doctopic.columns = topic_columns
    # Derived from the base names so the two lists cannot drift apart.
    df_normalised_doctopic.columns = [name + "_Norm" for name in topic_columns]

    # Send message
    request_file = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(request_file)
    try:
        stored_requests[req_id]["message"] = "Pheww!! Getting out the topics was time consuming"
        with open(request_file, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the write fails, so a failure here does
        # not leave the request file locked.
        file_locker.delete_file_lock(request_file)

    return (df_raw_doctopic, df_normalised_doctopic, topics_for_sentences)

In [39]:
# Zero-shot learning topics thread
def zeroshot_topic_thread(req_id, data):
    """Compute zero-shot topic features and publish a progress message.

    Calls ``zero_shot_learning_topic(data)``, then records a status message
    in ``stored_requests[req_id]`` and pickles that entry to
    ``tmp/<req_id>.pickle`` under the file lock.

    ``file_locker`` and ``stored_requests`` are module-level objects defined
    elsewhere in the notebook.

    Returns
    -------
    tuple
        ``(df_doc_zero_topics, zero_topics_for_sentences)``.
    """
    df_doc_zero_topics, zero_topics_for_sentences = zero_shot_learning_topic(data)

    # Send message
    request_file = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(request_file)
    try:
        stored_requests[req_id]["message"] = "Oh my zero shot learning yeilded results at 0 km/h"
        with open(request_file, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the write fails, so a failure here does
        # not leave the request file locked.
        file_locker.delete_file_lock(request_file)

    return (df_doc_zero_topics, zero_topics_for_sentences)

In [40]:
# Custom Embedding thread
def custom_emb_thread(req_id, sentences, models_for_adj, glove_embeddings):
    """Compute custom adjective/verb/noun embeddings and publish a progress message.

    Each embedding is wrapped in a one-row data frame whose columns are
    numbered from 1 with the prefixes "adj_category", "verb_category" and
    "noun_category". The function then records a status message in
    ``stored_requests[req_id]`` and pickles that entry to
    ``tmp/<req_id>.pickle`` under the file lock.

    ``file_locker`` and ``stored_requests`` are module-level objects defined
    elsewhere in the notebook.

    Returns
    -------
    tuple
        ``(df_adj_embeddings, df_verb_embeddings, df_noun_embeddings)``.
    """
    adj_embeddings = custom_adjective_embeddings(sentences, models_for_adj, glove_embeddings)
    df_adj_embeddings = pd.DataFrame(adj_embeddings).T
    df_adj_embeddings.columns = ["adj_category" + str(i+1) for i in range(len(adj_embeddings))]

    verb_embeddings = custom_verb_embeddings(sentences)
    df_verb_embeddings = pd.DataFrame(verb_embeddings).T
    df_verb_embeddings.columns = ["verb_category" + str(i+1) for i in range(len(verb_embeddings))]

    noun_embeddings = custom_noun_embeddings(sentences)
    df_noun_embeddings = pd.DataFrame(noun_embeddings).T
    df_noun_embeddings.columns = ["noun_category" + str(i+1) for i in range(len(noun_embeddings))]

    # Send message
    request_file = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(request_file)
    try:
        stored_requests[req_id]["message"] = "Yaya! some crafty custom embeddings are here!"
        with open(request_file, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the write fails, so a failure here does
        # not leave the request file locked.
        file_locker.delete_file_lock(request_file)

    return (df_adj_embeddings, df_verb_embeddings, df_noun_embeddings)

In [41]:
# Markov thread
def markov_thread(req_id, sentences, trained_markov_real_structure, trained_markov_fake_structure):
    """Compute the Markov-structure prediction and publish a progress message.

    Wraps the result of ``markov_structure`` in a one-row data frame with a
    single ``markov_pred`` column, then records a status message in
    ``stored_requests[req_id]`` and pickles that entry to
    ``tmp/<req_id>.pickle`` under the file lock.

    ``file_locker`` and ``stored_requests`` are module-level objects defined
    elsewhere in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row, one column (``markov_pred``).
    """
    markov_prediction = markov_structure(sentences, trained_markov_real_structure, trained_markov_fake_structure)
    df_markov_prediction = pd.DataFrame([markov_prediction], columns=['markov_pred'])

    # Send message
    request_file = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(request_file)
    try:
        stored_requests[req_id]["message"] = "The Markov Mystery is here as well!"
        with open(request_file, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the write fails, so a failure here does
        # not leave the request file locked.
        file_locker.delete_file_lock(request_file)

    return df_markov_prediction

In [42]:
# Attribution thread
def attribution_thread(req_id, trained_dberta_embeddings, sentences, attr_model, trained_data, doc2vec_model, mapping_sent_doc):
    """Compute attribution features and publish a progress message.

    Runs two attribution pipelines: ``attribution_dberta_labels`` /
    ``attribution_dberta_embeddings`` and ``attribution_doc2vec_labels`` /
    ``attribution_doc2vec_embeddings``. Each resulting vector is wrapped in
    a one-row data frame with named columns. The function then records a
    status message in ``stored_requests[req_id]`` and pickles that entry to
    ``tmp/<req_id>.pickle`` under the file lock.

    ``file_locker`` and ``stored_requests`` are module-level objects defined
    elsewhere in the notebook.

    Returns
    -------
    tuple
        ``(attr_labels, df_attr_embedding, df_attr_embedding_weighted,
        df_attr_embedding_doc2vec, df_attr_embedding_weighted_doc2vec)``,
        where ``attr_labels`` is the label list from the first pipeline.
    """
    # The first returned value (distances to the trained data) is not used here.
    attr_dist_from_trained_data, attr_labels, attr_labels_weighted = attribution_dberta_labels(trained_dberta_embeddings, sentences, attr_model, trained_data)
    attr_embedding, attr_embedding_weighted = attribution_dberta_embeddings(attr_labels, attr_labels_weighted)

    df_attr_embedding = pd.DataFrame(attr_embedding).T
    df_attr_embedding_weighted = pd.DataFrame(attr_embedding_weighted).T

    df_attr_embedding.columns = ['Fake', 'Real', 'Unknown_1']
    df_attr_embedding_weighted.columns = ['Fake_weight', 'Real_weight']

    attr_labels_doc2vec, attr_labels_weighted_doc2vec = attribution_doc2vec_labels(sentences, mapping_sent_doc, doc2vec_model, trained_data)
    attr_embedding_doc2vec, attr_embedding_weighted_doc2vec = attribution_doc2vec_embeddings(attr_labels_doc2vec, attr_labels_weighted_doc2vec)

    df_attr_embedding_doc2vec = pd.DataFrame(attr_embedding_doc2vec).T
    df_attr_embedding_weighted_doc2vec = pd.DataFrame(attr_embedding_weighted_doc2vec).T

    df_attr_embedding_doc2vec.columns = ['Fake_doc2vec', 'Real_doc2vec', 'Unknown_doc2vec']
    df_attr_embedding_weighted_doc2vec.columns = ['Fake_doc2vec_wt', 'Real_doc2vec_wt']

    # Send message
    request_file = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(request_file)
    try:
        stored_requests[req_id]["message"] = "Well your unmasking is attributed to attribution"
        with open(request_file, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the write fails, so a failure here does
        # not leave the request file locked.
        file_locker.delete_file_lock(request_file)

    return (attr_labels, df_attr_embedding, df_attr_embedding_weighted, df_attr_embedding_doc2vec, df_attr_embedding_weighted_doc2vec)

In [43]:
# High-level features extraction
def high_level_features(req_id, text, vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, seq_of_col, Entailment_Classifier, Sentiment_Classifier, attr_model, zero_classifier):
    """Assemble the single-row high-level feature frame (including zero-shot topics).

    The article is split into sentences, seven feature extractors are run in a
    thread pool, their outputs are crossed (topic x attribution, topic x entailment,
    attribution x entailment), and everything is concatenated column-wise into one
    DataFrame that is finally restricted/reordered to ``seq_of_col``.

    Progress messages are written to ``stored_requests[req_id]["message"]`` and
    pickled to ``tmp/<req_id>.pickle`` under ``file_locker`` at the start and end.

    Parameters
    ----------
    req_id : key into the module-level ``stored_requests`` mapping.
    text : article text; passed to ``prepare_sentences_pipeline2`` and to the
        personality, topic and zero-shot worker functions.
    vec_for_personality, model_for_personality : forwarded to ``personality_thread``.
    dberta_model, model_for_topic : forwarded to ``topic_features_thread``.
    models_for_adj, glove_embeddings : forwarded to ``custom_emb_thread``.
    trained_markov_real_structure, trained_markov_fake_structure : forwarded to
        ``markov_thread``.
    trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model,
    attr_model : forwarded to ``attribution_thread``.
    seq_of_col : column labels selected (in this order) from the final frame.
    Entailment_Classifier, Sentiment_Classifier, zero_classifier : accepted but not
        referenced anywhere in this function body.

    Returns
    -------
    tuple
        ``(df_high_level, attr_labels, sentences)``.
    """
    global file_locker

    # Send message
    file_locker.acquire_file_lock('tmp/' + str(req_id) + ".pickle")
    stored_requests[req_id]["message"] = "Keep Calm and let the magic of 2nd pipeline take over"
    with open('tmp/' + str(req_id) + ".pickle", "wb") as handle:
        pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    file_locker.delete_file_lock('tmp/' + str(req_id) + ".pickle")
    
    # Data Preparation
    data = text
    sentences,sentence_pairs = prepare_sentences_pipeline2(data)
    chunks = chunk_sentences(sentences)
     
    # Seven independent extractors are submitted; leaving the `with` block waits
    # for all of them, so every .result() call below returns without blocking.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        ent_thread_future = executor.submit(entailment_thread, req_id, sentence_pairs, sentences)
        per_thread_future = executor.submit(personality_thread, req_id, data, vec_for_personality, model_for_personality, sentences)
        top_thread_future = executor.submit(topic_features_thread, req_id, data, sentences, dberta_model, model_for_topic)
        zer_thread_future = executor.submit(zeroshot_topic_thread, req_id, data)
        emb_thread_future = executor.submit(custom_emb_thread, req_id, sentences, models_for_adj, glove_embeddings)
        mar_thread_future = executor.submit(markov_thread, req_id, sentences, trained_markov_real_structure, trained_markov_fake_structure)
        att_thread_future = executor.submit(attribution_thread, req_id, trained_dberta_embeddings, sentences, attr_model, trained_data, doc2vec_model, mapping_sent_doc)
        
    # .result() re-raises any exception that occurred inside the worker.
    # Entailments
    entailment_feature_label, df_entailment_type1, df_entailment_type2 = ent_thread_future.result()
    
    # Personality and Sentiment
    df_personality, df_sentiment_type1, df_sentiment_type2 = per_thread_future.result()
    
    # Topics
    df_raw_doctopic, df_normalised_doctopic, topics_for_sentences = top_thread_future.result()
    
    # Zeroshot Topics
    df_doc_zero_topics, zero_topics_for_sentences = zer_thread_future.result()
   
    # Custom Embeddings
    df_adj_embeddings, df_verb_embeddings, df_noun_embeddings = emb_thread_future.result()
    
    # Markov Pred
    df_markov_prediction = mar_thread_future.result()
    
    # Attribution
    attr_labels, df_attr_embedding, df_attr_embedding_weighted, df_attr_embedding_doc2vec, df_attr_embedding_weighted_doc2vec = att_thread_future.result()
    
    # 24 topic names plus a trailing 'End' marker (25 entries in total).
    topic_labels = ['CRIME', 'DISSENT', 'DOMESTIC ECONOMIC', 'DOMESTIC POLITICAL', 'ENERGY', 'ENVIRONMENT', 'HEALTH', 'HUMAN RIGHTS', 'INTERNATIONAL ECONOMIC', 'INTERNATIONAL POLITICAL', 'LEADER', 'MEDIA', 'MIGRATION', 'MILITARY', 'NARCOTICS', 'OTHER', 'PEACEKEEPING', 'PROLIFERATION', 'TECHNOLOGY', 'TELECOM', 'TERRORISM', 'TERRORISM.911USA', 'TERRORISM.911WEB', 'URGENT', 'End']
    
    # 10 zero-shot topic names plus a trailing 'End' marker (11 entries in total).
    zero_topic_labels = ['business', 'economy', 'education', 'entertainment', 'environment', 'government', 'health', 'politics', 'sports', 'war', 'End']

    Attribution_Labels = ['Real', 'Fake']
    # NOTE(review): Entailment_Labels is assigned but never read in this function.
    Entailment_Labels = ['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL']

    # permutations() yields pairs in input order, so the last 24 pairs are the
    # ('End', x) ones; [:-24] removes them. The self-pairs (x, x) are then added
    # for every label except 'End' (dropped by [:-1]).
    Topic_Bigram = list(itertools.permutations(topic_labels, 2))
    Topic_Bigram = Topic_Bigram[:-24] + [(elem,) + (elem,) for elem in topic_labels][:-1]
    # Each label list is sorted; the sorted order becomes the column order of the
    # matching embedding frame below.
    # NOTE(review): this presumes the x_*_feature helpers emit values in the same
    # sorted order - confirm against their definitions.
    Topic_Attribution_Labels = [elem + ('Real',) for elem in Topic_Bigram] + [elem + ('Fake',) for elem in Topic_Bigram]
    Topic_Attribution_Labels.sort()
    # Only CONTRADICTION and ENTAILMENT are crossed with topic bigrams (no NEUTRAL).
    Topic_Entailment_labels = [elem + ('CONTRADICTION',) for elem in Topic_Bigram] + [elem + ('ENTAILMENT',) for elem in Topic_Bigram]
    Topic_Entailment_labels.sort()
    Topic_Attribution_Labels_type1 = [(elem,) + ('Real',) for elem in topic_labels] + [(elem,) + ('Fake',) for elem in topic_labels]
    Topic_Attribution_Labels_type1.sort()
    
    # Same construction for the zero-shot labels: the last 10 permutations are the
    # ('End', x) pairs.
    Zero_Topic_Bigram = list(itertools.permutations(zero_topic_labels, 2))
    Zero_Topic_Bigram = Zero_Topic_Bigram[:-10] + [(elem,) + (elem,) for elem in  zero_topic_labels][:-1]
    Zero_Topic_Attribution_Labels = [elem + ('Real',) for elem in Zero_Topic_Bigram] + [elem + ('Fake',) for elem in Zero_Topic_Bigram]
    Zero_Topic_Attribution_Labels.sort()
    Zero_Topic_Entailment_labels = [elem + ('CONTRADICTION',) for elem in Zero_Topic_Bigram] + [elem + ('ENTAILMENT',) for elem in Zero_Topic_Bigram]
    Zero_Topic_Entailment_labels.sort()
    Zero_Topic_Attribution_Labels_type1 = [(elem,) + ('Real',) for elem in zero_topic_labels] + [(elem,) + ('Fake',) for elem in zero_topic_labels]
    Zero_Topic_Attribution_Labels_type1.sort()
    
    # All four ordered (attribution, attribution) pairs, each crossed with the
    # three entailment classes.
    Attr_bigrams = list(itertools.permutations(Attribution_Labels, 2)) +[(elem,) + (elem,) for elem in Attribution_Labels]
    Attr_Entailment_labels = [elem + ('CONTRADICTION',) for elem in Attr_bigrams] + [elem + ('ENTAILMENT',) for elem in Attr_bigrams] + [elem + ('NEUTRAL',) for elem in Attr_bigrams]
    Attr_Entailment_labels.sort()
    
    # Only the *_embedding results are used below; Topics_Attribution and
    # Topics_Attribution_type1 are unpacked but not read again.
    Topics_Attribution, Topics_Attribution_embedding, Topics_Attribution_type1, Topics_Attribution_embedding_type1, Zero_Topics_Attribution_embedding, Zero_Topics_Attribution_embedding_type1 = x_attribute_feature(
            attr_labels, topic_labels, Topic_Attribution_Labels, Topic_Attribution_Labels_type1, topics_for_sentences, zero_topics_for_sentences, zero_topic_labels, Zero_Topic_Attribution_Labels, Zero_Topic_Attribution_Labels_type1)
    
    # Likewise, Attribution_Entailment and Topics_Entailment are not read again.
    Attribution_Entailment, Attr_Entailment_embedding, Topics_Entailment, Topics_Entailment_embedding, Zero_Topics_Entailment_embedding = x_entailment_feature(
            attr_labels, entailment_feature_label, Attr_Entailment_labels, topic_labels, Topic_Entailment_labels, topics_for_sentences, zero_topics_for_sentences, zero_topic_labels, Zero_Topic_Entailment_labels)
    
    # Each embedding is transposed into a frame and labelled with its tuple labels.
    df_Topics_Attribution_embedding = pd.DataFrame(Topics_Attribution_embedding).T
    df_Topics_Attribution_embedding.columns = Topic_Attribution_Labels
    df_Zero_Topics_Attribution_embedding = pd.DataFrame(Zero_Topics_Attribution_embedding).T
    df_Zero_Topics_Attribution_embedding.columns = Zero_Topic_Attribution_Labels
    
    df_Topics_Attribution_embedding_type1 = pd.DataFrame(Topics_Attribution_embedding_type1).T
    df_Topics_Attribution_embedding_type1.columns = Topic_Attribution_Labels_type1
    df_Zero_Topics_Attribution_embedding_type1 = pd.DataFrame(Zero_Topics_Attribution_embedding_type1).T
    df_Zero_Topics_Attribution_embedding_type1.columns = Zero_Topic_Attribution_Labels_type1
    
    df_Attr_Entailment_embedding = pd.DataFrame(Attr_Entailment_embedding).T
    df_Attr_Entailment_embedding.columns = Attr_Entailment_labels
    
    df_Topics_Entailment_embedding = pd.DataFrame(Topics_Entailment_embedding).T
    df_Topics_Entailment_embedding.columns = Topic_Entailment_labels
    df_Zero_Topics_Entailment_embedding = pd.DataFrame(Zero_Topics_Entailment_embedding).T
    df_Zero_Topics_Entailment_embedding.columns = Zero_Topic_Entailment_labels
    
    # Only the second element of Hypernym_Hyponym's return value is used.
    df_Hypernym_Hyponym = Hypernym_Hyponym(chunks)[1]
    
    # Column-wise concatenation; the order matters for the duplicate removal below,
    # which keeps the first occurrence of a repeated column name.
    df_high_level = pd.concat([df_entailment_type1, df_entailment_type2, df_personality, df_sentiment_type1, df_sentiment_type2, df_raw_doctopic, df_normalised_doctopic, df_adj_embeddings, df_verb_embeddings, df_noun_embeddings, df_markov_prediction, df_attr_embedding, df_attr_embedding_weighted, df_attr_embedding_doc2vec, df_attr_embedding_weighted_doc2vec, df_Topics_Attribution_embedding, df_Topics_Attribution_embedding_type1, df_Attr_Entailment_embedding, df_Topics_Entailment_embedding, df_Hypernym_Hyponym, df_doc_zero_topics, df_Zero_Topics_Attribution_embedding, df_Zero_Topics_Attribution_embedding_type1, df_Zero_Topics_Entailment_embedding], axis=1)
    
    # Flatten tuple column labels into '_'-joined strings (3-tuples and 2-tuples);
    # non-tuple labels are kept as they are.
    col_names_final = []
    for x in df_high_level.columns:
        if type(x) == tuple:
            if len(x) == 3:
                col_names_final.append('{}_{}_{}'.format(x[0], x[1], x[2]))
            else :
                col_names_final.append('{}_{}'.format(x[0], x[1]))
        else:
            col_names_final.append(x)
            
    df_high_level.columns = col_names_final
    
    # Replace '.' with '_' in names (e.g. labels built from 'TERRORISM.911USA').
    # NOTE(review): the `"." in col` test assumes every column label is a string by now.
    col_names_final = [col.replace(".", "_") if "." in col else col for col in df_high_level.columns]
    df_high_level.columns = col_names_final
            
    # Keep only the first column for each repeated name, drop the two 'Fake'
    # weight columns, then select the columns listed in seq_of_col, in that order.
    df_high_level = df_high_level.loc[:, ~df_high_level.columns.duplicated()]
    df_high_level = df_high_level.drop(['Fake_weight', 'Fake_doc2vec_wt'], axis=1)
    df_high_level = df_high_level[seq_of_col]
    
    # Send message
    file_locker.acquire_file_lock('tmp/' + str(req_id) + ".pickle")
    stored_requests[req_id]["message"] = "Topic Attribution done..."
    with open('tmp/' + str(req_id) + ".pickle", "wb") as handle:
        pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    file_locker.delete_file_lock('tmp/' + str(req_id) + ".pickle")
    
    return(df_high_level, attr_labels,sentences)

In [44]:
# Compute high-level features without zero-shot learning
def high_level_features_wo_zeroshot(req_id, text, vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, seq_of_col, Entailment_Classifier, Sentiment_Classifier, attr_model):
    """Assemble the single-row high-level feature frame without zero-shot topic features.

    Same flow as ``high_level_features`` except that no zero-shot topic task is
    submitted and the ``*_wo_zeroshot`` cross-feature helpers are used: the article
    is split into sentences, six extractors run in a thread pool, their outputs are
    crossed and concatenated column-wise, and the result is restricted/reordered to
    ``seq_of_col``.

    Progress messages are written to ``stored_requests[req_id]["message"]`` and
    pickled to ``tmp/<req_id>.pickle`` under ``file_locker`` at the start and end.

    Parameters
    ----------
    req_id : key into the module-level ``stored_requests`` mapping.
    text : article text; passed to ``prepare_sentences_pipeline2`` and to the
        personality and topic worker functions.
    vec_for_personality, model_for_personality : forwarded to ``personality_thread``.
    dberta_model, model_for_topic : forwarded to ``topic_features_thread``.
    models_for_adj, glove_embeddings : forwarded to ``custom_emb_thread``.
    trained_markov_real_structure, trained_markov_fake_structure : forwarded to
        ``markov_thread``.
    trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model,
    attr_model : forwarded to ``attribution_thread``.
    seq_of_col : column labels selected (in this order) from the final frame.
    Entailment_Classifier, Sentiment_Classifier : accepted but not referenced
        anywhere in this function body.

    Returns
    -------
    tuple
        ``(df_high_level, attr_labels, sentences)``.
    """
    global file_locker

    # Send message
    file_locker.acquire_file_lock('tmp/' + str(req_id) + ".pickle")
    stored_requests[req_id]["message"] = "Keep Calm and let the magic of 2nd pipeline take over"
    with open('tmp/' + str(req_id) + ".pickle", "wb") as handle:
        pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    file_locker.delete_file_lock('tmp/' + str(req_id) + ".pickle")
    
    data = text
    sentences, sentence_pairs = prepare_sentences_pipeline2(data)
    chunks = chunk_sentences(sentences)
    
    # Six independent extractors are submitted; leaving the `with` block waits for
    # all of them, so every .result() call below returns without blocking.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        ent_thread_future = executor.submit(entailment_thread, req_id, sentence_pairs, sentences)
        per_thread_future = executor.submit(personality_thread, req_id, data, vec_for_personality, model_for_personality, sentences)
        top_thread_future = executor.submit(topic_features_thread, req_id, data, sentences, dberta_model, model_for_topic)
        emb_thread_future = executor.submit(custom_emb_thread, req_id, sentences, models_for_adj, glove_embeddings)
        mar_thread_future = executor.submit(markov_thread, req_id, sentences, trained_markov_real_structure, trained_markov_fake_structure)
        att_thread_future = executor.submit(attribution_thread, req_id, trained_dberta_embeddings, sentences, attr_model, trained_data, doc2vec_model, mapping_sent_doc)
        
    # .result() re-raises any exception that occurred inside the worker.
    # Entailments
    entailment_feature_label, df_entailment_type1, df_entailment_type2 = ent_thread_future.result()
    
    # Personality and Sentiment
    df_personality, df_sentiment_type1, df_sentiment_type2 = per_thread_future.result()
    
    # Topics
    df_raw_doctopic, df_normalised_doctopic, topics_for_sentences = top_thread_future.result()
   
    # Custom Embeddings
    df_adj_embeddings, df_verb_embeddings, df_noun_embeddings = emb_thread_future.result()
    
    # Markov Pred
    df_markov_prediction = mar_thread_future.result()
    
    # Attribution
    attr_labels, df_attr_embedding, df_attr_embedding_weighted, df_attr_embedding_doc2vec, df_attr_embedding_weighted_doc2vec = att_thread_future.result()
    
    # 24 topic names plus a trailing 'End' marker (25 entries in total).
    topic_labels = ['CRIME', 'DISSENT', 'DOMESTIC ECONOMIC', 'DOMESTIC POLITICAL', 'ENERGY', 'ENVIRONMENT', 'HEALTH', 'HUMAN RIGHTS', 'INTERNATIONAL ECONOMIC', 'INTERNATIONAL POLITICAL', 'LEADER', 'MEDIA', 'MIGRATION', 'MILITARY', 'NARCOTICS', 'OTHER', 'PEACEKEEPING', 'PROLIFERATION', 'TECHNOLOGY', 'TELECOM', 'TERRORISM', 'TERRORISM.911USA', 'TERRORISM.911WEB', 'URGENT', 'End']

    Attribution_Labels = ['Real', 'Fake']
    # NOTE(review): Entailment_Labels is assigned but never read in this function.
    Entailment_Labels = ['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL']

    # permutations() yields pairs in input order, so the last 24 pairs are the
    # ('End', x) ones; [:-24] removes them. The self-pairs (x, x) are then added
    # for every label except 'End' (dropped by [:-1]).
    Topic_Bigram = list(itertools.permutations(topic_labels, 2))
    Topic_Bigram = Topic_Bigram[:-24] + [(elem,) + (elem,) for elem in  topic_labels][:-1]
    # Each label list is sorted; the sorted order becomes the column order of the
    # matching embedding frame below.
    # NOTE(review): this presumes the x_*_feature_wo_zeroshot helpers emit values in
    # the same sorted order - confirm against their definitions.
    Topic_Attribution_Labels = [elem + ('Real',) for elem in Topic_Bigram] + [ elem + ('Fake',) for elem in Topic_Bigram]
    Topic_Attribution_Labels.sort()
    # Only CONTRADICTION and ENTAILMENT are crossed with topic bigrams (no NEUTRAL).
    Topic_Entailment_labels = [elem + ('CONTRADICTION',) for elem in Topic_Bigram] + [ elem + ('ENTAILMENT',) for elem in Topic_Bigram]
    Topic_Entailment_labels.sort()
    Topic_Attribution_Labels_type1 = [(elem,) + ('Real',) for elem in topic_labels] + [ (elem,) + ('Fake',) for elem in topic_labels]
    Topic_Attribution_Labels_type1.sort()
    
    # All four ordered (attribution, attribution) pairs, each crossed with the
    # three entailment classes.
    Attr_bigrams = list(itertools.permutations(Attribution_Labels,2)) + [(elem,) + (elem,) for elem in  Attribution_Labels]
    Attr_Entailment_labels = [elem + ('CONTRADICTION',) for elem in Attr_bigrams] + [ elem + ('ENTAILMENT',) for elem in Attr_bigrams] + [ elem + ('NEUTRAL',) for elem in Attr_bigrams]
    Attr_Entailment_labels.sort()
    
    # Only the *_embedding results are used below; Topics_Attribution and
    # Topics_Attribution_type1 are unpacked but not read again.
    Topics_Attribution, Topics_Attribution_embedding, Topics_Attribution_type1, Topics_Attribution_embedding_type1 = x_attribute_feature_wo_zeroshot(
            attr_labels, topic_labels, Topic_Attribution_Labels, Topic_Attribution_Labels_type1, topics_for_sentences)
    
    # Likewise, Attribution_Entailment and Topics_Entailment are not read again.
    Attribution_Entailment, Attr_Entailment_embedding, Topics_Entailment, Topics_Entailment_embedding = x_entailment_feature_wo_zeroshot(attr_labels, entailment_feature_label, Attr_Entailment_labels, topic_labels, Topic_Entailment_labels, topics_for_sentences)
    
    # Each embedding is transposed into a frame and labelled with its tuple labels.
    df_Topics_Attribution_embedding = pd.DataFrame(Topics_Attribution_embedding).T
    df_Topics_Attribution_embedding.columns = Topic_Attribution_Labels

    df_Topics_Attribution_embedding_type1 = pd.DataFrame(Topics_Attribution_embedding_type1).T
    df_Topics_Attribution_embedding_type1.columns = Topic_Attribution_Labels_type1
    
    df_Attr_Entailment_embedding = pd.DataFrame(Attr_Entailment_embedding).T
    df_Attr_Entailment_embedding.columns = Attr_Entailment_labels
    
    df_Topics_Entailment_embedding = pd.DataFrame(Topics_Entailment_embedding).T
    df_Topics_Entailment_embedding.columns = Topic_Entailment_labels
    
    # Only the second element of Hypernym_Hyponym's return value is used.
    df_Hypernym_Hyponym = Hypernym_Hyponym(chunks)[1]
    
    # Column-wise concatenation; the order matters for the duplicate removal below,
    # which keeps the first occurrence of a repeated column name.
    df_high_level = pd.concat([df_entailment_type1, df_entailment_type2, df_personality, df_sentiment_type1, df_sentiment_type2, df_raw_doctopic, df_normalised_doctopic, df_adj_embeddings, df_verb_embeddings, df_noun_embeddings, df_markov_prediction, df_attr_embedding, df_attr_embedding_weighted, df_attr_embedding_doc2vec, df_attr_embedding_weighted_doc2vec, df_Topics_Attribution_embedding, df_Topics_Attribution_embedding_type1, df_Attr_Entailment_embedding, df_Topics_Entailment_embedding, df_Hypernym_Hyponym], axis=1)
    
    # Flatten tuple column labels into '_'-joined strings (3-tuples and 2-tuples);
    # non-tuple labels are kept as they are.
    col_names_final = []
    for x in df_high_level.columns:
        if type(x) == tuple:
            if len(x) == 3:
                col_names_final.append('{}_{}_{}'.format(x[0], x[1], x[2]))
            else :
                col_names_final.append('{}_{}'.format(x[0], x[1]))
        else:
            col_names_final.append(x)
            
    df_high_level.columns = col_names_final
    
    # Replace '.' with '_' in names (e.g. labels built from 'TERRORISM.911USA').
    # NOTE(review): the `"." in col` test assumes every column label is a string by now.
    col_names_final = [col.replace(".", "_") if "." in col else col for col in df_high_level.columns]
    df_high_level.columns = col_names_final
            
    # Keep only the first column for each repeated name, drop the two 'Fake'
    # weight columns, then select the columns listed in seq_of_col, in that order.
    df_high_level = df_high_level.loc[:, ~df_high_level.columns.duplicated()]
    df_high_level = df_high_level.drop(['Fake_weight', 'Fake_doc2vec_wt'], axis=1)
    df_high_level = df_high_level[seq_of_col]
    
    # Send message
    file_locker.acquire_file_lock('tmp/' + str(req_id) + ".pickle")
    stored_requests[req_id]["message"] = "Topic Attribution done..."
    with open('tmp/' + str(req_id) + ".pickle", "wb") as handle:
        pickle.dump(stored_requests[req_id], handle, protocol = pickle.HIGHEST_PROTOCOL)
    file_locker.delete_file_lock('tmp/' + str(req_id) + ".pickle")
    
    return(df_high_level, attr_labels,sentences)

In [45]:
# Compute DataFrame with SHAP contributions
def get_contrib_df(shap_base_value, shap_values, X_row, topx=None, cutoff=None, sort='abs', cols=None):
    """Tabulate per-feature SHAP contributions for a single observation.

    The result has one row per displayed feature, framed by a ``_BASE`` row (the
    SHAP base value), a ``_REST`` row (sum of all features that are not displayed)
    and a final ``_PREDICTION`` row (sum of all contributions). Columns are
    ``col``, ``contribution``, ``value``, ``cumulative`` and ``base``.

    Parameters
    ----------
    shap_base_value : contribution stored on the ``_BASE`` row.
    shap_values : one contribution per column of ``X_row``.
    X_row : single-row ``pd.DataFrame`` (e.g. ``X.iloc[[index]]``).
    topx : if given (and ``cutoff`` is not), show only the ``topx`` largest
        absolute contributions.
    cutoff : minimum absolute contribution for a feature to be displayed.
    sort : ``'abs'``, ``'high-to-low'`` or ``'low-to-high'`` select a row ordering.
        ``'importance'`` and ``None`` pass the assertion but, when ``cols`` is None,
        leave the raw per-feature rows without ``_BASE``/``_REST``.
    cols : explicit list of features to display, in this order; overrides
        ``topx``/``cutoff``/``sort``.

    Returns
    -------
    pd.DataFrame
    """
    assert isinstance(X_row, pd.DataFrame),\
        'X_row should be a pd.DataFrame! Use X.iloc[[index]]'
    assert len(X_row.iloc[[0]].values[0].shape) == 1,\
        """X is not the right shape: len(X.values[0]) should be 1. 
            Try passing X.iloc[[index]]""" 
    assert sort in {'abs', 'high-to-low', 'low-to-high', 'importance', None}

    def _ordered_by_magnitude(frame, **sort_kwargs):
        # Reorder rows of `frame` by the absolute value of their contribution.
        return frame.reindex(frame.contribution.abs().sort_values(**sort_kwargs).index)

    def _rest_row(frame):
        # Collapse every row of `frame` into a single '_REST' row.
        return frame.sum().to_frame().T.assign(col="_REST", value="")

    # Row carrying the SHAP base value
    base_row = pd.DataFrame({
        'col': ['_BASE'],
        'contribution': [shap_base_value],
        'value': ['']
    })

    # One row per feature of the observation
    result = pd.DataFrame({
        'col': X_row.columns,
        'contribution': shap_values,
        'value': X_row.values[0]
    })

    if cols is not None:
        # Caller picked the features: keep exactly those, in the requested order.
        is_selected = result.col.isin(cols)
        shown = result[is_selected].set_index('col').reindex(cols).reset_index()
        remainder = _rest_row(result[~is_selected])
        result = pd.concat([base_row, shown, remainder], ignore_index=True)
    else:
        if cutoff is None:
            cutoff = 0 if topx is None else result.contribution.abs().nlargest(topx).min()

        shown = result[result.contribution.abs() >= cutoff]
        if topx is not None and len(shown) > topx:
            # ties at the cutoff can let too many rows through
            shown = _ordered_by_magnitude(shown, ascending=False).head(topx)

        negatives = shown[shown.contribution < 0]
        positives = shown[shown.contribution >= 0]
        remainder = _rest_row(result[~result.col.isin(shown.col.tolist())])

        if sort == 'abs':
            # largest absolute contribution first
            ordered = _ordered_by_magnitude(shown, ascending=False)
            result = pd.concat([base_row, ordered, remainder], ignore_index=True)
        elif sort == 'high-to-low':
            positives = _ordered_by_magnitude(positives, ascending=False)
            negatives = _ordered_by_magnitude(negatives)
            result = pd.concat([base_row, positives, remainder, negatives], ignore_index=True)
        elif sort == 'low-to-high':
            positives = _ordered_by_magnitude(positives)
            negatives = _ordered_by_magnitude(negatives, ascending=False)
            result = pd.concat([base_row, negatives, remainder, positives], ignore_index=True)

    # Running total from top to bottom (used to draw a waterfall-style bar chart)
    result['cumulative'] = result.contribution.cumsum()
    result['base'] = result['cumulative'] - result['contribution']

    prediction_row = result[['contribution']].sum().to_frame().T.assign(
        col='_PREDICTION',
        value="",
        cumulative=lambda df: df.contribution,
        base=0)
    return pd.concat([result, prediction_row], ignore_index=True)

In [46]:
# Append dict to DataFrame
def append_dict_to_df(df, row_dict):
    """Return a new DataFrame made of ``df`` followed by one row built from ``row_dict``.

    ``df`` itself is not modified; the result gets a fresh 0..n-1 index.
    """
    new_row = pd.DataFrame([row_dict])
    frames = [df, new_row]
    return pd.concat(frames, ignore_index=True)

In [47]:
# Compute DataFrame with summary of SHAP contributions
def get_contrib_summary_df(contrib_df, model_output="raw", round=2, units="", na_fill=None):
    """Turn a contribution table into a two-column, human-readable summary.

    Parameters
    ----------
    contrib_df : DataFrame with at least the columns ``col``, ``contribution`` and
        ``value`` (the layout produced by ``get_contrib_df``).
    model_output : ``'raw'``, ``'probability'`` or ``'logodds'``. Probabilities are
        shown as percentages; raw output is followed by a space and ``units``.
    round : number of decimals kept in the effect (the name shadows the builtin
        and is kept for backward compatibility).
    units : suffix used for ``'raw'`` output.
    na_fill : if not None, a feature whose value equals it is reported as MISSING.

    Returns
    -------
    pd.DataFrame
        Columns ``Reason`` and ``Effect`` (both strings), one row per row of
        ``contrib_df``, in the same order.
    """
    assert model_output in {'raw', 'probability', 'logodds'}

    # Collect plain records and build the frame once at the end: concatenating
    # onto a DataFrame inside the loop is quadratic and warns on empty frames.
    summary_rows = []

    for _, row in contrib_df.iterrows():
        sign = '+' if row['contribution'] >= 0 else ''
        if row['col'] == '_BASE':
            reason = 'Average of population'
            effect = ""
        elif row['col'] == '_REST':
            reason = 'Other features combined'
            effect = sign
        elif row['col'] == '_PREDICTION':
            reason = 'Final prediction'
            effect = ""
        else:
            if na_fill is not None and row['value'] == na_fill:
                reason = f"{row['col']} = MISSING"
            else:
                reason = f"{row['col']} = {row['value']}"
            effect = sign

        if model_output == "probability":
            effect += str(np.round(100*row['contribution'], round))+'%'
        elif model_output == 'logodds':
            effect += str(np.round(row['contribution'], round))
        else:
            effect += str(np.round(row['contribution'], round)) + f" {units}"

        summary_rows.append(dict(Reason=reason, Effect=effect))

    # Explicit columns keep the expected layout even when contrib_df is empty.
    return pd.DataFrame(summary_rows, columns=['Reason', 'Effect']).reset_index(drop=True)

In [48]:
# Prediction Pipeline complete
def prediction_pipeline_complete(
    model_ll, text, vec,model_hl,vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, seq_of_col, Entailment_Classifier, Sentiment_Classifier, attr_model, zero_classifier, fake_threshold=0.183, real_threshold=0.841, headline=None, req_id=None):
    """Run Pipeline-1 and, when it is not confident enough, Pipeline-2.

    The low-level model ``model_ll`` scores the article first. If its probability
    for the positive class is at least ``real_threshold`` the article is reported
    as Real, if it is at most ``fake_threshold`` it is reported as Fake. Otherwise
    the high-level features are computed and ``model_hl`` decides with a 0.5 cut.

    Parameters
    ----------
    model_ll, vec : low-level classifier and the vectorizer passed to
        ``low_level_features``.
    text, headline : article body and optional headline.
    model_hl : high-level classifier used for the fallback.
    vec_for_personality ... zero_classifier : forwarded unchanged to
        ``high_level_features``.
    fake_threshold, real_threshold : confidence bounds of Pipeline-1.
    req_id : request id handed to ``high_level_features``, which reports progress
        through ``stored_requests[req_id]``. When omitted a random id is generated.

    Returns
    -------
    tuple
        ``(output, ftext, ftitle)``: the verdict message plus the text and title
        returned by ``low_level_features``.
    """
    features, ftext, ftitle = low_level_features(text, vec, headline)
    # Probability of the positive ("Real") class for the single article.
    real_proba = model_ll.predict_proba(features)[:,1][0]
    if real_proba >= real_threshold:
        output = "Well Well, the article you wanted to reveal was a cheap muggle trick and is handled by the 'Misuse of Muggle Artefacts Office'. A simple revelio charm did the trick and this article is Real with " + str(round(real_proba * 100,2)) + " % confidence"
    elif real_proba <= fake_threshold:
        output = "Well Well, the article you wanted to reveal was a cheap muggle trick and is handled by the 'Misuse of Muggle Artefacts Office'. A simple revelio charm did the trick and this article is Fake with " + str(round((1-real_proba)* 100,2)) + " % confidence"
    else:
        # high_level_features takes the request id as first argument and writes
        # its progress into stored_requests[req_id], so an entry must exist.
        # NOTE(review): assumes stored_requests is a dict of per-request dicts and
        # that an empty entry is sufficient - confirm against the request setup code.
        if req_id is None:
            req_id = secrets.token_hex(16)
        stored_requests.setdefault(req_id, {})

        # high_level_features returns (features, attr_labels, sentences); only the
        # feature frame is a valid input for the classifier.
        hl_features, _attr_labels, _sentences = high_level_features(req_id, text, vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, seq_of_col, Entailment_Classifier, Sentiment_Classifier, attr_model, zero_classifier)
        real_proba = model_hl.predict_proba(hl_features)[:,1][0]
        if real_proba >= 0.5:
            output = "We are in Pipeline-2 and This News is Real with " + str(round(real_proba * 100,2)) + " % confidence"
        else:
            output = "We are in Pipeline-2 and This News is Fake with " + str(round((1-real_proba)* 100,2)) + " % confidence"

    return (output, ftext, ftitle)

In [49]:
# Prediction Pipeline-1
def prediction_pipeline_1(model_ll, text, vec, fake_threshold=0.183, real_threshold=0.841, headline=None):
    """Classify an article with the low-level model only.

    Returns ``(output, ftext, ftitle, status, features)`` where ``status`` is
    1 / -1 for a confident Real / Fake verdict and 2 / -2 for an unconfident
    Real / Fake guess (probability between the two thresholds).
    """
    features, ftext, ftitle = low_level_features(text, vec, headline)
    hard_label = model_ll.predict(features)
    real_proba = model_ll.predict_proba(features)[:,1]

    # Confident verdicts return right away.
    if real_proba >= real_threshold:
        message = "Well Well, the article you wanted to reveal was a cheap muggle trick and is handled by the 'Misuse of Muggle Artefacts Office'. A simple revelio charm did the trick and this article is Real with " + str(round(real_proba[0] * 100,2)) + " % confidence"
        return (message, ftext, ftitle, 1, features)

    if real_proba <= fake_threshold:
        message = "Well Well, the article you wanted to reveal was a cheap muggle trick and is handled by the 'Misuse of Muggle Artefacts Office'. A simple revelio charm did the trick and this article is Fake with " + str(round((1-real_proba[0])* 100,2)) + " % confidence"
        return (message, ftext, ftitle, -1, features)

    # In between the thresholds: fall back on the model's hard label.
    shown_confidence = real_proba[0]
    if hard_label[0] == 0:
        verdict, status = "Fake", -2
        shown_confidence = 1-shown_confidence
    else:
        verdict, status = "Real", 2
    message = "The prediction is not very confident, but we assume it is " + verdict + " with " + str(round(shown_confidence* 100,2)) + " % confidence"

    return (message, ftext, ftitle, status, features)

In [50]:
# Compute contribution explanations
def get_contribution_explain(model_hl, features):
    """Explain one prediction of a tree model as a Reason/Effect table.

    A ``shap.TreeExplainer`` is built for ``model_hl``, the SHAP values of
    ``features`` are computed once, and the first entry of the result is passed
    through ``get_contrib_df`` (sorted by absolute contribution) and
    ``get_contrib_summary_df``.

    Parameters
    ----------
    model_hl : fitted model supported by ``shap.TreeExplainer``.
    features : single-row ``pd.DataFrame`` holding the observation to explain.

    Returns
    -------
    pd.DataFrame
        Summary with the columns ``Reason`` and ``Effect``.
    """
    explainer = shap.TreeExplainer(model_hl)
    # Computed a single time: the previous extra `explainer(features)` call
    # repeated the same work and its result was never used.
    shap_values = explainer.shap_values(features)
    contrib_df = get_contrib_df(explainer.expected_value, shap_values[0], features, topx=None, cutoff=None, sort='abs', cols=None)
    return get_contrib_summary_df(contrib_df, model_output="raw", round=2, units="", na_fill=None)

In [51]:
# Compute figure arrow size
def figure_arrow_size(value, arrow_cuts):
    """Map a contribution to a signed arrow size.

    The size is ``len(arrow_cuts) - i`` for the last position ``i`` whose cut is
    strictly greater than ``abs(value)``, or ``len(arrow_cuts)`` when no cut is.
    The sign of ``value`` is carried over to the result.
    """
    magnitude = abs(value)
    n_cuts = len(arrow_cuts)

    # Positions of all cuts lying strictly above the magnitude; the last one wins.
    above = [position for position, cut in enumerate(arrow_cuts) if cut > magnitude]
    size = n_cuts - above[-1] if above else n_cuts

    return -1 * size if value < 0 else size

In [52]:
# Prediction Pipeline-1 child
def predict_pipeline1_child(req_id, text, title):
    """Run Pipeline-1 for a request and store its result, explanation and text split.

    The prediction of ``prediction_pipeline_1`` is explained with
    ``get_contribution_explain``; rows 1-4 of that table (the rows right after the
    first one) become "arrows" of the form ``(name, value, arrow size, explanation)``.
    The processed text is also cut into three parts by sentence. Everything is
    written to ``stored_requests[req_id]`` and pickled to ``tmp/<req_id>.pickle``
    while the file lock is held.

    Parameters
    ----------
    req_id : key into the module-level ``stored_requests`` mapping.
    text : article text.
    title : article headline, forwarded as ``headline``.
    """
    global file_locker

    prediction, ftext, ftitle, status, ll_feat = prediction_pipeline_1(model_ll, text, tf_idf_vectorizer_ll, fake_threshold=0.183, real_threshold=0.841, headline=title)

    # Create arrows from feature contributions
    df_explain = get_contribution_explain(model_ll, ll_feat)
    top_rows = [df_explain.iloc[i] for i in range(1, 5)]
    effects = [float(row["Effect"]) for row in top_rows]
    magnitudes = [abs(effect) for effect in effects]
    # 0 and 1000 are the starting bounds of the original running max/min search.
    max_explain = max([0] + magnitudes)
    min_explain = min([1000] + magnitudes)
    one_third = (max_explain-min_explain) / 3.0
    # Descending thresholds splitting [min, max] into three bands.
    arrow_cuts = [max_explain, min_explain + 2*one_third, min_explain + one_third]

    explanation_arrows = []
    for row, value in zip(top_rows, effects):
        # Reasons look like "<feature name> = <feature value>".
        split_reason = row["Reason"].split(" = ")
        name_expl = split_reason[0]
        number_expl = round(float(split_reason[1]),3)
        if name_expl in explain_dict_ll:
            name_expl, expl_expl = explain_dict_ll[name_expl]
        else:
            expl_expl = "The tf-idf of the word " + name_expl + ". tf-idf is the count of the word in relation to their total appearance in all news. A high value indicates a word, that appears often inside the text, but not often in other news. It denotes therefore an importance for distinguishing articles from each other."
        explanation_arrows.append((name_expl, number_expl, figure_arrow_size(value, arrow_cuts), expl_expl))

    # Split the text into three consecutive sentence ranges. This is done before
    # taking the lock so the lock is only held for the actual update.
    sent = sent_tokenize(ftext)
    a_third = int(len(sent) / 2.25)
    sec_third = int((len(sent) - a_third) / 2) + a_third
    split_text = [" ".join(sent[0:a_third]), " ".join(sent[a_third:sec_third]), " ".join(sent[sec_third:len(sent)])]
    # Text without any sentence has nothing to quote; avoid an IndexError.
    quote = sent[a_third] if sent else ""

    pickle_path = 'tmp/' + str(req_id) + ".pickle"
    file_locker.acquire_file_lock(pickle_path)
    try:
        stored_requests[req_id]["explanation"] = explanation_arrows
        stored_requests[req_id]["prediction"] = prediction
        stored_requests[req_id]["text"] = ftext
        stored_requests[req_id]["title"] = ftitle
        stored_requests[req_id]["status"] = status
        stored_requests[req_id]["message"] = "Pipeline 1 has finished."
        stored_requests[req_id]["split_info"] = [a_third, sec_third, len(sent)]
        stored_requests[req_id]["split_text"] = split_text
        stored_requests[req_id]["quote"] = quote
        with open(pickle_path, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Release the lock even when the update or the dump fails.
        file_locker.delete_file_lock(pickle_path)

In [53]:
# Wizard explainability
def wizard_explainability(model_explain, features, col_explain=None):
    """Start an ExplainerDashboard for ``model_explain`` and return it.

    When ``col_explain`` is given (and truthy) only those columns of ``features``
    are handed to the explainer. The dashboard is run in "external" mode on
    port 8052.
    """
    explained_features = features[col_explain] if col_explain else features
    explainer = ClassifierExplainer(model_explain, explained_features, model_output='probability')
    dashboard = ExplainerDashboard(
        explainer,
        title="FACADE",
        contributions=False,
        hide_whatifpdp=True,
        bootstrap=dbc.themes.SKETCHY,
    )
    dashboard.run(mode="external", port=8052)
    return dashboard

In [54]:
# Highlighting sentences data
def highlighting_sentences_data(Attribution_labels, trained_data, all_sentences_training, sentences_query):
    """Pair every query sentence with the data needed to highlight it.

    Parameters
    ----------
    Attribution_labels : one entry per query sentence, indexable as
        ``(weight, label, source)``. For labels other than ``'Unknown'``,
        ``source`` is ``(document index, sentence index)`` into the training data.
    trained_data : indexable as ``trained_data['domain'][doc]`` and
        ``trained_data['title'][doc]``.
    all_sentences_training : ``all_sentences_training[doc][sentence]`` gives the
        matched training sentence.
    sentences_query : sentences of the article, aligned with ``Attribution_labels``.

    Returns
    -------
    list of tuple
        ``(sentence, 'Unknown')`` for unknown sentences, otherwise
        ``(sentence, label, weight, domain, title, escaped training sentence,
        max_alpha, min_alpha)``. The alpha bounds are the max/min weight among the
        sentences sharing the label (``'Real'`` bounds for ``'Real'``, ``'Fake'``
        bounds for anything else), or 1/0 when fewer than two sentences carry it.
        An empty input gives an empty list.
    """
    # Nothing to highlight; without this guard the column assignment below would
    # fail because a DataFrame built from an empty list has no columns.
    if len(Attribution_labels) == 0:
        return []

    df_coeff = pd.DataFrame(
        [(sentences_query[i], Attribution_labels[i]) for i in range(len(Attribution_labels))])
    df_coeff.columns = ['word', 'num_code']
    # Unpack (weight, label, source); 'num_code' is overwritten last because the
    # two other columns are read from it.
    df_coeff['label'] = [elem[1] for elem in df_coeff['num_code']]
    df_coeff['Source'] = [elem[2] for elem in df_coeff['num_code']]
    df_coeff['num_code'] = [elem[0] for elem in df_coeff['num_code']]

    def _alpha_bounds(label_name):
        # (min, max) weight for one label; fixed to (0, 1) below two sentences.
        codes = df_coeff[df_coeff['label'] == label_name]['num_code']
        if len(codes) < 2:
            return 0, 1
        return min(codes), max(codes)

    min_alpha_real, max_alpha_real = _alpha_bounds("Real")
    min_alpha_fake, max_alpha_fake = _alpha_bounds("Fake")

    highlighted_text = []
    for i in range(len(df_coeff)):
        word = df_coeff['word'][i]
        weight = df_coeff['num_code'][i]
        label = df_coeff['label'][i]
        if label == 'Unknown':
            highlighted_text.append((word, "Unknown"))
            continue

        doc_index, sentence_index = df_coeff['Source'][i][0], df_coeff['Source'][i][1]
        Domain_of_Source = trained_data['domain'][doc_index]
        Title_of_Source = trained_data['title'][doc_index]
        # The training sentence is escaped so it can be embedded in HTML safely.
        Sentence = html.escape(all_sentences_training[doc_index][sentence_index])
        if label == 'Real':
            min_alpha, max_alpha = min_alpha_real, max_alpha_real
        else:
            min_alpha, max_alpha = min_alpha_fake, max_alpha_fake

        highlighted_text.append((word, label, weight, Domain_of_Source, Title_of_Source, Sentence, max_alpha, min_alpha))

    return(highlighted_text)

In [55]:
# Prediction Pipeline-2
def prediction_pipeline_2(req_id, model_hl, text, vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, seq_of_col, Entailment_Classifier, Sentiment_Classifier, attr_model, zero_classifier=None):
    """Run the high-level classifier for one request and persist the outcome.

    The function extracts high-level features from ``text``, classifies them
    with ``model_hl``, builds the sentence highlights and the explanation
    arrows, and stores everything in ``stored_requests[req_id]``. The request
    entry is pickled to ``tmp/<req_id>.pickle`` twice: once as a progress
    message after the highlights are generated and once with the final result.

    Parameters
    ----------
    req_id : key of the request inside the module-level ``stored_requests``.
    model_hl : classifier exposing ``predict_proba``; column 1 of its output
        is compared against 0.5 ("Real" when >= 0.5, otherwise "Fake").
    text : article text handed to the feature extractor.
    zero_classifier : when truthy, ``high_level_features`` is used and the
        result of ``wizard_explainability`` is stored in the global
        ``running_db``; otherwise ``high_level_features_wo_zeroshot`` is used.
    All remaining parameters are forwarded unchanged to the feature extractor.

    Returns
    -------
    None. Results are communicated through ``stored_requests`` and the pickle
    file only.

    Notes
    -----
    Relies on the module-level names ``stored_requests``, ``file_locker``,
    ``all_sentences_training``, ``model_explain``, ``col_explain`` and
    ``explain_dict_hl``.
    """
    global running_db

    pickle_path = 'tmp/' + str(req_id) + ".pickle"

    if zero_classifier:
        features, attr_labels, sentences_query = high_level_features(req_id, text, vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, seq_of_col, Entailment_Classifier, Sentiment_Classifier, attr_model, zero_classifier)
    else:
        features, attr_labels, sentences_query = high_level_features_wo_zeroshot(req_id, text, vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, seq_of_col, Entailment_Classifier, Sentiment_Classifier, attr_model)
    prediction_proba = model_hl.predict_proba(features)[:,1]
    print(features.columns)
    status = 0
    if prediction_proba[0] >= 0.5:
        output = "We are in Pipeline-2 and This News is Real with " + str(round(prediction_proba[0] * 100,2)) + " % confidence"
        status = 1
    else:
        output = "We are in Pipeline-2 and This News is Fake with " + str(round((1-prediction_proba[0])* 100,2)) + " % confidence"
        status = -1

    if zero_classifier:
        running_db = wizard_explainability(model_explain, features, col_explain)
    else:
        print("No wizardry without zero shot at the moment!")

    highlighted_text = highlighting_sentences_data(attr_labels, trained_data, all_sentences_training, sentences_query)

    # Send message
    file_locker.acquire_file_lock(pickle_path)
    try:
        stored_requests[req_id]["message"] = "Generated highlights..."
        with open(pickle_path, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        # Always release the lock, even if writing the pickle failed;
        # otherwise every later reader/writer of this file would wait forever.
        file_locker.delete_file_lock(pickle_path)

    # Create arrows from feature contributions.
    # Only rows 1..4 of the contribution table are used.
    df_explain = get_contribution_explain(model_hl, features)
    top_rows = [df_explain.iloc[i] for i in range(1,5)]
    effects = [float(row["Effect"]) for row in top_rows]

    # Smallest / largest absolute effect among those rows (the minimum is
    # capped at 1000 by its initial value).
    max_explain = 0
    min_explain = 1000
    for value in effects:
        abs_expl = abs(value)
        if abs_expl > max_explain:
            max_explain = abs_expl
        if abs_expl < min_explain:
            min_explain = abs_expl

    # Split the [min, max] range into three equal bands for the arrow sizes.
    one_third = (max_explain-min_explain) / 3.0
    arrow_cuts = [max_explain, min_explain + 2*one_third, min_explain + one_third]

    explanation_arrows = []
    for row, value in zip(top_rows, effects):
        # "Reason" is expected to look like "<feature name> = <number>".
        split_reason = row["Reason"].split(" = ")
        name_expl = split_reason[0]
        number_expl = round(float(split_reason[1]),3)
        if name_expl in explain_dict_hl:
            # Swap the raw feature name for its display name and description.
            name_expl,expl_expl = explain_dict_hl[name_expl]
        else:
            expl_expl = "No explanation given, yet."
        explanation_arrows.append((name_expl, number_expl, figure_arrow_size(value, arrow_cuts), expl_expl))

    file_locker.acquire_file_lock(pickle_path)
    try:
        stored_requests[req_id]["attribution"] = highlighted_text
        stored_requests[req_id]["prediction"] = output
        # Two split indices over the query sentences, plus the total count.
        a_third = int(len(sentences_query) / 2.25)
        sec_third = int((len(sentences_query) - a_third) / 2) + a_third
        stored_requests[req_id]["split_info"] = [a_third, sec_third, len(sentences_query)]
        stored_requests[req_id]["status"] = status
        stored_requests[req_id]["wizardry"] = zero_classifier is not None
        stored_requests[req_id]["message"] = "Pipeline 2 has finished."

        stored_requests[req_id]["explanation"] = explanation_arrows

        with open(pickle_path, "wb") as handle:
            pickle.dump(stored_requests[req_id], handle, protocol=pickle.HIGHEST_PROTOCOL)
    finally:
        file_locker.delete_file_lock(pickle_path)

---

## Flask App

In [56]:
# Flask App
# Static files are exposed under the '/assets' URL prefix.
app = Flask(import_name=__name__, static_url_path='/assets')
# A new random 32-byte secret key is generated each time this cell runs.
app.config["SECRET_KEY"] = secrets.token_bytes(32)


# Flask routes
@app.route('/', methods=['GET'])
def home():
    """Answer GET requests on '/' with the rendered ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page


@app.route('/', methods=['POST'])
def webapp():
    """Classify the article submitted through the HTML form on '/'.

    Reads the ``text`` and ``title`` form fields, runs the complete prediction
    pipeline with the module-level models, and re-renders ``index.html`` with
    the submitted text and the pipeline's result.
    """
    form = request.form
    article_text = form['text']
    headline = form['title']

    result = prediction_pipeline_complete(
        model_ll,
        article_text,
        tf_idf_vectorizer_ll,
        model_hl,
        vec_for_personality,
        model_for_personality,
        dberta_model,
        model_for_topic,
        models_for_adj,
        glove_embeddings,
        trained_markov_real_structure,
        trained_markov_fake_structure,
        trained_dberta_embeddings,
        trained_data,
        mapping_sent_doc,
        doc2vec_model,
        seq_of_col,
        Entailment_Classifier,
        Sentiment_Classifier,
        attr_model,
        zero_classifier,
        fake_threshold=0.183,
        real_threshold=0.841,
        headline=headline,
    )

    return render_template('index.html', text=article_text, result=result)


@app.route('/predict1/', methods=['GET', 'POST'])
def predict_pipeline1():
    """Register a new prediction request and start Pipeline 1 in the background.

    Reads ``text`` and ``title`` from the query string, stores a fresh entry in
    the module-level ``stored_requests`` under a newly drawn request ID, and
    launches ``predict_pipeline1_child`` in a separate thread.

    Returns
    -------
    JSON with ``request_id`` and the initial request entry as ``response``.
    """
    check_for_old_requests()
    text = request.args.get("text")
    title = request.args.get("title")

    # Redraw until the ID is unused, so a random collision can never overwrite
    # the state of a request that is still being processed.
    req_id = random.randrange(3000000)
    while req_id in stored_requests:
        req_id = random.randrange(3000000)

    stored_requests[req_id] = { "time" : time.time(), "status": 0, "text": text, "title": title, "message": "Stay put for Pipeline 1 to finish." }
    p = threading.Thread(target=predict_pipeline1_child, args=(req_id, text, title,))
    p.start()

    return jsonify(request_id=req_id, response=stored_requests[req_id])


@app.route('/predict2/', methods=['GET', 'POST'])
def predict_pipeline2():
    """Start Pipeline 2 for an already registered request.

    Query parameters
    ----------------
    request_id : integer ID of an entry in the module-level ``stored_requests``.
    type : optional integer; 0 (default) runs the zero-shot variant, any other
        value runs the variant without zero-shot classification.

    Returns
    -------
    JSON with ``request_id`` and the request entry as ``response``. If the ID
    is not (or no longer) present in ``stored_requests``, ``response`` is
    ``{"status": 404}`` and nothing is started.
    """
    check_for_old_requests()
    req_id = int(request.args.get("request_id"))

    # Reject unknown IDs up front, mirroring predict_status, instead of failing
    # with a KeyError after the running dashboard has already been terminated.
    if req_id not in stored_requests:
        return jsonify(request_id=req_id, response={"status": 404})

    if running_db:
        running_db.terminate(8052)

    type_id = 0
    if request.args.get("type"):
        type_id = int(request.args.get("type"))
    stored_requests[req_id]["time"] = time.time()
    stored_requests[req_id]["status"] = 0
    stored_requests[req_id]["message"] = "Pipeline 2 will take over, so keep calm."

    # The two variants differ only in the model, the column order and whether
    # a zero-shot classifier is passed along.
    if type_id == 0:
        selected_model, selected_columns, selected_zero_shot = model_hl, seq_of_col, zero_classifier
    else:
        selected_model, selected_columns, selected_zero_shot = model_hl_wo_zeroshot, seq_of_col_wo_zeroshot, None

    p = threading.Thread(target=prediction_pipeline_2, args=(req_id, selected_model, stored_requests[req_id]["text"], vec_for_personality, model_for_personality, dberta_model, model_for_topic, models_for_adj, glove_embeddings, trained_markov_real_structure, trained_markov_fake_structure, trained_dberta_embeddings, trained_data, mapping_sent_doc, doc2vec_model, selected_columns, Entailment_Classifier, Sentiment_Classifier, attr_model, selected_zero_shot))
    p.start()

    return jsonify(request_id=req_id, response=stored_requests[req_id])


@app.route('/predict_status/', methods=['GET', 'POST'])
def predict_status():
    """Report the latest state of a request, waiting briefly for an update.

    Query parameters
    ----------------
    request_id : integer ID of the request.
    timeout : optional float, maximum number of seconds to wait for the
        request's pickle file to appear (default 2.0).

    Returns
    -------
    JSON with ``request_id``, ``response`` and ``timeout``:
    * unknown ID -> ``response={"status": 404}``, ``timeout=False``;
    * no pickle file within the wait window -> the stored entry, ``timeout=True``;
    * otherwise the entry loaded from ``tmp/<request_id>.pickle`` (which is
      deleted after reading and replaces the stored entry), ``timeout=False``.
    """
    req_id = int(request.args.get("request_id"))
    if req_id not in stored_requests:
        return jsonify(request_id=req_id, response={"status":404}, timeout=False)
    max_timeout = 2.0
    if request.args.get("timeout"):
        max_timeout = float(request.args.get("timeout"))

    pickle_path = 'tmp/' + str(req_id) + ".pickle"

    # Poll in 0.25 s steps until a worker has written an update.
    timeout = 0
    while not os.path.exists(pickle_path):
        if timeout > max_timeout:
            return jsonify(request_id=req_id, response=stored_requests[req_id], timeout=True)
        time.sleep(0.25)
        timeout += 0.25

    file_locker.acquire_file_lock(pickle_path)
    try:
        # The pickle is produced by this application's own worker threads.
        with open(pickle_path, "rb") as handle:
            keyed_request = pickle.load(handle)
        os.remove(pickle_path)
        time.sleep(0.25)
    finally:
        # Release the lock even if reading/removing failed (e.g. another poller
        # consumed the file first); otherwise later writers would block forever.
        file_locker.delete_file_lock(pickle_path)
    stored_requests[req_id] = keyed_request

    return jsonify(request_id=req_id, response=stored_requests[req_id], timeout=False)


# Run Flask App
# Start Flask's built-in server only when this code is executed as the main
# module. Debug mode is switched on and the automatic reloader is switched off.
# NOTE(review): debug mode is meant for local use only - confirm this server is
# never exposed beyond localhost.
if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [22/Oct/2022 05:35:41] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/web/assets/mobirise-icons-bold/mobirise-icons-bold.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/web/assets/mobirise-icons/mobirise-icons.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/web/assets/mobirise-icons2/mobirise2.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/bootstrap/css/bootstrap.min.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/bootstrap/css/bootstrap-grid.min.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/bootstrap/css/bootstrap-reboot.min.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/parallax/jarallax.css HTTP/1.1[0m" 304 -
127.0.0.1 - - [22/Oct/2022 05:35:41] "[36mGET /assets/animatecss/animate.css HTTP/1

Index(['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT', 'CONTRADICTION_norm',
       'NEUTRAL_norm', 'ENTAILMENT_norm', 'Extrovert', 'Sensing', 'Feeling',
       'Perceiving',
       ...
       'war_government_CONTRADICTION', 'war_government_ENTAILMENT',
       'war_health_CONTRADICTION', 'war_health_ENTAILMENT',
       'war_politics_CONTRADICTION', 'war_politics_ENTAILMENT',
       'war_sports_CONTRADICTION', 'war_sports_ENTAILMENT',
       'war_war_CONTRADICTION', 'war_war_ENTAILMENT'],
      dtype='object', length=3061)
Generating self.shap_explainer = shap.TreeExplainer(model)
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
No y labels were passed to the Explainer, so setting model_summary=False...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees p