### Imports

In [3]:
import json
import logging
logging.getLogger().setLevel(logging.INFO) # instead of .INFO
import re
import nltk
nltk.download('wordnet')
from multiprocessing.pool import ThreadPool
from bs4 import BeautifulSoup
from scipy.special import softmax
from web_search import get_html, WebParser, duckduckgo_search
from flask import request, Flask, jsonify
from flask_cors import CORS
from scipy import spatial
from faspect import Faspect
from flask import request, Flask
from flask_cors import CORS, cross_origin
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np
import torch.nn.functional as F
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
import time
import math
import random
import re
import argparse
import webbrowser
from datetime import datetime, timedelta
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score
from sentence_transformers.cross_encoder import CrossEncoder

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dfingerlos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Parameters

In [90]:
# Number of web snippets per batch handed to the facet extractor
# (the facet extraction is trained on a snippet count of 20).
snippetsCount = 20
# Maximum number of snippet batches built per query (loop bound in constructSnippets).
snippetsBatchCnt = 4
# Checkpoint path passed to LitModel.load_from_checkpoint.
# Alternative noted by the author: "../hf_v1_0_Model.ckpt"
modelpath = "hf_v2_6_Model_20f.ckpt"
# Number of leading facets appended to the query when building the generation prompt.
fascetCount4Generation = 20
# Cross-encoder score a facet has to exceed to be reported as a "best" facet.
fascetThreshold = 0.80
# Path passed to CrossEncoder(...) in getBestFascets.
CEmodel_path = "../output/training_F0_7-2022-06-20_16-30-36"

### Webcrawler

In [5]:
class Quin:
    """Web-evidence crawler.

    Runs a DuckDuckGo search for a query, downloads the result pages and cuts
    their text blocks into snippets of a few sentences each.
    """

    def __init__(self):
        """Load the punkt sentence tokenizer and create the (CORS-enabled) Flask app."""
        nltk.download('punkt')

        self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Register extra abbreviations with the tokenizer's abbreviation set.
        self.sent_tokenizer._params.abbrev_types.update(['e.g', 'i.e', 'subsp'])

        self.app = Flask(__name__)
        CORS(self.app)

        logging.info('Initialized!')

    def extract_snippets(self, text, sentences_per_snippet=4):
        """Split ``text`` into snippets of ``sentences_per_snippet`` sentences.

        Args:
            text: Raw text to tokenize into sentences.
            sentences_per_snippet: Number of consecutive sentences joined into
                one snippet; the last snippet may contain fewer.

        Returns:
            List of snippet strings. Snippets with four or fewer
            space-separated words are dropped.

        Raises:
            ValueError: If ``sentences_per_snippet`` is smaller than 1 (the
                previous implementation never terminated in that case).
        """
        if sentences_per_snippet < 1:
            raise ValueError('sentences_per_snippet must be at least 1')

        sentences = self.sent_tokenizer.tokenize(text)
        snippets = []
        # Stepping by the snippet size already covers the trailing partial
        # chunk, so no separate "remainder" pass is needed.
        for start in range(0, len(sentences), sentences_per_snippet):
            snippet = ' '.join(sentences[start:start + sentences_per_snippet])
            if len(snippet.split(' ')) > 4:
                snippets.append(snippet)
        return snippets

    def search_web_evidence(self, query):
        """Search the web for ``query`` and return text snippets from the hits.

        Args:
            query: Search string passed to ``duckduckgo_search``.

        Returns:
            List of snippet strings collected from all pages that could be
            downloaded and parsed. Pages that fail are logged and skipped.
        """
        logging.info('searching the web...')
        urls = duckduckgo_search(query, pages=2)
        logging.info('downloading {} web pages...'.format(len(urls)))
        search_results = []

        def download(url):
            data = get_html(url)
            soup = BeautifulSoup(data, features='lxml')
            # Pages without a <title> tag must not be discarded as a whole.
            title = soup.title.string if soup.title is not None else None
            parser = WebParser()
            parser.feed(data)
            page_snippets = [
                {'snippet': snippet, 'url': url, 'title': title}
                for block in parser.blocks
                if block.count(' ') > 20
                for snippet in self.extract_snippets(block)
            ]
            search_results.extend(page_snippets)

        def timeout_download(url):
            pool = ThreadPool(1)
            try:
                pool.apply_async(download, [url]).get(timeout=5)
            except Exception as exc:
                # Best effort: one broken or slow page must not abort the search,
                # but the failure should be visible in the log.
                logging.warning('skipping %s: %r', url, exc)
            finally:
                # NOTE(review): join() waits for the worker thread, so a download
                # that exceeded the 5s timeout is still waited for here (and its
                # snippets are still appended) - confirm whether that is intended.
                pool.close()
                pool.join()

        workers = ThreadPool(32)
        try:
            workers.map(timeout_download, urls)
        finally:
            workers.close()
            workers.join()

        snippets = [result['snippet'] for result in search_results]
        logging.info('done searching')

        return snippets

In [8]:
def utf8Decoder(inputStr):
    """Resolve escape sequences in ``inputStr`` and re-read the result as UTF-8.

    The string is converted in four steps (every step ignores characters or
    bytes it cannot handle): latin1 bytes -> unicode-escape text ->
    latin1 bytes -> UTF-8 text.
    """
    escaped_bytes = inputStr.encode('latin1', errors='ignore')
    unescaped_text = escaped_bytes.decode('unicode-escape', errors='ignore')
    utf8_bytes = unescaped_text.encode('latin1', errors='ignore')
    return utf8_bytes.decode('utf8', errors='ignore')

In [113]:
def constructSnippets(res):
    """Group the crawler response into batches of decoded snippets.

    Batch size and maximum number of batches come from the module-level
    parameters ``snippetsCount`` and ``snippetsBatchCnt``.

    Args:
        res: List of raw snippet strings returned by the web crawler.

    Returns:
        List of batches, each a list of snippets passed through
        ``utf8Decoder``. Batching stops as soon as the remaining snippets
        cannot fill another complete batch. An empty ``res`` yields ``[]``
        (previously a single empty batch, ``[[]]``, was returned).
    """
    if not res:
        return []

    # Use the same integer batch size for counting and for slicing.
    batch_size = int(snippetsCount)
    batches = []

    for _ in range(snippetsBatchCnt):
        batches.append([utf8Decoder(raw) for raw in res[:batch_size]])
        res = res[batch_size:]

        # Too few entries left in res to fill the next batch completely => stop.
        if len(res) < batch_size:
            break

    return batches

### Generate Clarifying Question

In [12]:
class LitModel(pl.LightningModule):
  ''' LightningModule that wraps a seq2seq model together with its tokenizer.

      learning_rate: learning rate handed to the Adam optimizer
      tokenizer:     tokenizer providing pad_token_id and decode()
      model:         wrapped model (must offer get_encoder() and generate())
      hparams:       namespace with the flags freeze_encoder and freeze_embeds
  '''
  # Instantiate the model
  def __init__(self, learning_rate, tokenizer, model, hparams):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate
    self.save_hyperparameters(hparams)

    if self.hparams.freeze_encoder:
      freeze_params(self.model.get_encoder())

    if self.hparams.freeze_embeds:
      self.freeze_embeds()

  def freeze_embeds(self):
    ''' freeze the positional embedding parameters of the model; adapted from finetune.py '''
    freeze_params(self.model.model.shared)
    for d in [self.model.model.encoder, self.model.model.decoder]:
      freeze_params(d.embed_positions)
      freeze_params(d.embed_tokens)

  # Do a forward pass through the model
  def forward(self, input_ids, **kwargs):
    return self.model(input_ids, **kwargs)

  def configure_optimizers(self):
    ''' Adam over all parameters of this module, using self.learning_rate '''
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

  def _batch_loss(self, batch):
    ''' Cross-entropy loss of one batch (shared by training and validation).
        batch is indexed as (source ids, source attention mask, target ids). '''
    src_ids, src_mask = batch[0], batch[1]
    tgt_ids = batch[2]
    # Use the tokenizer owned by this module, not a module-level global, so the
    # class does not depend on notebook state.
    pad_token_id = self.tokenizer.pad_token_id
    # Shift the decoder tokens right (but NOT the tgt_ids).
    # NOTE(review): shift_tokens_right is not defined or imported in the visible
    # cells - confirm where it comes from before training with this class.
    decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)

    # Run the model and get the logits
    outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]
    # Padding positions are excluded from the loss
    ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
    # Calculate the loss on the un-shifted tokens
    return ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

  def training_step(self, batch, batch_idx):
    ''' Returns {'loss': training loss of the batch} '''
    return {'loss': self._batch_loss(batch)}

  def validation_step(self, batch, batch_idx):
    ''' Returns {'loss': validation loss of the batch} '''
    return {'loss': self._batch_loss(batch)}

  # Method that generates text using the wrapped model's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 40):
    ''' Function to generate text

        text:           mapping with "input_ids" and "attention_mask"
        eval_beams:     number of beams for beam search
        early_stopping: forwarded to generate()
        max_len:        maximum length of the generated sequence
        returns:        list of decoded strings, one per generated sequence
    '''
    generated_ids = self.model.generate(
        text["input_ids"],
        attention_mask=text["attention_mask"],
        use_cache=True,
        decoder_start_token_id = self.tokenizer.pad_token_id,
        num_beams= eval_beams,
        max_length = max_len,
        early_stopping = early_stopping
    )
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=True) for w in generated_ids]

def freeze_params(model):
  ''' Function that takes a model as input (or part of a model) and freezes the layers for faster training
      adapted from finetune.py

      Sets requires_grad = False on every parameter yielded by model.parameters().
      (The attribute was previously misspelled as "requires_grade", which only
      created an unused attribute and left every parameter trainable.) '''
  for param in model.parameters():
    param.requires_grad = False

In [13]:
# Hyperparameter namespace handed to LitModel: freeze flags plus the beam count.
hparams = argparse.Namespace(
    freeze_encoder=True,
    freeze_embeds=True,
    eval_beams=4,
)

In [14]:
# Load the 'facebook/bart-base' tokenizer and conditional-generation model.
# Both become module-level names: `tokenizer` is used by generate_prediction,
# and both are passed to LitModel.load_from_checkpoint further down.
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', add_prefix_space=True)
bart_model = BartForConditionalGeneration.from_pretrained(
    "facebook/bart-base")

In [15]:
def generate_prediction(seed_line, model_):
    """Generate the clarifying question for ``seed_line``.

    Args:
        seed_line: Prompt text; it is tokenized with the module-level
            ``tokenizer`` and truncated to 192 tokens.
        model_: Model offering ``to``, ``eval`` and ``generate_text``.

    Returns:
        The list returned by ``model_.generate_text`` (run with 8 beams).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model to the target device and switch it to inference mode.
    model_.to(device)
    model_.eval()

    encoded_prompt = tokenizer(
        seed_line,
        max_length=192,
        return_tensors="pt",
        truncation=True,
    ).to(device)
    return model_.generate_text(encoded_prompt, eval_beams=8)

In [16]:
# Load the checkpoint at `modelpath` into a LitModel that wraps the BART model
# and tokenizer created above. learning_rate is a required constructor argument;
# inside LitModel it is only read by configure_optimizers.
model_loaded = LitModel.load_from_checkpoint(modelpath , learning_rate = 2e-5, 
                                             tokenizer = tokenizer, model = bart_model, hparams = hparams)

### Selecting best Facets for clarification

In [17]:
# CrossEncoder instances keyed by model path, so the model is read from disk
# once per path instead of once per call.
_CROSS_ENCODER_CACHE = {}


def _load_cross_encoder(model_path):
    """Return the CrossEncoder for ``model_path``, creating it on first use."""
    if model_path not in _CROSS_ENCODER_CACHE:
        _CROSS_ENCODER_CACHE[model_path] = CrossEncoder(model_path)
    return _CROSS_ENCODER_CACHE[model_path]


def getBestFascets(clariQ, fascets, threshold = 0.75):
    """Score every facet against the clarifying question and pick the best ones.

    Args:
        clariQ: Clarifying question text.
        fascets: Iterable of facet strings.
        threshold: A facet is selected when its score is strictly greater
            than this value.

    Returns:
        Two-element list ``[best, all_scored]``:
        ``best`` is the selected facets joined by ``' , '``;
        ``all_scored`` lists every facet, sorted by descending score, as
        ``"<facet> (<score with 3 decimals>), "``.
        Both strings are empty when no facets are given (this case used to
        raise instead).
    """
    fascets = list(fascets)
    if not fascets:
        return ['', '']

    data = [[clariQ, fascet] for fascet in fascets]

    # Use CrossEncoder prediction (model path comes from the parameters cell)
    scores = _load_cross_encoder(CEmodel_path).predict(data)

    # Column 0 holds the question, column 1 the facet, 'scores' the prediction
    df = pd.DataFrame(data)
    df['scores'] = pd.DataFrame(scores)
    df = df.sort_values(by=['scores'], ascending=False)

    # String with all facets and their scores (every entry ends with ", ")
    FascetStr = "".join(
        "{} ({:.3f}), ".format(fascet, score)
        for fascet, score in zip(df[1], df['scores'])
    )

    # Select the facets scoring above the threshold
    fascet_list = df[df.scores > threshold][1].to_list()

    return [(' , '.join(fascet_list)), FascetStr]

### Facet Preprocessing

In [19]:
def preprocessingFascets(fascets, query, lemmatizer=None):
    """Clean up a list of extracted facets.

    Steps, applied to every facet:
      1. lemmatize the facet,
      2. drop it if it equals the query (case-insensitive),
      3. remove the lower-cased query text from it and strip whitespace.
    Afterwards duplicates and empty strings are removed.

    Args:
        fascets: List of facet strings. The list itself is not modified.
        query: The search query.
        lemmatizer: Optional object with a ``lemmatize(word)`` method;
            defaults to a new ``nltk`` ``WordNetLemmatizer``.

    Returns:
        New list of cleaned facets in order of first occurrence.
    """
    if lemmatizer is None:
        lemmatizer = nltk.wordnet.WordNetLemmatizer()

    query_lower = str(query).lower()

    cleaned = []
    # Build a new list instead of calling remove() on the list being iterated:
    # removing during iteration skips the element that follows each removal.
    for facet in fascets:
        facet = lemmatizer.lemmatize(facet)

        # remove query as facet from facets
        if str(facet).lower() == query_lower:
            continue

        # remove query from facet
        cleaned.append(facet.replace(query_lower, "").strip())

    # dict.fromkeys de-duplicates like set() but keeps a deterministic order,
    # which matters because callers slice the first N facets.
    return [facet for facet in dict.fromkeys(cleaned) if facet]

### API

In [20]:
class PipelineApi:
    """Flask application serving the clarifying-question pipeline at ``/extract``.

    For each request the pipeline runs: web search -> snippet batches ->
    facet extraction and preprocessing -> clarifying-question generation ->
    facet ranking.
    """

    # Minimum spacing between two web searches, in seconds. According to the
    # original author, DuckDuckGo blocks the connection when more than one
    # request is sent within 13 seconds.
    MIN_REQUEST_INTERVAL_S = 13

    def __init__(self):
        import threading  # local import: only needed for the rate-limit lock

        self.app = Flask(__name__)
        self.cors = CORS(self.app)
        # Makes the check-and-update of the next free slot atomic in case
        # requests are handled concurrently.
        self._rate_lock = threading.Lock()

    def _reserve_slot(self):
        """Reserve the next search slot and return the seconds to wait for it.

        Returns 0.0 when the slot is free right now. In both cases the next
        free slot (``self.app.task_nextExec_time``) moves
        ``MIN_REQUEST_INTERVAL_S`` seconds further into the future.
        """
        interval = timedelta(seconds=self.MIN_REQUEST_INTERVAL_S)
        with self._rate_lock:
            now = datetime.now()
            next_free = getattr(self.app, 'task_nextExec_time', now)
            if next_free > now:
                self.app.task_nextExec_time = next_free + interval
                return (next_free - now).total_seconds()
            self.app.task_nextExec_time = now + interval
            return 0.0

    def build_endpoints(self):
        """Create the search and extraction components and register ``/extract``."""
        #Init Websearch unit
        WSFramework = Quin()
        #Init facet extraction
        extractor = Faspect()

        @self.app.route("/extract", methods=["GET", "POST"])
        @cross_origin()
        def search_endpoint():
            # Accept the query from a JSON body or, as a fallback, from the
            # URL query string; answer 400 instead of failing on a missing one.
            params = request.get_json(silent=True)
            if not isinstance(params, dict):
                params = {}
            query = params.get("query") or request.args.get("query")
            if not query:
                return json.dumps({"error": "missing 'query' parameter"}), 400
            query = str(query)

            #Loadbalancing, if multiple requests only allow one at a time
            timeToWait = self._reserve_slot()
            if timeToWait > 0:
                print("Call '" + query + "' waitingtime: " + str(timeToWait))
                time.sleep(timeToWait)

            print("Call '" + query + "' is now processing..." )

            #If more than 1 request per 13 seconds duckduckgo freezes -> open website in browser and search something!!
            res = WSFramework.search_web_evidence(query)
            if len(res) == 0:
                #if there are too many requests, the connection is blocked. To unblock it usually helps to wait a little and/or send a proper search request
                webbrowser.open_new('https://duckduckgo.com/?q=this+is+errorhandling.+Please+just+close+that+tab+and+have+a+great+day!&t=h_&ia=web')
                res = WSFramework.search_web_evidence(query)

            #Response to snippet-batch conversion
            snippets = constructSnippets(res)
            #Run facet extraction and facet preprocessing per batch
            results = []
            for snippet in snippets:
                resRaw = extractor.extract_facets(query, snippet, classification_threshold=0.05)
                results.extend(preprocessingFascets(resRaw, query))
            #Second pass removes duplicates across batches
            resultsFinal = preprocessingFascets(results, query)

            #Generate clarifying question
            seed_line = query + " | " + (' , '.join(resultsFinal[0:fascetCount4Generation]))
            clariQ = generate_prediction(seed_line = seed_line, model_ = model_loaded)

            #Extract best facets; without any facet there is nothing to rank
            if resultsFinal:
                Fascets = getBestFascets(clariQ[0], resultsFinal, fascetThreshold)
            else:
                Fascets = ['', '']

            #Construct Response
            response_dict = {
                "query": query,
                "snippets": snippets,
                "fascets": Fascets[1],
                "bestFascets": Fascets[0],
                "clariQ": clariQ
            }

            response = json.dumps(response_dict)
            print("Call '" + query + "' finished!" )
            return response

    def serve(self, port=80):
        """Register the endpoints and run the Flask server on all interfaces.

        Args:
            port: TCP port to listen on.
        """
        self.build_endpoints()
        self.app.task_nextExec_time = datetime.now()
        self.app.run(host='0.0.0.0', port=port)

### Start API

In [1]:
# Keep the instance under its own name: rebinding `PipelineApi` to an instance
# shadows the class, so this cell could not be run a second time.
pipeline_api = PipelineApi()
pipeline_api.serve(port=7789)

NameError: name 'PipelineApi' is not defined