In [2]:
!pip install transformers
!pip install transformers[sentencepiece]



In [3]:
import torch
import numpy as np
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class BertSquad(object):

    def __init__(self):

        self.USE_SUMMARY = False

        self.QA_MODEL = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') # where are the models
        self.QA_TOKENIZER = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
        self.torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.QA_MODEL.to(self.torch_device)
        self.QA_MODEL.eval()


        self.MNLI_MODEL = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
        self.MNLI_TOKENIZER = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')


        if self.USE_SUMMARY:
            self.SUMMARY_TOKENIZER = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
            self.SUMMARY_MODEL = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
            self.SUMMARY_MODEL.to(self.torch_device)
            self.SUMMARY_MODEL.eval()

        
    def reconstructText(self,tokens, start=0, stop=-1):
        tokens = tokens[start: stop]
        if '[SEP]' in tokens:
            sepind = tokens.index('[SEP]')
            tokens = tokens[sepind+1:]
        txt = ' '.join(tokens)
        txt = txt.replace(' ##', '')
        txt = txt.replace('##', '')
        txt = txt.strip()
        txt = " ".join(txt.split())
        txt = txt.replace(' .', '.')
        txt = txt.replace(' . ', '.')
        txt = txt.replace('( ', '(')
        txt = txt.replace(' )', ')')
        txt = txt.replace(' - ', '-')
        txt_list = txt.split(' , ')
        txt = ''
        nTxtL = len(txt_list)
        if nTxtL == 1:
            return txt_list[0]
        newList =[]
        for i,t in enumerate(txt_list):
            if i < nTxtL -1:
                if t[-1].isdigit() and txt_list[i+1][0].isdigit():
                    newList += [t,',']
                else:
                    newList += [t, ', ']
            else:
                newList += [t]
        return ''.join(newList)


    def makeBERTSQuADPrediction(self,document, question):
        ## we need to rewrite this function so that it chuncks the document into 250-300 word segments with
        ## 50 word overlaps on either end so that it can understand and check longer abstracts
        nWords = len(document.split())
        input_ids_all = self.QA_TOKENIZER.encode(question, document)
        tokens_all = self.QA_TOKENIZER.convert_ids_to_tokens(input_ids_all)
        overlapFac = 1.1
        # print(input_ids_all)

        if len(input_ids_all)*overlapFac > 2048:
            nSearchWords = int(np.ceil(nWords/5))
            quarter = int(np.ceil(nWords/4))
            docSplit = document.split()
            docPieces = [' '.join(docSplit[:int(nSearchWords*overlapFac)]), 
                        ' '.join(docSplit[quarter-int(nSearchWords*overlapFac/2):quarter+int(quarter*overlapFac/2)]),
                        ' '.join(docSplit[quarter*2-int(nSearchWords*overlapFac/2):quarter*2+int(quarter*overlapFac/2)]),
                        ' '.join(docSplit[quarter*3-int(nSearchWords*overlapFac/2):quarter*3+int(quarter*overlapFac/2)]),
                        ' '.join(docSplit[-int(nSearchWords*overlapFac):])]
            input_ids = [self.QA_TOKENIZER.encode(question, dp) for dp in docPieces]        
            
        elif len(input_ids_all)*overlapFac > 1536:
            nSearchWords = int(np.ceil(nWords/4))
            third = int(np.ceil(nWords/3))
            docSplit = document.split()
            docPieces = [' '.join(docSplit[:int(nSearchWords*overlapFac)]), 
                        ' '.join(docSplit[third-int(nSearchWords*overlapFac/2):third+int(nSearchWords*overlapFac/2)]),
                        ' '.join(docSplit[third*2-int(nSearchWords*overlapFac/2):third*2+int(nSearchWords*overlapFac/2)]),
                        ' '.join(docSplit[-int(nSearchWords*overlapFac):])]
            input_ids = [self.QA_TOKENIZER.encode(question, dp) for dp in docPieces]        
            
        elif len(input_ids_all)*overlapFac > 1024:
            nSearchWords = int(np.ceil(nWords/3))
            middle = int(np.ceil(nWords/2))
            docSplit = document.split()
            docPieces = [' '.join(docSplit[:int(nSearchWords*overlapFac)]), 
                        ' '.join(docSplit[middle-int(nSearchWords*overlapFac/2):middle+int(nSearchWords*overlapFac/2)]),
                        ' '.join(docSplit[-int(nSearchWords*overlapFac):])]
            input_ids = [self.QA_TOKENIZER.encode(question, dp) for dp in docPieces]
        
        elif len(input_ids_all)*overlapFac > 512:
            nSearchWords = int(np.ceil(nWords/2))
            docSplit = document.split()
            docPieces = [' '.join(docSplit[:int(nSearchWords*overlapFac)]), ' '.join(docSplit[-int(nSearchWords*overlapFac):])]
            input_ids = [self.QA_TOKENIZER.encode(question, dp) for dp in docPieces]
        else:
            input_ids = [input_ids_all]
        absTooLong = False    
        
        answers = []
        cons = []

        for iptIds in input_ids:
            tokens = self.QA_TOKENIZER.convert_ids_to_tokens(iptIds)
            sep_index = iptIds.index(self.QA_TOKENIZER.sep_token_id)
            num_seg_a = sep_index + 1
            num_seg_b = len(iptIds) - num_seg_a
            segment_ids = [0]*num_seg_a + [1]*num_seg_b
            assert len(segment_ids) == len(iptIds)
            n_ids = len(segment_ids)

            if n_ids < 512:
                res = self.QA_MODEL(torch.tensor([iptIds]).to(self.torch_device), 
                                        token_type_ids=torch.tensor([segment_ids]).to(self.torch_device))
                
                start_scores = res.start_logits
                end_scores = res.end_logits

              
            else:
                
                print('****** warning only considering first 512 tokens, document is '+str(nWords)+' words long.  There are '+str(n_ids)+ ' tokens')
                absTooLong = True
                res = self.QA_MODEL(torch.tensor([iptIds[:512]]).to(self.torch_device), 
                                        token_type_ids=torch.tensor([segment_ids[:512]]).to(self.torch_device),)
                start_scores = res.start_logits
                end_scores = res.end_logits
                
            
            start_scores = start_scores[:,1:-1]
            end_scores = end_scores[:,1:-1]
            answer_start = torch.argmax(start_scores)
            answer_end = torch.argmax(end_scores)
           
            answer = self.reconstructText(tokens, answer_start, answer_end+4)
        
            if answer.startswith('.') or answer.startswith(','):
                answer = answer[2:]
                
            c = start_scores[0,answer_start].item()+end_scores[0,answer_end].item()
            answers.append(answer)
            cons.append(c)
        
        maxC = max(cons)
        iMaxC = [i for i, j in enumerate(cons) if j == maxC][0]
        confidence = cons[iMaxC]
        answer = answers[iMaxC]
        
        sep_index = tokens_all.index('[SEP]')
        full_txt_tokens = tokens_all[sep_index+1:]
        
        abs_returned = self.reconstructText(full_txt_tokens)

        ans={}
        ans['answer'] = answer
        
        if answer.startswith('[CLS]') or answer_end.item() < sep_index or answer.endswith('[SEP]'):
            ans['confidence'] = -1000000
        else:
            ans['confidence'] = confidence
        
        ans['abstract_bert'] = abs_returned
        ans['abs_too_long'] = absTooLong
        
        
        # mnli_response = self.get_mnli_response(context,question)
        # ans.update(mnli_response)
        return ans

    def get_mnli_response(self,document,question):
        result = {}

        premise = document
        hypothesis = question

        # run through model pre-trained on MNLI
        x = self.MNLI_TOKENIZER.encode(premise, hypothesis, return_tensors='pt')
        logits = self.MNLI_MODEL(x.to(self.torch_device))[0]

        # output = tokenizer.decode(logits)
        labels = ['contradiction','entailment']

        # we throw away "neutral" (dim 1) and take the probability of
        # "entailment" (2) as the probability of the label being true 
        entail_contradiction_logits = logits[:,[0,2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        prob_label_is_true = probs[:,1]
        label = labels[np.argmax(probs[0].tolist())]
        result['label'] = label
        result['mnli_score'] = max(probs[0].tolist())
        return result


    def highlight_answer(self,answer_dict):
        start_idx = answer_dict['abstract_bert'].index(answer_dict['answer'])
        end_idx = start_idx+len(answer_dict['answer'])
        reframe_text = answer_dict['abstract_bert'][:start_idx-1] + "  <font color='#1E90FF'> <b>" + answer_dict['abstract_bert'][start_idx:end_idx] + " </b></font> " + answer_dict['abstract_bert'][end_idx+1 : ]
        return reframe_text

    # def show_query(self,query):
    #         """HTML print format for the searched query"""
    #         return HTML('<br/><div font-size: 20px;'
    #                     'padding-bottom:12px"><b>Query</b>: ' + query + '</div>')



    def get_ranked_answers(self,question,hit_dictionary):
        answers_list = []
        
        for doc in hit_dictionary:
            text = doc['values']
            ans = self.makeBERTSQuADPrediction(text,question)
            
            if ans['answer'] != '':
                ans.update({'source' : doc['source']})
                ans.update({'hl_answer' : self.highlight_answer(ans)})
                # Creating an weighted Average
                norm_score = (0.65*ans['confidence']+ 0.45*doc['confidence'] )/2
                ans.update({'norm_score':norm_score})
                answers_list.append(ans)
        
        answers_list = sorted(answers_list, key = lambda i: i['norm_score'],reverse=True)
        return answers_list

In [4]:
bq = BertSquad()

In [5]:
question = 'Which enterprise size segment led the advanced analytics market?'
hit_dictionary = [
  {
    "Account Name": "Plantpick",
    "Opportunity Name": "Plantpick - Smartphone consumer review analysis - Phase#1",
    "Opportunity Owner": "Jitendra Jethanandani",
    "P&L BU": "Enterprise Tech",
    "Account Type": "D",
    "Stage": "Closed Won",
    "Bi2i Industry": "HiTech Enterprise",
    "Bi2i Sub-Industry": "B2B Tech",
    "Created Date": "4/24/2019",
    "Last Stage Change Date": "5/15/2019",
    "Probability (%)": 100,
    "Sum of Amount": 61800,
    "Sum of Expected Revenue": 61800,
    "values": "Account Name Plantpick Opportunity Name Plantpick - Smartphone consumer review analysis - Phase#1 P&L BU Enterprise Tech Account Type D Bi2i Industry HiTech Enterprise Bi2i Sub-Industry B2B Tech Sum of Amount 61800.0 Sum of Expected Revenue 61800.0",
    "source": "Dataset",
    "confidence": 7.7573066
  },
  {
    "source": "PDF documents",
    "creationdate": "11/16/21, 12:54 PM ",
    "topichead": "Global Advanced Analytics Market Size Report, 2021-2028",
    "contentpage": "2/9",
    "urls": [
      "https://www.grandviewresearch.com/industry-analysis/artificial-intelligence-ai-market),",
      "https://www.grandviewresearch.com/industry-analysis/advanced-analytics-market"
    ],
    "values": "To learn more about this report request a free sample copy industry-analysis advanced-analytics-market request rs2 The volume of unorganized data such as surveillance data corporate emails and consumer data generated from streaming activity for media subscription services such as Amazon Prime Video NetFlix and Hulu has been rising in recent years. Hence the demand for database management tools has also been rising. This has particularly translated into the rising demand for analytics for data management and strategic decision-making. At the same time the continued integration of analytics with Geographical Information System GIS and improvements in location-based services have increased the volumes of geospatial data significantly thereby necessitating the adoption of advanced analytics for optimal management of the data. Several businesses are adopting progressive analytics solutions which can potentially utilize geospatial data to locate clients on a map and draft new strategies accordingly. The growing adoption of advanced analytics in predicting and forecasting trading patterns electricity consumption patterns and traffic conditions is propelling the growth of the market. The use of analytics in demand forecasting allows organizations to make data-based informed decisions and augment profitability. Apart from incumbents of various industries and industry verticals such as manufacturing banking and professional services government agencies are also investing aggressively in big data analytics thereby driving the demand for advanced analytics. The rising demand for IoT-based predictive solutions which can potentially help in collecting and analyzing data from sensors including temperature sensors air quality sensors and motion sensors among others within the IoT network is expected to boost the adoption of advanced analytical tools over the forecast period. Type Insights The big data analytics segment accounted for the largest share of around 30% in 2020 owing to the increasing popularity of social media and the rise in the number of virtual or digital offices that produce large volumes of data. Information management is emerging as an area where big data analytics can have a significant impact on business processes and productivity. The growing adoption of big data analytics across businesses for enhancing operational efficiency and strengthening market strategies is expected to drive the growth of the segment. The customer analytics segment is expected to grow significantly over the forecast period in line with the increasing demand for improved lead management customer retention and customer experience management. The strong emphasis businesses are putting on understanding the customers buying patterns and behavior and accordingly offering a highly customizable customer experience is driving the demand for customer analytics. Continued introduction of business process automation advances in the latest technologies such as machine learning and artificial intelligence  www.grandviewresearch.com industry-analysis artificial-intelligence-ai-market and the integration of these technologies into customer analytics are expected to contribute to the growth of the segment. Efforts being pursued by the incumbents of the retail industry to offer an omnichannel experience to their customers and the growing adoption of customer analytics by large retailers to roll out marketing programs and personalized communications also bode well for the growth of the segment. This site uses cookies to improve user",
    "confidence": 2.464856
  },
  {
    "Account Name": "Cellular Curvaceous",
    "Opportunity Name": "Cellular CurvaceousFPA",
    "Opportunity Owner": "Mukesh Saharan",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Solutioning",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": 43414,
    "Last Stage Change Date": "(blank)",
    "Probability (%)": 70,
    "Sum of Amount": 80000,
    "Sum of Expected Revenue": 56000,
    "values": "Account Name Cellular Curvaceous Opportunity Name Cellular CurvaceousFPA P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 80000.0 Sum of Expected Revenue 56000.0",
    "source": "Dataset",
    "confidence": 2.2711976
  },
  {
    "Account Name": "Sigmasocial",
    "Opportunity Name": "Sigmasocial - Commodity Forecasting",
    "Opportunity Owner": "Arun Krishnamoorthy",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Closed Won",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": "3/19/2019",
    "Last Stage Change Date": "9/29/2019",
    "Probability (%)": 100,
    "Sum of Amount": 48387,
    "Sum of Expected Revenue": 48387,
    "values": "Account Name Sigmasocial Opportunity Name Sigmasocial - Commodity Forecasting P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 48387.0 Sum of Expected Revenue 48387.0",
    "source": "Dataset",
    "confidence": 2.2711976
  },
  {
    "Account Name": "Sigmasocial",
    "Opportunity Name": "Sigmasocial - Jarvis USA",
    "Opportunity Owner": "Sonal Gupta",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Closed Won",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": "11/21/2019",
    "Last Stage Change Date": 43781,
    "Probability (%)": 100,
    "Sum of Amount": 34650,
    "Sum of Expected Revenue": 34650,
    "values": "Account Name Sigmasocial Opportunity Name Sigmasocial - Jarvis USA P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 34650.0 Sum of Expected Revenue 34650.0",
    "source": "Dataset",
    "confidence": 2.2711976
  },
  {
    "Account Name": "Sigmasocial",
    "Opportunity Name": "Sigmasocial Jobby MDM",
    "Opportunity Owner": "Sonal Gupta",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Negotiation/Review",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": 43716,
    "Last Stage Change Date": "10/25/2021",
    "Probability (%)": 90,
    "Sum of Amount": 15000,
    "Sum of Expected Revenue": 13500,
    "values": "Account Name Sigmasocial Opportunity Name Sigmasocial Jobby MDM P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 15000.0 Sum of Expected Revenue 13500.0",
    "source": "Dataset",
    "confidence": 2.2711976
  },
  {
    "Account Name": "Sigmasocial",
    "Opportunity Name": "Sigmasocial - Jarvis India Support",
    "Opportunity Owner": "Sonal Gupta",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Closed Won",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": "9/30/2019",
    "Last Stage Change Date": "11/18/2019",
    "Probability (%)": 100,
    "Sum of Amount": 72300,
    "Sum of Expected Revenue": 72300,
    "values": "Account Name Sigmasocial Opportunity Name Sigmasocial - Jarvis India Support P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 72300.0 Sum of Expected Revenue 72300.0",
    "source": "Dataset",
    "confidence": 2.2609577
  },
  {
    "Account Name": "Sigmasocial",
    "Opportunity Name": "Sigmasocial - Jarvis Phase 2",
    "Opportunity Owner": "Sonal Gupta",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Closed Won",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": "3/14/2019",
    "Last Stage Change Date": 43472,
    "Probability (%)": 100,
    "Sum of Amount": 75000,
    "Sum of Expected Revenue": 75000,
    "values": "Account Name Sigmasocial Opportunity Name Sigmasocial - Jarvis Phase 2 P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 75000.0 Sum of Expected Revenue 75000.0",
    "source": "Dataset",
    "confidence": 2.2609577
  },
  {
    "Account Name": "Sigmasocial",
    "Opportunity Name": "Sigmasocial Finance Control Watchtower",
    "Opportunity Owner": "Sonal Gupta",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Closed Won",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": "3/19/2019",
    "Last Stage Change Date": "9/29/2019",
    "Probability (%)": 100,
    "Sum of Amount": 15714,
    "Sum of Expected Revenue": 15714,
    "values": "Account Name Sigmasocial Opportunity Name Sigmasocial Finance Control Watchtower P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 15714.0 Sum of Expected Revenue 15714.0",
    "source": "Dataset",
    "confidence": 2.2609577
  },
  {
    "Account Name": "Cellular Curvaceous",
    "Opportunity Name": "Cellular CurvaceousAgent performance Assessment",
    "Opportunity Owner": "Mukesh Saharan",
    "P&L BU": "Consumer",
    "Account Type": "A",
    "Stage": "Closed Won",
    "Bi2i Industry": "Consumer",
    "Bi2i Sub-Industry": "CPG",
    "Created Date": "11/13/2018",
    "Last Stage Change Date": 43525,
    "Probability (%)": 100,
    "Sum of Amount": 117222,
    "Sum of Expected Revenue": 117222,
    "values": "Account Name Cellular Curvaceous Opportunity Name Cellular CurvaceousAgent performance Assessment P&L BU Consumer Account Type A Bi2i Industry Consumer Bi2i Sub-Industry CPG Sum of Amount 117222.0 Sum of Expected Revenue 117222.0",
    "source": "Dataset",
    "confidence": 2.2508814
  }
]

In [6]:
ranked_answers = bq.get_ranked_answers(question,hit_dictionary)

Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors


In [7]:
ranked_answers

[{'abs_too_long': False,
  'abstract_bert': 'account name sigmasocial opportunity name sigmasocial finance control watchtower p & l bu consumer account type a bi2i industry consumer bi2i sub-industry cpg sum of amount 15714. 0 sum of expected revenue 15714. 0',
  'answer': 'bu consumer account type',
  'confidence': 7.516219139099121,
  'hl_answer': "account name sigmasocial opportunity name sigmasocial finance control watchtower p & l  <font color='#1E90FF'> <b>bu consumer account type </b></font> a bi2i industry consumer bi2i sub-industry cpg sum of amount 15714. 0 sum of expected revenue 15714. 0",
  'norm_score': 2.9514867027072142,
  'source': 'Dataset'},
 {'abs_too_long': False,
  'abstract_bert': 'account name sigmasocial opportunity name sigmasocial jobby mdm p & l bu consumer account type a bi2i industry consumer bi2i sub-industry cpg sum of amount 15000. 0 sum of expected revenue 13500. 0',
  'answer': 'bu consumer account type',
  'confidence': 7.273857593536377,
  'hl_answe

In [15]:


def find(s, ch):
    return [i for i, ltr in enumerate(s) if ltr == ch]

def get_summarization_input(ranked_answers):
    for doc in ranked_answers:
            all_context = []
            answer_start_idx = doc['abstract_bert'].index(doc['answer'])
            text = doc['abstract_bert']
            if doc['source'] == 'PDF documents':
                #print(text)
                sentance_start_idx_list = find(text,'.')
                sentance_start_idx_list = sentance_start_idx_list + [answer_start_idx]
                sentance_start_idx_list = sorted(sentance_start_idx_list)
                sentance_start_idx = sentance_start_idx_list[find(sentance_start_idx_list,answer_start_idx)[0] - 1]
                sentance_end_idx = sentance_start_idx_list [find(sentance_start_idx_list,answer_start_idx)[0] + 1]
                if sentance_start_idx < 0 :
                    sentance_start_idx = 0     
                    context = text[sentance_start_idx:sentance_end_idx]
                    all_context.append(context)
                else:
                    context = text[sentance_start_idx:sentance_end_idx]
                    all_context.append(context)
            else:
                pass


    summarization_input = ''.join(all_context)   
    return summarization_input


summarization_input = get_summarization_input(ranked_answers)

In [16]:
summarization_input

'. the rising demand for iot-based predictive solutions which can potentially help in collecting and analyzing data from sensors including temperature sensors air quality sensors and motion sensors among others within the iot network is expected to boost the adoption of advanced analytical tools over the forecast period. type insights the big data analytics segment accounted for the largest share of around 30 % in 2020 owing to the increasing popularity of social media and the rise in the number of virtual or digital offices that produce large volumes of data. information management is emerging as an area where big data analytics can have a significant impact on business processes and productivity'

In [8]:
def find(s, ch):
    return [i for i, ltr in enumerate(s) if ltr == ch]



def get_summarization_input(self,ranked_answers):
    for doc in ranked_answers:
            all_context = []
            answer_start_idx = doc['abstract_bert'].index(doc['answer'])
            text = doc['abstract_bert']
            if doc['source'] == 'PDF documents':
                #print(text)
                sentance_start_idx_list = find(text,'.')
                sentance_start_idx_list = sentance_start_idx_list + [answer_start_idx]
                sentance_start_idx_list = sorted(sentance_start_idx_list)
                sentance_start_idx = sentance_start_idx_list[find(sentance_start_idx_list,answer_start_idx)[0] - 2]
                sentance_end_idx = sentance_start_idx_list [find(sentance_start_idx_list,answer_start_idx)[0] + 2]
                if sentance_start_idx < 0 :
                    sentance_start_idx = 0     
                    context = text[sentance_start_idx:sentance_end_idx]
                    all_context.append(context)
                else:
                    context = text[sentance_start_idx:sentance_end_idx]
                    all_context.append(context)
            else:
                pass


    summarization_input = ''.join(all_context)   
    return summarization_input

In [18]:
import torch
import numpy as np
from transformers import PegasusTokenizer, PegasusForConditionalGeneration


torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Summarizer(object):
    
    def __init__(self):
        self.SUMMARY_MODEL_NAME = "human-centered-summarization/financial-summarization-pegasus"
        self.SUMMARY_TOKENIZER = PegasusTokenizer.from_pretrained(self.SUMMARY_MODEL_NAME)
        self.SUMMARY_MODEL = PegasusForConditionalGeneration.from_pretrained(self.SUMMARY_MODEL_NAME) 
        self.SUMMARY_MODEL.to(torch_device)

    def find(self,s, ch):
        return [i for i, ltr in enumerate(s) if ltr == ch]


    def get_summarization_input(self,ranked_answers):
        for doc in ranked_answers:
                all_context = []
                answer_start_idx = doc['abstract_bert'].index(doc['answer'])
                text = doc['abstract_bert']
                if doc['source'] == 'PDF documents':
                    #print(text)
                    sentance_start_idx_list = find(text,'.')
                    sentance_start_idx_list = sentance_start_idx_list + [answer_start_idx]
                    sentance_start_idx_list = sorted(sentance_start_idx_list)
                    sentance_start_idx = sentance_start_idx_list[find(sentance_start_idx_list,answer_start_idx)[0] - 2]
                    sentance_end_idx = sentance_start_idx_list [find(sentance_start_idx_list,answer_start_idx)[0] + 2]
                    if sentance_start_idx < 0 :
                        sentance_start_idx = 0     
                        context = text[sentance_start_idx:sentance_end_idx]
                        all_context.append(context)
                    else:
                        context = text[sentance_start_idx:sentance_end_idx]
                        all_context.append(context)
                else:
                    pass


        summarization_input = ''.join(all_context)   
        return summarization_input

    
    def get_summary(self,summarization_input):
        # self.USE_SUMMARY = True
        summary_result = {}

        input_ids = self.SUMMARY_TOKENIZER(summarization_input, return_tensors="pt", max_length=1024,truncation=True).input_ids

        # Generate the output (Here, we use beam search but you can also use any other strategy you like)
        output = self.SUMMARY_MODEL.generate(
            input_ids, 
            max_length=64, 
            num_beams=10, 
            early_stopping=True,
            no_repeat_ngram_size=4,
            length_penalty=1.6
        )

        summarized_output = self.SUMMARY_TOKENIZER.decode(output[0], skip_special_tokens=True)
        summarize_HTML = '<div style="font-size:12px;color:#CCCC00"><b>Pegasus Financial Summary:</b>: '+ summarized_output +'</div>'
        warning_HTML = '<div style="font-size:12px;padding-bottom:12px;color:#CCCC00;margin-top:1px"> Warning: This is an autogenerated summary based on semantic search of abstracts, please examine the results before accepting this conclusion. There may be scenarios in which the summary will not be able to clearly answer the question.</div>'

        summary_result['summary'] = summarized_output
        summary_result['summary_html'] = summarize_HTML
        summary_result['warning_HTML'] = warning_HTML

        return summary_result




In [19]:
summarizer =  Summarizer()

In [20]:
summarizer.get_summary(summarization_input)

{'summary': 'type insights the big data analytics segment accounted for the largest share of around 30 % in 2020.',
 'summary_html': '<div style="font-size:12px;color:#CCCC00"><b>Pegasus Financial Summary:</b>: type insights the big data analytics segment accounted for the largest share of around 30 % in 2020.</div>',

In [34]:
# allAnswersTxt = ' '.join(ranked_aswers[:6]).replace('\n','')
torch_device= 'cpu'

SUMMARY_TOKENIZER = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
SUMMARY_MODEL = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
SUMMARY_MODEL.to(torch_device)
SUMMARY_MODEL.eval()


answers_input_ids =SUMMARY_TOKENIZER.batch_encode_plus([summarization_input], return_tensors='pt', max_length=1024,truncation=True)['input_ids'].to(torch_device)
summary_ids = SUMMARY_MODEL.generate(answers_input_ids, num_beams=10, length_penalty=1.2, max_length=1024, min_length=64, no_repeat_ngram_size=4)

exec_sum = SUMMARY_TOKENIZER.decode(summary_ids.squeeze(), skip_special_tokens=True)
execSum_HTML = '<div style="font-size:12px;color:#CCCC00"><b>BART Abstractive Summary:</b>: '+exec_sum+'</div>'
warning_HTML = '<div style="font-size:12px;padding-bottom:12px;color:#CCCC00;margin-top:1px"> Warning: This is an autogenerated summary based on semantic search of abstracts, please examine the results before accepting this conclusion. There may be scenarios in which the summary will not be able to clearly answer the question.</div>'


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [35]:
print(exec_sum)

. type insights the big data analytics segment accounted for the largest share of around 30 % in 2020. owing to the increasing popularity of social media and the rise in the number of virtual or digital offices that produce large volumes of data. In 2020, big data analytics will account for around 30% of the global data analytics market.


In [8]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Let's load the model and the tokenizer 

def get_summary(self,summarization_input):
    # SUMMARY_MODEL_NAME = "human-centered-summarization/financial-summarization-pegasus"
    # SUMMARY_TOKENIZER = PegasusTokenizer.from_pretrained(SUMMARY_MODEL_NAME)
    # SUMMARY_MODEL = PegasusForConditionalGeneration.from_pretrained(SUMMARY_MODEL_NAME) # If you want to use the Tensorflow model 
    #                                                                     # just replace with TFPegasusForConditionalGeneration



    input_ids = tokenizer(summarization_input, return_tensors="pt", max_length=1024,truncation=True).input_ids

    # Generate the output (Here, we use beam search but you can also use any other strategy you like)
    output = model.generate(
        input_ids, 
        max_length=64, 
        num_beams=10, 
        early_stopping=True,
        no_repeat_ngram_size=4,
        length_penalty=1.6
    )

    summarized_output = tokenizer.decode(output[0], skip_special_tokens=True)
    summarized_output_HTML = '<div style="font-size:12px;color:#CCCC00"><b>Pegasus Financial Summary:</b>: '+exec_sum+'</div>'
    warning_HTML = '<div style="font-size:12px;padding-bottom:12px;color:#CCCC00;margin-top:1px"> Warning: This is an autogenerated summary based on semantic search of abstracts, please examine the results before accepting this conclusion. There may be scenarios in which the summary will not be able to clearly answer the question.</div>'


type insights the big data analytics segment accounted for the largest share of around 30 % in 2020.


In [11]:
# output[0]

tensor([    0,   619,  4275,   109,   461,   335,  5832,  5125, 15193,   118,
          109,  1368,   537,   113,   279,   677,  7308,   115, 12573,     1])

In [9]:
input_ids = tokenizer(summarization_input, return_tensors="pt", max_length=1024,truncation=True).input_ids

# Generate the output (Here, we use beam search but you can also use any other strategy you like)
output = model.generate(
    input_ids, 
    max_length=64, 
    num_beams=10, 
    early_stopping=True,
    no_repeat_ngram_size=4,
    length_penalty=1.6
)

summarized_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(summarized_output)

type insights the big data analytics segment accounted for the largest share of around 30 % in 2020.


In [38]:
tokenizer.decode(output[0], skip_special_tokens=True)

Social media analytics segment accounted for the largest share of around 30 % in 2020.
