In [1]:
import sys
import os
import math
import random
import bisect
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib
import subprocess
import re
import tempfile
import itertools
import torch
import spacy
import amrlib
import penman

from typing import List, Tuple
from operator import itemgetter 
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification
# import qgrid

In [2]:
# Repository root: two directory levels above the directory the kernel started in.
# The assignment is guarded so that re-running this cell in the same kernel does not
# climb two *more* levels (os.getcwd() has already been moved by the first run).
if "BASE_DIR" not in globals():
    BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))  # /home/gil/dev/NEBULA2/
# Make the repository root the working directory so the project imports below resolve.
os.chdir(BASE_DIR)

In [3]:
from nebula_api.nebula_enrichment_api import *
from experts.common.RemoteAPIUtility import RemoteAPIUtility
from nebula_api.vlmapi import VLM_API
from nebula_api.atomic2020.comet_enrichment_api import *
from nebula_api.canonisation_api import CANON_API
import nebula_api.playground_api as pg_api

In [None]:
# Instantiate the project helpers used by the cells below. The classes come from the
# nebula_api / experts imports in the previous cell.
# NOTE(review): these constructors presumably connect to remote services or load models;
# confirm what is required before running this cell offline.
nre = NRE_API()
api = RemoteAPIUtility()
vlm = VLM_API()  # compute_batch_scores calls vlm.encode_text(...)
# NOTE(review): `mdmmt` stays commented out, yet compute_concat_score still calls
# mdmmt.encode_text(...), so that function raises NameError until this is restored.
# mdmmt = mdmmt_api.MDMMT_API()
# comet = Comet("/app/NEBULA2/nebula_api/atomic2020/comet-atomic_2020_BART")
ascore = CANON_API()
# amrlib models: "stog" parses sentences to AMR graphs, "gtos" generates sentences from graphs.
# NOTE(review): absolute /app/NEBULA2 paths; they are not derived from BASE_DIR - confirm the
# models exist at this location in the target environment.
stog = amrlib.load_stog_model(model_dir="/app/NEBULA2/models/model_stog")
gtos = amrlib.load_gtos_model(model_dir="/app/NEBULA2/models/model_gtos")
# model_name = "Alireza1044/albert-base-v2-cola" 


# Download cola model
# cola_model = AutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def flatten(lst):
    """Concatenate the items of every inner iterable of ``lst`` into one flat list.

    Only one level of nesting is removed; the inner items themselves are kept as-is.
    """
    return list(itertools.chain.from_iterable(lst))

def compute_batch_scores(video_emb: torch.Tensor, texts: List[str], normalize=True, **kwargs) -> np.ndarray:
    """Score every text against a single video embedding with a dot product.

    Args:
        video_emb: embedding of the video. It is expanded to the shape of the text
            batch, so it is expected to be 1-D with the same width as a text embedding.
        texts: sentences to encode with the module-level ``vlm`` object.
        normalize: when True, both sides are L2-normalised first, which turns the dot
            product into a cosine similarity.
        **kwargs: forwarded unchanged to ``vlm.encode_text``.

    Returns:
        A 1-D numpy array with one score per entry of ``texts`` (callers index it with
        lists of positions, so it must stay an ndarray rather than a Python list).
    """
    emb_batch = vlm.encode_text(texts, **kwargs)
    # The encoder may hand back a sequence of per-text tensors; stack them into (N, D).
    if isinstance(emb_batch, (list, tuple)):
        emb_batch = torch.stack(list(emb_batch), dim=0)
    if normalize:
        video_emb = video_emb / video_emb.norm(2)
        row_norms = (emb_batch * emb_batch).sum(dim=1).sqrt()
        emb_batch = emb_batch / row_norms.unsqueeze(1).expand_as(emb_batch)
    scores = (video_emb.unsqueeze(0).expand_as(emb_batch) * emb_batch).sum(dim=1)
    # detach() first: Tensor.numpy() raises if the tensor is still tracking gradients.
    return scores.detach().cpu().numpy()


def compute_concat_score(image_emb: torch.Tensor, texts: List[str], join_on=',') -> torch.Tensor:
    """Score one image/video embedding against all ``texts`` merged into one sentence.

    Each text is stripped, loses one trailing '.', and is followed by ``join_on`` and a
    space; the concatenation is stripped again before encoding (so the final
    ``join_on`` is kept). Empty or whitespace-only texts are skipped instead of raising
    IndexError on ``t[-1]``.

    Args:
        image_emb: embedding multiplied (``torch.matmul``) with the encoded text.
        texts: sentences to merge.
        join_on: separator appended after every sentence.

    Returns:
        The ``torch.matmul`` product of ``image_emb`` and the encoded combined text.
    """
    combined_text = ""
    for t in (x.strip() for x in texts):
        if not t:
            continue
        if t.endswith('.'):
            t = t[:-1]
        combined_text += t + join_on + ' '
    print("Combined: "+combined_text)
    # NOTE(review): `mdmmt` is not defined anywhere in the visible notebook (its
    # construction is commented out), so this line raises NameError until it is restored.
    return torch.matmul(image_emb, mdmmt.encode_text(combined_text.strip()))

In [None]:
def transform_concept(c):
    """Strip a trailing numeric suffix from a purely alphabetic concept label.

    ``"go-01"`` and ``"go01"`` both become ``"go"``; a label that is not
    "letters, optional '-', optional digits" (for example ``"have-rel-role-91"``)
    is returned unchanged.

    The character class is ``[a-zA-Z]``; the previous ``[a-zA-z]`` also matched the
    punctuation between 'Z' and 'a' in ASCII, so e.g. ``"x^2"`` was cut to ``"x^"``.
    """
    exp = re.compile(r"^([a-zA-Z]+)-?(\d*)$")
    r = exp.match(c)
    return r.group(1) if r else c

class ConceptManager:
    """Thin wrapper that exposes concept normalisation as ``ground_concept``."""

    def __init__(self):
        pass

    @staticmethod
    def ground_concept(concept):
        """Return ``transform_concept(concept)``.

        Declared as a staticmethod: the original definition had no ``self`` parameter,
        so it only worked as ``ConceptManager.ground_concept(x)`` and raised TypeError
        when called on an instance. Both call styles work now.
        """
        return transform_concept(concept)

In [4]:
class SimilarityManager:
    """Similarity scores between a concept and the tokens of one or more texts, via spaCy."""

    def __init__(self):
        # Loading the pipeline happens on every construction; reuse instances where possible.
        self.nlp = spacy.load('en_core_web_lg')

    def similarity(self, c1, c2):
        """Compare ``c1`` with every token of ``c2``.

        Args:
            c1: text processed as a whole by the pipeline.
            c2: a single string, or an iterable of strings (list, tuple, ...) that is
                joined with spaces into one text before processing.

        Returns:
            A list with one ``similarity`` value per token of the joined ``c2`` text,
            in token order (not one value per element of ``c2``).
        """
        # Only a bare string needs wrapping; any other iterable of strings is joined
        # directly (the old ``type(c2) is not list`` check broke tuples).
        if isinstance(c2, str):
            c2 = [c2]
        a = self.nlp(c1)
        targets = self.nlp(' '.join(c2))
        return [a.similarity(x) for x in targets]


# Shared instance for interactive use; constructing it loads spaCy's 'en_core_web_lg'.
# Note that SubsetOptimization builds its own SimilarityManager rather than using this one.
smanager = SimilarityManager()


In [6]:
# Sanity check: similarity() returns one score per token of the sentence; the recorded
# output has 8 values for the 8 words below.
# NOTE(review): "marth" looks like a typo for "march"; it is left as-is because the
# recorded output was produced with this exact text.
smanager.similarity("woman", "the women went on a marth with flags")

[0.34148266953994577,
 0.7178931365020378,
 0.34316043123950124,
 0.24771382812948767,
 0.43955152647925716,
 -0.08777517374751317,
 0.34892602136745754,
 0.11595497055245296]

In [7]:
def softmax(x):
    """Softmax of ``x``, normalised along the first axis.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but prevents ``exp`` from overflowing to inf (and the
    ratio to NaN) for large scores. An empty input yields an empty array.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    exps = np.exp(x - np.max(x))
    return exps / exps.sum(axis=0)

class SubsetOptimization:
    """Simulated-annealing search for a small subset of candidate sentences.

    A *state* is a sorted list of indices into ``candidates_strings``. Its cost is
    ``-(coverage + theta * similarity)`` where

    * coverage is the mean, over experts, of the best concept/sentence similarity
      achieved by any sentence in the state (``coverage_matrix``), and
    * similarity is the mean video/sentence score of the state's sentences
      (``candidates_similarity``, from ``compute_batch_scores``).

    Lower cost is better. ``opt_results`` records ``-cost`` of the current state after
    every annealing step.
    """

    def __init__(self, video_emb, experts: List, candidates_strings: List[str]):
        """Pre-compute video/sentence scores and the expert-by-candidate coverage matrix.

        Args:
            video_emb: video embedding handed to ``compute_batch_scores``.
            experts: concept texts; each one is compared with every candidate sentence.
            candidates_strings: candidate sentences to choose from.
        """
        self.video_emb = video_emb
        # Annealing schedule: temperature goes from initial_temp down to final_temp in
        # steps of alpha.
        self.initial_temp = 10
        self.final_temp = .05
        self.alpha = 0.01
        # Weight of the video/sentence similarity relative to expert coverage.
        self.theta = 0.5
        self.experts = experts
        self.candidates_strings = candidates_strings
        self.candidates_similarity = compute_batch_scores(self.video_emb, self.candidates_strings)
        self.opt_results = []
        self.smanager = SimilarityManager()

        # coverage_matrix[i][j]: how well candidate j covers expert i. NaN marks "not
        # computed yet" (see get_coverage); every entry is filled right away here.
        self.coverage_matrix = np.full([len(self.experts), len(self.candidates_strings)], np.nan)
        for i in range(len(experts)):
            for j in range(len(candidates_strings)):
                self.coverage_matrix[i][j] = self.concept_similarity(self.experts[i], self.candidates_strings[j])
        # States are capped at 1.5x the number of experts.
        self.max_size = int(len(self.experts)*1.5)

    def concept_similarity(self, concept, sent):
        """Best similarity between ``concept`` and any single token of ``sent``."""
        return max(self.smanager.similarity(concept, sent))

    def get_coverage(self, i, j):
        """Return coverage_matrix[i][j], computing and caching it if it is still NaN."""
        if np.isnan(self.coverage_matrix[i][j]):
            self.coverage_matrix[i][j] = self.concept_similarity(self.experts[i], self.candidates_strings[j])
        return self.coverage_matrix[i][j]

    def get_expert_coverage(self, state):
        """Per-expert coverage of ``state``: the best score among the state's sentences."""
        return self.coverage_matrix[:, state].max(axis=1)

    def get_state_coverage(self, state) -> float:
        """Mean of the per-expert coverage of ``state`` (also printed for tracing)."""
        expert_coverage = self.get_expert_coverage(state)
        print("State coverage for {}:".format(state))
        print(expert_coverage)
        return np.mean(expert_coverage)

    def get_cost(self, state: List[int]) -> float:
        """Cost of ``state`` (lower is better); an empty state costs 0."""
        if not state:
            return 0
        coverage_score = self.get_state_coverage(state)
        similarity_score = self.candidates_similarity[state].mean().item()
        return -(coverage_score + self.theta*similarity_score)

    def _fit_scores(self, indices):
        """Per-sentence fitness: mean coverage over experts plus theta * video similarity."""
        print("Computing arrays for state: ")
        print(indices)
        return self.coverage_matrix.mean(axis=0)[indices] + self.theta*self.candidates_similarity[indices]

    # state here is assumed (and guaranteed on return) to be -sorted-
    def get_candidate(self, state: List[int]) -> List[int]:
        """Propose a neighbouring state that differs from ``state`` by one sentence.

        * empty state: a single random candidate;
        * state at (or above) ``max_size``: the least fit sentence is dropped;
        * otherwise a sentence is removed with probability equal to the state's
          coverage, else one is added. When every candidate is already in the state
          there is nothing to add, so a removal is forced.

        ``state`` itself is never modified.
        """
        if not state:
            print("Empty state")
            return [random.randint(0, len(self.candidates_strings)-1)]

        rc = state.copy()
        s = np.array(state)
        s_fitscore = self._fit_scores(s)

        # ">=" (not "==") so that an oversized initial state is also trimmed.
        if len(state) >= self.max_size:
            print("Maximum state size, removing")
            del rc[int(np.argmin(s_fitscore))]
            return rc

        in_state = set(state)
        anti_state = [i for i in range(len(self.candidates_strings)) if i not in in_state]

        coverage = self.get_state_coverage(state)
        # The random draw always happens first so seeded runs consume the same stream.
        remove_sentence = random.random() < coverage or not anti_state
        print("coverage of {} is {}, remove?{}".format(state, coverage, remove_sentence))
        if remove_sentence:             # We decide to remove a sentence from the set
            print("Removing")
            # Sentences with a low fit score are the most likely to be dropped.
            probs = softmax(-s_fitscore)
            idx = np.random.multinomial(1, probs).argmax()
            del rc[idx]
        else:                           # Add a sentence from the outside
            print("Adding")
            s1 = np.array(anti_state)
            # Pick an expert to try and cover.
            # Coverage is in (0,1), so we use a low temperature (scores are scaled by 10).
            # NOTE(review): this favours experts that are already well covered; if the
            # intent is to target poorly covered experts the sign should be flipped.
            probs = softmax(self.get_expert_coverage(s)*10)
            expert_to_cover = np.random.multinomial(1, probs).argmax()
            # Among outside sentences, prefer those that cover the chosen expert well.
            probs = softmax(self.coverage_matrix[expert_to_cover][s1]*10)
            idx_to_add = np.random.multinomial(1, probs).argmax()
            bisect.insort(rc, anti_state[idx_to_add])

        return rc

    def get_scored_permutations(self, k):
        """Return ``(permutation, cost)`` for every k-permutation of candidate indices."""
        n = len(self.candidates_strings)
        return [(x, self.get_cost(list(x))) for x in itertools.permutations(range(n), k)]

    def simulated_annealing(self, initial_state):
        """Run the annealing loop from ``initial_state`` and return the final state.

        A candidate with lower cost is always accepted; otherwise it is accepted with
        probability ``exp(cost_diff / temperature)`` where ``cost_diff <= 0``. Costs are
        computed once per state and reused, so each state's coverage trace is printed
        once per step.
        """
        self.opt_results = []
        current_temp = self.initial_temp

        # Start by initializing the current state with the initial state
        current_state = initial_state
        current_cost = self.get_cost(current_state)

        while current_temp > self.final_temp:
            next_cand = self.get_candidate(current_state)
            cand_cost = self.get_cost(next_cand)

            print("current cost: {} ({}). Candidate cost: {} ({})".format(current_cost, current_state, cand_cost, next_cand))

            # Positive difference means the candidate is cheaper (better).
            cost_diff = current_cost - cand_cost

            if cost_diff > 0:
                accept = True
            else:
                move_chance = math.exp(cost_diff / current_temp)
                print("chance to move: {}".format(move_chance))
                accept = random.uniform(0, 1) < move_chance
            if accept:
                current_state, current_cost = next_cand, cand_cost
            # decrement the temperature
            current_temp -= self.alpha
            self.opt_results.append(-current_cost)

        return current_state

