In [1]:
import os

import stanza
from stanza.pipeline.core import DownloadMethod

# Directory the notebook is executed from; printed so the run can be traced back.
HOME = os.getcwd()
print(HOME)

# A single English pipeline shared by the whole notebook.
# REUSE_RESOURCES avoids downloading the models every time the cell is run.
NLP = stanza.Pipeline(
    'en',
    download_method=DownloadMethod.REUSE_RESOURCES,
    use_gpu=True,
)

  from .autonotebook import tqdm as notebook_tqdm


c:\Users\bouab\DEV\see-and-tell


2023-05-08 17:09:38 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

2023-05-08 17:09:38 INFO: Using device: cuda
2023-05-08 17:09:38 INFO: Loading: tokenize
2023-05-08 17:09:40 INFO: Loading: pos
2023-05-08 17:09:40 INFO: Loading: lemma
2023-05-08 17:09:40 INFO: Loading: constituency
2023-05-08 17:09:40 INFO: Loading: depparse
2023-05-08 17:09:41 INFO: Loading: sentiment
2023-05-08 17:09:41 INFO: Loading: ner
2023-05-08 17:09:41 INFO: Done loading processors!


In [2]:
# Sample captions, glued into one period-separated text so that the pipeline
# is called a single time.
sentences = [
    'The woman in the garden is reading a book next to her husband',
    'A man in a blue shirt is talking to another man',
    "A young girl is having breakfast with her mother",
]
final_str = ". ".join(sentences)

# first try of the pipeline on the merged text
doc = NLP(final_str)

In [3]:
# Keep the parsed sentences at hand and display the class of the constituency
# tree attached to the first one.
s = doc.sentences
type(s[0].constituency)

stanza.models.constituency.parse_tree.Tree

In [4]:
# let's try to understand how to identify expressions that could represent humans
from nltk.corpus import wordnet as wn
# Reference WordNet synsets: 'people.n.01' and 'person.n.01' are the noun senses
# that candidate words are compared against further down in the notebook.
PEOPLE = wn.synset('people.n.01')
PERSON = wn.synset('person.n.01')

def words_tags(tree):
    """Flatten ``tree`` into a list of ``(tag, word)`` pairs.

    For every pre-terminal yielded by ``tree.yield_preterminals()`` the pair holds
    the label of the pre-terminal followed by the label of its first child.
    """
    pairs = []
    for preterminal in tree.yield_preterminals():
        leaf = preterminal.children[0]
        pairs.append((preterminal.label, leaf.label))
    return pairs

# print(words_tags(c[1]))
# print(words_tags(c[1].children[1]))

In [5]:
def person_word(word:str, pos: str, threshold: float=0.2, synset_reference=None) -> bool:
    """Tell whether ``word`` can plausibly denote the concept of ``synset_reference``.

    The first three WordNet senses of ``word`` (restricted to the given part of
    speech) are compared with ``synset_reference`` through path similarity; less
    frequent senses are ignored on purpose.

    Args:
        word: the word (usually a lemma) to look up in WordNet.
        pos: part-of-speech name; only its first letter, lower-cased, is passed to
            WordNet (e.g. ``'NOUN'`` -> ``'n'``, ``'VERB'`` -> ``'v'``).
        threshold: minimal path similarity for a sense to be accepted.
        synset_reference: synset to compare against; defaults to ``PERSON`` so the
            same function can be used with ``PEOPLE`` as well.

    Returns:
        ``True`` as soon as one inspected sense reaches ``threshold``, else ``False``.
        A ``KeyError`` raised by the WordNet lookup is printed and treated as
        "not a person word".
    """
    if synset_reference is None:
        synset_reference = PERSON # this way we can use PEOPLE as well with the same function

    try:
        senses = wn.synsets(word, pos=pos.lower()[0])
        for meaning in senses[:3]:
            # compare with the requested reference, not unconditionally with PERSON
            similarity = meaning.path_similarity(synset_reference)
            # path_similarity yields None when the two synsets are not connected;
            # comparing None with a float would raise a TypeError
            if similarity is not None and similarity >= threshold:
                return True
    except KeyError as e:
        # best effort: report the failed lookup and answer "no"
        print(e)
    return False


In [6]:
def num_person_words(noun_phrase: list[tuple[str, str]], meta_data: dict, known_person_words: set):
    """Locate the words of ``noun_phrase`` that can denote a person or people.

    Args:
        noun_phrase: ``(tag, word)`` pairs of the phrase, in order.
        meta_data: maps a lower-cased word to ``[POS tag, lemma]``.
        known_person_words: lemmas already recognised as person words; it is used
            as a cache and updated in place.

    Returns:
        A tuple ``(person_lists, known_person_words)`` where ``person_lists`` holds
        one ``(position in the phrase, lemma)`` pair per person word.

    Raises:
        KeyError: if a word of the phrase is missing from ``meta_data``.
    """
    # the meta data contains both the lemmas and POS tags
    pos_tags = [meta_data[t[1].lower()][0] for t in noun_phrase]
    lemmas = [meta_data[t[1].lower()][1] for t in noun_phrase]

    person_lists = []
    for index, (lemma, pos) in enumerate(zip(lemmas, pos_tags)):
        if pos != 'NOUN':
            continue
        if (lemma in known_person_words
                or person_word(lemma, pos, synset_reference=PERSON)
                or person_word(lemma, pos, synset_reference=PEOPLE)):
            person_lists.append((index, lemma))

    # remember the lemmas themselves: the membership test above is done on lemmas,
    # so storing (index, lemma) pairs would never produce a cache hit
    known_person_words.update(lemma for _, lemma in person_lists)
    return person_lists, known_person_words

In [7]:
def get_NP_components(root, meta_data: dict, person_np: set=None):
    """Collect the largest noun phrases under ``root`` that describe a person.

    A noun phrase (label ``'NP'``) is kept as a whole when either
      * it contains exactly one person word and every word before it is neither a
        NOUN nor a VERB (typically determiners and adjectives), or
      * it contains several person words that are all adjacent.
    Otherwise its children are searched recursively. A noun phrase without any
    person word contributes nothing.

    Args:
        root: tree node exposing ``label`` and ``children``.
        meta_data: maps a lower-cased word to ``[POS tag, lemma]``.
        person_np: set of already known person words, shared across calls; a new
            set is created when omitted.

    Returns:
        A list of noun phrases, each one being a list of ``(tag, word)`` pairs.
    """
    if person_np is None:
        person_np = set()

    def _search_children():
        found = []
        for child in root.children:
            found.extend(get_NP_components(child, meta_data, person_np))
        return found

    # only noun phrases can be returned as such
    if root.label != 'NP':
        return _search_children()

    noun_phrase = words_tags(root)
    # positions and lemmas of the words that could mean PERSON / PEOPLE
    num_persons, person_np = num_person_words(noun_phrase, meta_data, person_np)

    if not num_persons:
        return []

    if len(num_persons) == 1:
        person_position = num_persons[0][0]
        # every word before the person word has to qualify; an empty prefix
        # (person word in first position) is trivially fine
        only_modifiers_before = all(
            meta_data[word.lower()][0] not in ('NOUN', 'VERB')
            for _, word in noun_phrase[:person_position]
        )
        if only_modifiers_before:
            return [noun_phrase]
        # the phrase is too wide: look for smaller phrases, once
        return _search_children()

    # several person words: accepted only when they sit on consecutive positions
    positions = [position for position, _ in num_persons]
    if positions == list(range(positions[0], positions[0] + len(positions))):
        return [noun_phrase]
    return _search_children()

In [8]:
def convert_to_text(l: list[tuple[str, str]], filter):
    """Turn a tagged phrase into a lower-cased, space-separated string.

    Args:
        l: ``(tag, word)`` pairs.
        filter: when truthy, only the words tagged ``'NN'`` or ``'JJ'`` are kept.

    Returns:
        The selected words joined with single spaces, stripped and lower-cased.
    """
    selected = []
    for item in l:
        if filter and item[0] not in ('NN', 'JJ'):
            continue
        selected.append(item[1])
    return " ".join(selected).strip().lower()


def extract_NP_text(text: str, nlp_object=None, plain_text:bool=True, filter:bool=True):
    """Extract, sentence by sentence, the noun phrases that describe people.

    Args:
        text: the text to analyse.
        nlp_object: callable pipeline applied to ``text``; the notebook-level
            ``NLP`` pipeline is used when omitted.
        plain_text: return strings (through ``convert_to_text``) instead of lists
            of ``(tag, word)`` pairs.
        filter: forwarded to ``convert_to_text`` when ``plain_text`` is set.

    Returns:
        One list per sentence, holding the noun phrases found in that sentence.
    """
    pipeline = NLP if nlp_object is None else nlp_object
    parsed = pipeline(text)

    # person words discovered so far, shared by all the sentences of this call
    shared_person_words = set()
    per_sentence = []
    for sentence in parsed.sentences:
        top_node = sentence.constituency.children[0]
        # the assumption is as follows: if a word is repeated then it is frequent
        # and its lemma and POS tag are the same, so the last occurrence wins
        word_info = {}
        for w in sentence.words:
            word_info[w.text.lower()] = [w.upos, w.lemma]
        per_sentence.append(get_NP_components(top_node, word_info, shared_person_words))

    if not plain_text:
        return per_sentence

    return [[convert_to_text(phrase, filter=filter) for phrase in phrases] for phrases in per_sentence]


In [9]:
# Demo: four captions, the last one mentioning three different people.
sentences = [
    'The woman in the garden is reading a book next to her husband',
    'A man in a blue shirt is talking to another man',
    "A young girl is having breakfast with her mother",
    "One boy and one man were hitting an old woman",
]
final_str = ". ".join(sentences)
print(final_str)

# unfiltered plain text: one list of noun phrases per sentence
results = extract_NP_text(final_str, plain_text=True, filter=False)
print(results)

The woman in the garden is reading a book next to her husband. A man in a blue shirt is talking to another man. A young girl is having breakfast with her mother. One boy and one man were hitting an old woman
[['the woman in the garden', 'her husband'], ['a man in a blue shirt', 'another man'], ['a young girl', 'her mother'], ['one boy', 'one man', 'an old woman']]


# Face recognition

In [10]:
from src.experimental.exp import get_caption
from src.face.face_recognition import recognize_faces
def get_result_image(image_path):
    """Caption one image and run face recognition on it.

    The faces are matched against the notebook-level ``bbt_embeddings``. Both
    results are printed, then returned as ``(caption, faces)``.
    """
    caption = get_caption(image_path)
    faces = recognize_faces(image_path, embeddings=bbt_embeddings, display=False)
    for item in (caption, faces):
        print(item)
    return caption, faces


In [11]:
# for f in face_pred:
#     print(char_names_predictions(f)) 
# for p in char_names_predictions(face_pred):
#     print(p)


In [12]:
from collections import Counter
import numpy as np

def char_names_predictions(face_predictions: list[list[list[str, np.array]]]):
    """Reduce every face prediction to its first item, the character name.

    Args:
        face_predictions: one list per image, each holding predictions whose
            first element is the name.

    Returns:
        One list of names per image, in the original order.
    """
    char_names = []
    for image_predictions in face_predictions:
        char_names.append([prediction[0] for prediction in image_predictions])
    return char_names


In [13]:
def build_captions_class_matrix(filtered_nps: list[list[str]], face_predictions: list[list[str]]):
    """Build the word/class co-occurrence tables of a set of captions.

    Every class predicted for a caption is associated with every noun phrase of
    that caption.

    Args:
        filtered_nps: for each caption, its noun phrases as space-separated words.
        face_predictions: for each caption, the predicted classes (names).

    Returns:
        ``(class_np_counter, np_class_counter)``:
          * ``class_np_counter[cls]`` is a ``Counter`` giving, per word, the number
            of noun phrases seen with ``cls`` that contain it;
          * ``np_class_counter[word]`` is the set of classes the word was seen with.
    """
    class_np_counter = Counter()
    np_class_counter = {}

    for phrases, classes in zip(filtered_nps, face_predictions):
        for cls in classes:
            # a class is registered even when the caption has no noun phrase
            word_counts = class_np_counter.setdefault(cls, Counter())

            for phrase in phrases:
                words = phrase.split(" ")
                # a word repeated inside one phrase is counted a single time
                word_counts.update(dict.fromkeys(words, 1))
                for word in words:
                    np_class_counter.setdefault(word, set()).add(cls)

    return class_np_counter, np_class_counter


In [14]:
def find_decided_captions(filtered_noun_phrases: list[list[str]], face_predictions: list[list[str]]):
    """Count the words of the captions whose person/name association is unambiguous.

    A caption is "decided" when it has exactly one noun phrase and exactly one
    face prediction: the phrase can only refer to that class.

    Args:
        filtered_noun_phrases: for each caption, its noun phrases as
            space-separated words.
        face_predictions: for each caption, the predicted classes (names).

    Returns:
        A ``Counter`` mapping each class to a ``Counter`` of the words of its
        decided noun phrases (a word repeated inside one phrase counts once).
        It is empty when no caption is decided.
    """
    decisive_class_np_counter = Counter()

    # iterate over the pairs directly: unzipping into two lists fails with a
    # ValueError as soon as there is no decided caption at all
    for np_list, preds in zip(filtered_noun_phrases, face_predictions):
        if not (len(np_list) == len(preds) == 1):
            continue

        noun_phrase, pred = np_list[0], preds[0]
        if pred not in decisive_class_np_counter:
            decisive_class_np_counter[pred] = Counter()

        decisive_class_np_counter[pred].update(dict.fromkeys(noun_phrase.split(" "), 1))

    return decisive_class_np_counter


In [15]:

def np_class_score(noun_phrase: str, class_prediction: str, class_np_counter: Counter, np_class_counter: Counter, decided_class_np: Counter):
    """Score how well ``noun_phrase`` matches ``class_prediction``.

    Each word gets ``log((1 + f + d) / (1 + c)) + 1`` where
      * ``f`` is the count of the word in ``class_np_counter[class_prediction]``,
      * ``d`` is the count of the word in ``decided_class_np[class_prediction]``,
      * ``c`` is the number of distinct classes the word was associated with
        (``len(np_class_counter[word])``).
    Missing classes or words count as 0.

    Args:
        noun_phrase: space-separated words.
        class_prediction: the class (name) to score against.
        class_np_counter: per class, word frequencies over all the captions.
        np_class_counter: per word, the collection of classes it was seen with.
        decided_class_np: per class, word frequencies over the decided captions.

    Returns:
        The mean of the word scores (numpy float).
    """
    def lookup(table, word: str):
        # best effort: an unknown class or word contributes nothing
        # (TypeError covers a Counter answering 0 for an unknown class)
        try:
            return table[class_prediction][word]
        except (KeyError, TypeError):
            return 0

    def word_class_score(word: str):
        frequency_score = lookup(class_np_counter, word)
        # the decided captions have their own table
        decided_freq_score = lookup(decided_class_np, word)

        numerator = 1 + frequency_score + decided_freq_score

        # the denominator: the number of unique classes the word was associated with + 1
        denominator = 1 + len(np_class_counter.get(word, ()))

        return np.log(numerator / denominator) + 1

    words = noun_phrase.split(" ")
    return np.mean([word_class_score(w) for w in words])

    

In [16]:
import pandas as pd

def map_np_char_name(noun_phrases: list[str], face_predictions: list[str], class_np_counter: Counter, np_class_counter: Counter, decided_class_np:Counter):
    """Greedily pair the noun phrases of one caption with its predicted classes.

    A score table (rows: noun phrases, columns: predictions) is filled with
    ``np_class_score``. The best remaining cell is then picked repeatedly, its
    row and column being removed after each pick, until the table is empty.

    Returns:
        A dict mapping each selected noun phrase to the class assigned to it.
    """
    scores = pd.DataFrame(data=[], index=noun_phrases, columns=face_predictions, dtype=float)
    for phrase in noun_phrases:
        for candidate in face_predictions:
            scores.at[phrase, candidate] = np_class_score(
                phrase, candidate, class_np_counter, np_class_counter, decided_class_np)

    mapping = {}
    while not scores.empty:
        values = scores.to_numpy()
        # first cell, in row-major order, that holds the highest score
        row, col = np.unravel_index(np.argmax(values), values.shape)
        best_np = scores.index[row]
        best_pred = scores.columns[col]

        mapping[best_np] = best_pred
        # neither the phrase nor the class can be used again
        scores = scores.drop(index=best_np, columns=best_pred)

    return mapping

In [17]:
def replace_with_char_names(captions: list[str], face_predictions: list[list[list[str, np.array]]]):
    """Rewrite captions so that person noun phrases become character names.

    Steps:
      1. reduce the face predictions to plain names;
      2. parse all the captions at once (joined with ``". "``) and extract the
         person noun phrases of each sentence;
      3. build the word/class statistics over the whole set of captions;
      4. for every caption, map its noun phrases to its predicted names and
         substitute the text of each mapped phrase with the name.

    Args:
        captions: one caption per image; a single trailing period is ignored when
            the captions are joined.
        face_predictions: one list of predictions per image, the first element of
            each prediction being the character name.

    Returns:
        The list of rewritten captions.

    Note:
        The i-th parsed sentence is assumed to belong to the i-th caption, which
        only holds when each caption is parsed as exactly one sentence.
    """
    import re  # local import: only needed for the case-insensitive substitution

    # first off extract plain names
    face_predictions = char_names_predictions(face_predictions)
    # endswith (rather than indexing the last character) also accepts an empty caption
    nps = extract_NP_text(". ".join([c[:-1] if c.endswith('.') else c for c in captions]), plain_text=False)
    # the noun phrases exactly as they read in the caption (lower-cased)
    plain_nps = [[convert_to_text(t, filter=False) for t in component] for component in nps]
    # the filtered version (nouns and adjectives only) is the one used for scoring
    filtered_text_nps = [[convert_to_text(t, filter=True) for t in component] for component in nps]

    class_np_counter, np_class_counter = build_captions_class_matrix(filtered_text_nps, face_predictions)
    decisive_class_np_counter = find_decided_captions(filtered_text_nps, face_predictions)

    final_captions = []
    for caption, plain_list, np_list, pred_list in zip(captions, plain_nps, filtered_text_nps, face_predictions):
        # map each noun phrase to the suitable class
        mapping = map_np_char_name(np_list, pred_list, class_np_counter, np_class_counter, decisive_class_np_counter)

        new_caption = caption
        for noun_phrase, face_pred in mapping.items():
            # the filtered and the plain lists are aligned: same position, same phrase
            text_to_replace = plain_list[np_list.index(noun_phrase)]
            if not text_to_replace:
                # substituting the empty string would insert the name between every character
                continue
            # the extracted text is lower-cased while the caption may not be
            pattern = re.compile(re.escape(text_to_replace), flags=re.IGNORECASE)
            # a function replacement keeps backslashes in the name literal
            new_caption = pattern.sub(lambda _match: face_pred, new_caption)

        final_captions.append(new_caption)

    return final_captions

# Test

In [18]:
from pathlib import Path
import json
import os

HOME = os.getcwd()

# Reference embeddings used by the face recognition calls of this notebook.
# The handle gets its own name so that it is not confused with `f`, which
# designates the frames directory below.
with open(os.path.join(HOME, 'src', 'TBBT_embeddings_16_160.json')) as embeddings_file:
    bbt_embeddings = json.load(embeddings_file)

# The frame files are named after their number: sort them numerically, not
# alphabetically. splitext removes the extension whatever its length.
f = os.path.join(HOME, 'src', 'frames copy')
frames = os.listdir(f)
frames = sorted(
    [os.path.join(f, x) for x in frames],
    key=lambda x: int(os.path.splitext(os.path.basename(x))[0]),
)

In [19]:
from src.experimental.exp import get_caption
# captions = []
# for i in range (1, len(frames), 5):
#     ims = [os.path.join(f, frames[j]) for j in range(i, min(i + 5, len(frames)))]
#     # get the captions for the images 
#     captions.extend(get_caption(ims, True))


In [22]:
from src.face.face_recognition import recognize_faces

# Face recognition on every frame but the first one.
face_pred = [
    recognize_faces(frame_path, embeddings=bbt_embeddings)
    for frame_path in frames[1:]
]
# NOTE(review): `captions` is only assigned in a later cell (loaded from a pickle);
# this check relies on that cell having been run first - confirm the intended order.
assert len(face_pred) == len(captions)



'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable
'NoneType' object is not iterable


In [24]:
import pickle

# with open('all_predictions_1.pickle', 'wb') as handle:
#     pickle.dump(face_pred, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Reload the cached captions and predictions.
# pickle.load executes arbitrary code: only use it on files produced locally.
with open('all_captions_1.pickle', 'rb') as handle:
    captions = b = pickle.load(handle)

with open('all_predictions_1.pickle', 'rb') as handle:
    face_preds = predictions = pickle.load(handle)

In [26]:
# Display the captions loaded from 'all_captions_1.pickle'
captions

['a man is standing in front of a table with a cake on it.',
 'a man and a woman are sitting at a table with a cake in front of them.',
 'a man and a woman are in a kitchen reading a book.',
 'a blonde woman sits at a table with a coffee pot and a coffee pot in front of her',
 'a blonde woman sits in front of a coffee pot that has the word coffee on it.',
 'a blonde woman is sitting at a table with a coffee pot in front of her.',
 'a blonde woman sits at a table with a coffee pot in front of her.',
 'a blonde woman is sitting at a table with a yellow coffee mug.',
 'a man and a woman in a kitchen reading a book.',
 'a man and a woman are standing in a kitchen with a coffee maker on the counter.',
 'a man and a woman are in a kitchen reading a book.',
 'a man and a woman are in a kitchen with a coffee pot on the counter.',
 'a man and a woman are standing in a kitchen with a coffee mug and a mug that says',
 'a man and a woman are in a kitchen with a coffee maker on the counter.',
 'two

In [25]:
# Run the module version of the routine (src.captions.captions_improved) on the captions
from src.captions import captions_improved as ci
# NOTE(review): this passes `face_pred` (computed by recognize_faces in an earlier cell),
# not `face_preds` (loaded from 'all_predictions_1.pickle') - confirm which one is intended
nc2 = ci.replace_with_char_names(captions, face_pred)
nc2

2023-05-08 17:12:14 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

2023-05-08 17:12:14 INFO: Using device: cuda
2023-05-08 17:12:14 INFO: Loading: tokenize
2023-05-08 17:12:14 INFO: Loading: pos
2023-05-08 17:12:14 INFO: Loading: lemma
2023-05-08 17:12:15 INFO: Loading: constituency
2023-05-08 17:12:15 INFO: Loading: depparse
2023-05-08 17:12:15 INFO: Loading: sentiment
2023-05-08 17:12:16 INFO: Loading: ner
2023-05-08 17:12:16 INFO: Done loading processors!


['leonard is standing in front of a table with a cake on it.',
 'leonard and penny are sitting at a table with a cake in front of them.',
 'leonard and a woman are in a kitchen reading a book.',
 'penny sits at a table with a coffee pot and a coffee pot in front of her',
 'penny sits in front of a coffee pot that has the word coffee on it.',
 'penny is sitting at a table with a coffee pot in front of her.',
 'penny sits at a table with a coffee pot in front of her.',
 'penny is sitting at a table with a yellow coffee mug.',
 'leonard and a woman in a kitchen reading a book.',
 'leonard and a woman are standing in a kitchen with a coffee maker on the counter.',
 'leonard and a woman are in a kitchen reading a book.',
 'leonard and a woman are in a kitchen with a coffee pot on the counter.',
 'leonard and a woman are standing in a kitchen with a coffee mug and a mug that says',
 'a man and a woman are in a kitchen with a coffee maker on the counter.',
 'two people in a kitchen, one readi

In [28]:
from src.captions.captions_improved import char_names_predictions

# One line per entry of the pickled predictions: the names recognised in it.
names_per_frame = char_names_predictions(face_preds)
for f in names_per_frame:
    print(f)

['leonard']
['leonard', 'penny']
['leonard']
['penny']
['penny']
['penny']
['penny']
['penny']
['leonard']
['leonard']
['leonard']
['leonard']
['leonard']
[]
[]
['penny']
['leonard']
['penny']
['penny']
['leonard']
['leonard']
['amy', 'penny']
['penny']
['penny']
['penny']
['leonard']
['leonard']
['sheldon']
['penny', 'leonard']
['sheldon']
['sheldon']
['sheldon']
['sheldon']
['sheldon']
['leonard']
['leonard']
['leonard']
['leonard']
['leonard']
['leonard']
['sheldon']
['sheldon']
['sheldon']
['sheldon']
['sheldon']
['leonard']
['leonard']
['penny']
['penny']
['penny']
['sheldon']
['leonard']
['leonard']
['leonard']
['sheldon']
['sheldon']
['sheldon']
['leonard']
['leonard']
[]
['sheldon', 'bernadette']
[]
['penny']
['penny']
['sheldon']
['sheldon']
['sheldon']
['penny']
['penny']
['sheldon']
['sheldon']
['penny']
['penny']
['penny']
['sheldon']
['sheldon']
['sheldon']
['sheldon', 'amy']
['sheldon']
[]
['penny']
['penny']
['penny']
['sheldon']
['sheldon']
['sheldon']
['sheldon']
['leo