In [39]:
import json
import os
folder_path = "people"  # Replace this with the actual path to your folder

def load_json_files_into_dictionary(folder_path):
    data_dict = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            key = os.path.splitext(filename)[0]  # Exclude the ".json" extension from the filename
            with open(file_path, 'r') as file:
                try:
                    data = json.load(file)
                    data_dict[key] = data
                except json.JSONDecodeError:
                    print(f"Error: Failed to load JSON from file '{filename}'. Skipping this file.")
    return data_dict


In [40]:
json_data_dict = load_json_files_into_dictionary(folder_path)

In [41]:
field_names = ['text','position', 'name']

In [42]:
for name in json_data_dict:
    json_data_dict[name] = [item for item in json_data_dict[name] if not(any(fname in item and (item[fname] == ' ' or item[fname] == '') for fname in field_names))]
    for item in json_data_dict[name]:
        if 'embeds' in item:
            del item['embeds']
        if 'media' in item:    
            if 'media' != ' ':
                item['text'] =item['text'] + f" Media: {item['media']} "
        if 'link' in item:
            if 'link' != ' ':
                item['text'] = item['text'] + f" Link: {item['link']} "  


In [47]:
for name in json_data_dict:
    print(json_data_dict[name][0])
    print('\n')

{'text_id': 'a0b8241d-d9d2-461c-a9c2-dcdd56c1ab14', 'text': 'VP FOR ACADEMICS\n\nRev. Fr. Leo G. Alaras, OAR is a Batangueño who was ordained as a priest in 2010. He was first assigned as the Campus Ministry Office (CMO) Director in Colegio de San Nicholas de Tolentino-Recoletos (CSNTR) in Talisay City, Negros Occidental. He was later appointed as the treasurer and Property Administrator of the same school from 2011-2015.\n\nIn the same year, Fr. Alaras was transferred to Poblacion Valencia, Negros Oriental where he served as the School Director and High School Principal at the San Pedro Academy from 2015-2018. He was also the School Director at San Pedro Academy-Recoletos in Valencia, Negros Oriental.\n\nHe is a graduate of Bachelor of Arts in Classical Philosophy major in Languages at Casiciaco Recoletos Seminary in Baguio City. He also finished a bachelor’s degree in Theology at the University of Santo Thomas (UST) and Master of Arts in Theology in Recoletos School of Theology in Ma

In [14]:
for name in json_data_dict:
    for att in json_data_dict[name][0]:
        if att != 'embeds':
            print(att)
    print("\n")

IndexError: list index out of range

In [44]:
import string

for name in json_data_dict:
    for item in json_data_dict[name]:
        embeds = []
        for field in item:
            if field not in ['uuid', 'partition_name', 'text_id']:
                embeds.append(f"{field} is {item[field]}")
        item['embeds'] = ', '.join(embeds)


In [45]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def embed_string(text: str):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')
    model = AutoModel.from_pretrained('intfloat/e5-large-v2')

    # Prefix the text with 'query: '
    text = 'query: ' + text

    # Tokenize the input text
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')

    # Generate model outputs
    outputs = model(**inputs)

    # Average pool the last hidden states and apply the attention mask
    embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])

    # Normalize the embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # Convert tensor to list
    embeddings_list = embeddings.tolist()

    return embeddings_list[0]


In [46]:
from tqdm import tqdm

for name in json_data_dict:
    for item in tqdm(json_data_dict[name]):
        item['embeds'] = embed_string(item['embeds'].lower())

100%|█████████████████████████████████████████████| 6/6 [00:19<00:00,  3.23s/it]
100%|█████████████████████████████████████████████| 6/6 [00:12<00:00,  2.15s/it]
100%|█████████████████████████████████████████████| 6/6 [00:12<00:00,  2.13s/it]


In [None]:
# for name in json_data_dict:
# #     print(name, json_data_dict[name][0]['embeds'])
#     print("\n")

In [77]:
for name in json_data_dict:
    for item in json_data_dict[name]:
        item['embeds'] = item['embeds'][0]

In [89]:
def update_and_save_json_files(folder_path):
    for name in json_data_dict:
        file_path = os.path.join(folder_path, f"{name}.json")
        with open(file_path, 'w') as file:
            json.dump(json_data_dict[name], file)

# Call the function to update and save the JSON files
update_and_save_json_files(folder_path)