In [1]:
import json
import os
# Directory that is scanned for "*.json" files; the save step at the end of
# the notebook writes "<name>.json" files back into this same directory.
folder_path = "documents"  # Replace this with the actual path to your folder

def load_json_files_into_dictionary(folder_path):
    """Load every ``*.json`` file found directly in ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        dict: Maps each file name without its ".json" extension to the parsed
        JSON content of that file. Files that cannot be decoded or parsed are
        reported on stdout and left out of the result.
    """
    data_dict = {}
    # Sorted so the key order does not depend on the file system's listing order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, filename)
        key = os.path.splitext(filename)[0]  # Exclude the ".json" extension from the filename
        try:
            # Decode explicitly as UTF-8 instead of the platform default;
            # "utf-8-sig" additionally tolerates a leading byte-order mark.
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                data_dict[key] = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Undecodable bytes surface as UnicodeDecodeError while json.load
            # reads the file; treat them like malformed JSON and skip the file.
            print(f"Error: Failed to load JSON from file '{filename}'. Skipping this file.")
    return data_dict


In [2]:
# Maps each file's base name (file name without ".json") to that file's parsed
# JSON content; every later cell reads and mutates this dictionary in place.
json_data_dict = load_json_files_into_dictionary(folder_path)

In [3]:
# Fields inspected by the cleaning cell: a record is dropped when any of these
# keys is present with a value equal to '' or ' '.
field_names = ['title','text','author','date']

In [4]:
# Pass 1: keep only the records in which none of the tracked fields is blank
# (exactly '' or ' '); a tracked field that is absent does not disqualify a record.
# Pass 2: drop any 'embeds' entry the kept records already carry (presumably one
# written by an earlier run of the save cell -- verify against the input files).
for name, records in json_data_dict.items():
    kept = []
    for item in records:
        has_blank_field = False
        for fname in field_names:
            if fname not in item:
                continue
            if item[fname] == ' ' or item[fname] == '':
                has_blank_field = True
                break
        if not has_blank_field:
            kept.append(item)
    json_data_dict[name] = kept

    for item in kept:
        item.pop('embeds', None)


In [5]:
# Preview the first record of every collection. The cleaning cell can filter
# out every record of a collection, so guard the [0] lookup rather than letting
# an empty list abort the notebook with IndexError.
for name, records in json_data_dict.items():
    if records:
        print(name, records[0])
    else:
        print(name, "(no records)")
    print("\n")

text_collection {'link': 'https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/article/view/25', 'media': ' ', 'partition_name': 'documents_partition', 'text': 'Responding to the challenges of global employment, universities recognize the need to produce graduates who meet the ever-changing demands of work and life in the national and global environment. This paper investigates the extent of practice of graduate attributes among graduating college students of a Catholic University. It determines the influence of graduate attributes in studentsâ€™ lives. Using a sequential explanatory mixed-method design, the findings revealed that the participants developed the graduate attributes to a high extent. Remarkable implication signifies that university education has successfully prepared and trained students to meet the challenges of national and global employment and industry. link: https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/article/view/25', 'text_id': '0105944d-3ebe-45f0-89f8-53e07f2d345d', 'uu

In [6]:
import string

# Keys that are not folded into the text to embed. 'embeds' itself is excluded
# so that re-running this cell rebuilds the text from the real fields instead of
# nesting the previous 'embeds' value (text or vector) inside the new one.
excluded_fields = {'uuid', 'partition_name', 'text_id', 'embeds'}

for name in json_data_dict:
    for item in json_data_dict[name]:
        # Each remaining field contributes "<field> is <value>"; the pieces are
        # joined in the record's own key order. join() consumes the generator
        # completely before the assignment mutates the record.
        item['embeds'] = ', '.join(
            f"{field} is {value}"
            for field, value in item.items()
            if field not in excluded_fields
        )


In [7]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Average the hidden states over dimension 1, ignoring masked positions.

    Positions whose mask value is zero are zeroed out before summing, and the
    sum is divided by the per-row sum of the mask.

    NOTE(review): assumes ``last_hidden_states`` is (batch, seq_len, hidden)
    and ``attention_mask`` is (batch, seq_len) -- confirm against the caller.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(~keep, 0.0)
    totals = zeroed.sum(dim=1)
    counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return totals / counts

# Checkpoint used for all embeddings, and a cache for its tokenizer/model so
# they are loaded once per kernel session rather than once per call.
_E5_MODEL_NAME = 'intfloat/e5-large-v2'
_E5_COMPONENTS = {}


def _load_e5_components():
    """Return the (tokenizer, model) pair, loading both on first use only."""
    if 'pair' not in _E5_COMPONENTS:
        tokenizer = AutoTokenizer.from_pretrained(_E5_MODEL_NAME)
        model = AutoModel.from_pretrained(_E5_MODEL_NAME)
        # Stored together so a failed load never leaves a half-filled cache.
        _E5_COMPONENTS['pair'] = (tokenizer, model)
    return _E5_COMPONENTS['pair']


def embed_string(text: str, prefix: str = 'query: '):
    """Embed ``text`` with the E5 model and return the result as nested lists.

    Args:
        text: The string to embed.
        prefix: Text prepended to ``text`` before tokenizing. Defaults to
            'query: ', which is what this function has always used.
            NOTE(review): the E5 model card also describes a 'passage: '
            prefix for documents -- confirm which one this corpus needs.

    Returns:
        list: ``embeddings.tolist()`` of the L2-normalized, average-pooled
        model output. For one input string this is a list holding a single
        vector (a list of floats).
    """
    # Local import: the notebook imports only torch.nn.functional and Tensor.
    import torch

    # Reuse the cached tokenizer/model instead of reloading them per call.
    tokenizer, model = _load_e5_components()

    # Tokenize the prefixed text, truncating to at most 512 tokens.
    inputs = tokenizer(prefix + text, max_length=512, padding=True, truncation=True, return_tensors='pt')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Average pool the last hidden states using the attention mask.
    embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])

    # Normalize each embedding to unit L2 norm.
    embeddings = F.normalize(embeddings, p=2, dim=1)

    return embeddings.tolist()


In [None]:
from tqdm import tqdm

for name in json_data_dict:
    for item in tqdm(json_data_dict[name]):
        # Only text still needs embedding. A non-string value means this record
        # was already embedded by an earlier (possibly interrupted) run of this
        # cell, so skip it instead of failing on `.lower()`.
        if isinstance(item['embeds'], str):
            item['embeds'] = embed_string(item['embeds'].lower())

 22%|█████████▏                                | 43/196 [01:38<05:42,  2.24s/it]

In [None]:
# for name in json_data_dict:
# #     print(name, json_data_dict[name][0]['embeds'])
#     print("\n")

In [None]:
for name in json_data_dict:
    for item in json_data_dict[name]:
        vector = item['embeds']
        # embed_string returns a list wrapping one vector. Unwrap only while the
        # value is still nested, so running this cell again is a no-op instead of
        # replacing the vector with its first float (or text with its first char).
        if isinstance(vector, list) and vector and isinstance(vector[0], list):
            item['embeds'] = vector[0]

In [None]:
def update_and_save_json_files(folder_path, data_dict=None):
    """Write every collection to ``<folder_path>/<name>.json``.

    Args:
        folder_path: Directory the files are written to; existing files with
            the same name are overwritten.
        data_dict: Mapping of file base name to JSON-serializable content.
            Defaults to the notebook-level ``json_data_dict``.

    Raises:
        TypeError: If a collection contains a value ``json`` cannot serialize.
            The corresponding file is left untouched in that case.
    """
    if data_dict is None:
        data_dict = json_data_dict
    for name, records in data_dict.items():
        file_path = os.path.join(folder_path, f"{name}.json")
        # Serialize before opening: open(..., 'w') truncates the file, so a
        # serialization error afterwards would destroy the existing contents.
        payload = json.dumps(records)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(payload)

# Call the function to update and save the JSON files.
# Note: this overwrites "<name>.json" inside folder_path -- the same directory
# the collections were loaded from at the start of the notebook.
update_and_save_json_files(folder_path)