In [66]:
import json
import os
folder_path = "documents"  # Replace this with the actual path to your folder

def load_json_files_into_dictionary(folder_path):
    data_dict = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            key = os.path.splitext(filename)[0]  # Exclude the ".json" extension from the filename
            with open(file_path, 'r') as file:
                try:
                    data = json.load(file)
                    data_dict[key] = data
                except json.JSONDecodeError:
                    print(f"Error: Failed to load JSON from file '{filename}'. Skipping this file.")
    return data_dict


In [67]:
json_data_dict = load_json_files_into_dictionary(folder_path)

In [68]:
field_names = ['title','text','author','date']

In [69]:
for name in json_data_dict:
    json_data_dict[name] = [item for item in json_data_dict[name] if not(any(fname in item and (item[fname] == ' ' or item[fname] == '') for fname in field_names))]
    for item in json_data_dict[name]:
        if 'embeds' in item:
            del item['embeds']


In [75]:
for name in json_data_dict:
    print(name, json_data_dict[name][2])
    print("\n")

text_collection {'link': 'https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/article/view/680', 'media': ' ', 'partition_name': 'documents_partition', 'text': 'Depressive symptomatology is among the major psychological problems experienced by adolescents. Sex differences in the occurrence of depressive symptoms have likewise been extensively reported in literature. However, a great majority of related research have primarily been carried out in the Western context. Hence, we conducted a study to identify the prevalence of depressive symptoms among Malaysian adolescents, and to determine its relationship with certain risk factors as well as the potential moderating role of sex. The sample comprised 964 adolescents from 20 secondary schools across Malaysia. Using logistic regression, results demonstrated that stressful life events and maternal verbal aggression significantly predicted symptoms of depression. In the moderation tests, sex significantly interacted with stressful life events and p

In [71]:
import string

for name in json_data_dict:
    for item in json_data_dict[name]:
        embeds = []
        for field in item:
            if field not in ['uuid', 'partition_name', 'text_id']:
                embeds.append(f"{field} is {item[field]}")
        item['embeds'] = ', '.join(embeds)


In [72]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def embed_string(text: str):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')
    model = AutoModel.from_pretrained('intfloat/e5-large-v2')

    # Prefix the text with 'query: '
    text = 'query: ' + text

    # Tokenize the input text
    inputs = tokenizer(text, max_length=2000, padding=True, truncation=True, return_tensors='pt')

    # Generate model outputs
    outputs = model(**inputs)

    # Average pool the last hidden states and apply the attention mask
    embeddings = average_pool(outputs.last_hidden_state, inputs['attention_mask'])

    # Normalize the embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # Convert tensor to list
    embeddings_list = embeddings.tolist()

    return embeddings_list[0]


In [73]:
from tqdm import tqdm

for name in json_data_dict:
    for item in tqdm(json_data_dict[name]):
        item['embeds'] = embed_string(item['embeds'].lower())

100%|█████████████████████████████████████████| 196/196 [07:13<00:00,  2.21s/it]
100%|█████████████████████████████████████████| 162/162 [05:21<00:00,  1.99s/it]
100%|█████████████████████████████████████████| 151/151 [05:00<00:00,  1.99s/it]
100%|█████████████████████████████████████████| 244/244 [08:20<00:00,  2.05s/it]


In [None]:
# for name in json_data_dict:
# #     print(name, json_data_dict[name][0]['embeds'])
#     print("\n")

In [77]:
for name in json_data_dict:
    for item in json_data_dict[name]:
        item['embeds'] = item['embeds'][0]

In [78]:
def update_and_save_json_files(folder_path):
    for name in json_data_dict:
        file_path = os.path.join(folder_path, f"{name}.json")
        with open(file_path, 'w') as file:
            json.dump(json_data_dict[name], file)

# Call the function to update and save the JSON files
update_and_save_json_files(folder_path)