In [24]:
import json
import os
from typing import List,Dict,Any
from langchain_core.documents import Document

In [5]:
os.makedirs('data/json',exist_ok=True)

In [6]:
json_data = {
  "company": "TechCorp",
  "employees": [
    {
      "id": 1,
      "name": "John Doe",
      "role": "Software Engineer",
      "skills": [
        "Python",
        "JavaScript",
        "React"
      ],
      "projects": [
        {
          "name": "RAG System",
          "status": "In Progress"
        },
        {
          "name": "Data Pipeline",
          "status": "Completed"
        }
      ]
    },
    {
      "id": 2,
      "name": "Jane Smith",
      "role": "Data Scientist",
      "skills": [
        "Python",
        "Machine Learning",
        "SQL"
      ],
      "projects": [
        {
          "name": "ML Model",
          "status": "In Progress"
        },
        {
          "name": "Analytics Dashboard",
          "status": "Planning"
        }
      ]
    }
  ],
  "departments": {
    "engineering": {
      "head": "Mike Johnson",
      "budget": 1000000,
      "team_size": 25
    },
    "data_science": {
      "head": "Sarah Williams",
      "budget": 750000,
      "team_size": 15
    }
  }
}

In [7]:
with open('data/json/data.json','w') as f:
  json.dump(json_data,f,indent=2)

In [8]:
jsonl_data = [
    {"timestamp": "2024-01-01", "event": "user_login", "user_id": 123},
    {"timestamp": "2024-01-01", "event": "page_view", "user_id": 123, "page": "/home"},
    {"timestamp": "2024-01-01", "event": "purchase", "user_id": 123, "amount": 99.99}
]

with open('data/json/events.jsonl','w') as f:
    for data in jsonl_data:
        f.write(json.dumps(data))
        f.write('\n')

In [20]:
from langchain_community.document_loaders import JSONLoader
# method-1
employee_loader=JSONLoader(
    file_path='data/json/data.json',
    jq_schema='.employees[]',
    text_content=False
)
employee_docs = employee_loader.load()

for i in range(len(employee_docs)):
    print(employee_docs[i].page_content)

{"id": 1, "name": "John Doe", "role": "Software Engineer", "skills": ["Python", "JavaScript", "React"], "projects": [{"name": "RAG System", "status": "In Progress"}, {"name": "Data Pipeline", "status": "Completed"}]}
{"id": 2, "name": "Jane Smith", "role": "Data Scientist", "skills": ["Python", "Machine Learning", "SQL"], "projects": [{"name": "ML Model", "status": "In Progress"}, {"name": "Analytics Dashboard", "status": "Planning"}]}


In [37]:
def process_json_intelligently(file_path:str):
    documents=[]
    with open(file_path,'r') as file:
        json_data_format=json.load(file)

    for emp in json_data_format.get('employees',[]):
        content = f"""Employee Profile:
            Name: {emp['name']}
            Role: {emp['role']}
            Skills: ({' '.join(emp['skills'])}
            Projects: """
        for project in json_data_format.get('projects',[]):
            content += f"\n Project: {project['name']}(Status: {project['status']})"
            """
        """
        doc = Document(
            page_content=content,
            metadata={
                'source':file_path,
                'format':'json',
                'data_type':'employee_profile',
                'employee':emp['id'],
                'name':emp['name'],
                'role':emp['role']
            }
        )
        documents.append(doc)
    return documents


In [39]:
print(process_json_intelligently(file_path='data/json/data.json'))

[Document(metadata={'source': 'data/json/data.json', 'format': 'json', 'data_type': 'employee_profile', 'employee': 1, 'name': 'John Doe', 'role': 'Software Engineer'}, page_content='Employee Profile:\n            Name: John Doe\n            Role: Software Engineer\n            Skills: (Python JavaScript React\n            Projects: '), Document(metadata={'source': 'data/json/data.json', 'format': 'json', 'data_type': 'employee_profile', 'employee': 2, 'name': 'Jane Smith', 'role': 'Data Scientist'}, page_content='Employee Profile:\n            Name: Jane Smith\n            Role: Data Scientist\n            Skills: (Python Machine Learning SQL\n            Projects: ')]
