# Import JSON data 


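## JSON Schema

The input file is a JSON array of records shaped like the sample below, each with `prompt`, `completion`, and `id` string fields (the long text values in this sample are truncated). The import script reads only `prompt` and `completion`; `id` is not imported.

```json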
[
  {
    "prompt": "Answer as if you were the medical director of an anesthesia service. Develop a preoperatively testing algorithm reported in a chart format. Select specific tests that are required for each test based on both particular medical diagnoses as well as by the type of operation is planned.",
    "completion": "Preoperative Testing Algorithm  Patient's Medical Diagnosis:   Type of Operation:   Recommended Testing:  | Test | Medical Diagnosis | Type of Operation | | --- | --- | --- | | Complete Blood Count (CBC) | All | All | | Coagulation Panel | Bleeding disorders, liver disease | All major surgeries, neurosurgery, orthopedic surgeries | | Electrol",
    "id": "RzBNjvK"
  },
  {
    "prompt": "Web search results:  [1] \"The deposition efficiency (DE) was calculated by measuring the final coating mass after spraying and by dividing it over the total powder mass sprayed onto the sample without overspray. Microstructural investigations of the MG coatings were carried out by scanning electron microscopy (SEM) on the cross-section of the samples.\" URL: https://www.sciencedirect.com/science/article/pii/S1359645416309430  [2] \"The deposition mechanisms of metallic glass particles impacting a ",
    "completion": "Metallic glasses (MGs) are an interesting class of materials for their unique properties and potential industrial applications. Deposition of MGs on different substrates has been studied using various methods including cold gas spraying (CGS). Here are some key findings from the web search results regarding the deposition mechanism of metallic glass on different substrates:  * The deposition efficiency (DE) of MG coatings can be calculated by measuring the final coating mass after spraying and d",
    "id": "Hhq5zTB"
  }
]
```

## Weaviate Schema

```python
class_obj = {
    "class": "DriftBenchmark",
    "vectorizer": "text2vec-huggingface",  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    "moduleConfig": {
        "text2vec-huggingface": {
            "model": "sentence-transformers/all-MiniLM-L6-v2",  # Can be any public or private Hugging Face model.
            "options": {
                "waitForModel": True
            }
        }
    }
}

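# NEW SCHEMA (text2vec-transformers)
# The single class entry below is the same definition that the import script
# passes to client.schema.create_class().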

{
    "classes": [
        {
            "class": "DriftBenchmark",
              "invertedIndexConfig": {"bm25": {"b": 0.75, "k1": 1.2},
                                      "cleanupIntervalSeconds": 60,
                                      "stopwords": {"additions": None,
                                                    "preset": "en",
                                                    "removals": None}},
              "moduleConfig": {"text2vec-transformers": {"model": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
                                                         "options": {"waitForModel": False},
                                                         "poolingStrategy": "masked_mean",
                                                         "vectorizeClassName": False}},
              "multiTenancyConfig": {"enabled": False},
            "properties": [{"dataType": ["text"],
                              "description": "The text sent to the API.",
                              "indexFilterable": True,
                              "indexSearchable": True,
                              "moduleConfig": {"text2vec-transformers": {"skip": True,
                                                                         "vectorizePropertyName": False}},
                              "name": "prompt",
                              "tokenization": "word"},
                             {"dataType": ["text"],
                              "description": "This is the value returned by the API and used to test similarity",
                              "indexFilterable": True,
                              "indexSearchable": True,
                              "moduleConfig": {"text2vec-transformers": {"skip": False,
                                                                         "vectorizePropertyName": False}},
                              "name": "completion",
                              "tokenization": "word"},
                             {"dataType": ["date"],
                              "description": "Just the date_time that the record was made.",
                              "indexFilterable": True,
                              "indexSearchable": False,
                              "moduleConfig": {"text2vec-transformers": {"skip": True,
                                                                         "vectorizePropertyName": False}},
                              "name": "date_time"},
                             {"dataType": ["text"],
                              "description": "A v4 UUID",
                              "indexFilterable": True,
                              "indexSearchable": True,
                              "moduleConfig": {"text2vec-transformers": {"skip": True,
                                                                         "vectorizePropertyName": False}},
                              "name": "chat_id",
                              "tokenization": "word"}],
            "vectorizer": "text2vec-transformers"
        }
    ]
}
```

In [None]:
import weaviate
import json
import uuid
from datetime import datetime, timezone
# Timestamp written into the "date_time" property of every imported record:
# the current instant, taken in UTC, converted to the machine's local
# timezone, and rendered as an ISO 8601 string with its UTC offset.
now_utc = datetime.now(tz=timezone.utc)
local_time = now_utc.astimezone().isoformat()

print("Let's a go!")

# Address of the Weaviate instance this script talks to.
weaviate_url = "http://localhost:8080"
print("Connecting to weaviate instance on localhost:8080...")
client = weaviate.Client(weaviate_url)
print("Client created")

# NOTE(review): delete_all() is not scoped to the DriftBenchmark class --
# presumably it wipes every class in the instance; confirm before pointing
# this script at a shared Weaviate instance.
print("Deleting all previous schemas")
client.schema.delete_all()

print("Creating new schema")

# ===== add schema =====
# Definition of the "DriftBenchmark" class: BM25 inverted-index settings, the
# text2vec-transformers vectorizer configuration, and four properties
# (prompt, completion, date_time, chat_id). Among the properties only
# "completion" has "skip" set to False in its vectorizer module config.
class_obj = {
    "class": "DriftBenchmark",
    "invertedIndexConfig": {
        "bm25": {"b": 0.75, "k1": 1.2},
        "cleanupIntervalSeconds": 60,
        "stopwords": {
            "additions": None,
            "preset": "en",
            "removals": None,
        },
    },
    "moduleConfig": {
        "text2vec-transformers": {
            "model": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
            "options": {"waitForModel": False},
            "poolingStrategy": "masked_mean",
            "vectorizeClassName": False,
        },
    },
    "multiTenancyConfig": {"enabled": False},
    "properties": [
        {
            "dataType": ["text"],
            "description": "The text sent to the API.",
            "indexFilterable": True,
            "indexSearchable": True,
            "moduleConfig": {
                "text2vec-transformers": {
                    "skip": True,
                    "vectorizePropertyName": False,
                },
            },
            "name": "prompt",
            "tokenization": "word",
        },
        {
            "dataType": ["text"],
            "description": "This is the value returned by the API and used to test similarity",
            "indexFilterable": True,
            "indexSearchable": True,
            "moduleConfig": {
                "text2vec-transformers": {
                    "skip": False,
                    "vectorizePropertyName": False,
                },
            },
            "name": "completion",
            "tokenization": "word",
        },
        {
            # Date property: not searchable and carries no "tokenization" key.
            "dataType": ["date"],
            "description": "Just the date_time that the record was made.",
            "indexFilterable": True,
            "indexSearchable": False,
            "moduleConfig": {
                "text2vec-transformers": {
                    "skip": True,
                    "vectorizePropertyName": False,
                },
            },
            "name": "date_time",
        },
        {
            "dataType": ["text"],
            "description": "A v4 UUID",
            "indexFilterable": True,
            "indexSearchable": True,
            "moduleConfig": {
                "text2vec-transformers": {
                    "skip": True,
                    "vectorizePropertyName": False,
                },
            },
            "name": "chat_id",
            "tokenization": "word",
        },
    ],
    "vectorizer": "text2vec-transformers",
}

# Create the class described by class_obj on the connected Weaviate instance.
client.schema.create_class(class_obj)

# ===== import data =====
# Alternative data source (disabled): download the small Weaviate quickstart
# sample over HTTP instead of reading the local dump.
# import requests
# url = 'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json'
# resp = requests.get(url)
# testData = json.loads(resp.text)

# Load the records from the local JSON dump into testData (the import loop
# indexes each record by "prompt" and "completion").
# The file is opened in binary mode so json.load() detects the encoding
# (UTF-8/16/32) itself instead of decoding with the platform default code
# page, and the `with` block closes the handle even if parsing fails.
with open('c:\\data\\dev\\chats_50MB_clean_fmt.json', 'rb') as testJson:
    testData = json.load(testJson)

# Import every record through Weaviate's batching client, 100 objects per
# batch. Objects are queued on the `batch` handle bound by the `with`
# statement.
with client.batch(batch_size=100) as batch:
    for record in testData:
        batch.add_data_object(
            {
                "prompt": record["prompt"],
                "completion": record["completion"],
                # Same timestamp for every record in this run.
                "date_time": local_time,
                # Plain UUID text. json.dumps(..., default=str) would store
                # the value wrapped in literal double quotes.
                "chat_id": str(uuid.uuid4()),
            },
            "DriftBenchmark",
        )

print("Completed import.")