In [1]:
import json
import weaviate
import os
import pathlib
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# get the weaviate client
client = weaviate.Client("http://localhost:8080")  # Replace the URL with that of your Weaviate instance

client.schema.get() 
client.is_ready()

True

In [3]:
DATA_PATH = os.path.join(os.getcwd(), "Data", "output")
os.path.exists(DATA_PATH)

True

In [4]:
TRANSCRIPTS = [item for item in os.listdir(DATA_PATH) if ".json" in item]
print(len(TRANSCRIPTS), TRANSCRIPTS)

11 ['Lecture 3_ Editors (vim) (2020).mp3.json', 'Lecture 1_ Course Overview + The Shell (2020).mp3.json', 'Lecture 6_ Version Control (git) (2020).mp3.json', 'Lecture 5_ Command-line Environment (2020).mp3.json', 'Lecture 4_ Data Wrangling (2020).mp3.json', 'Lecture 8_ Metaprogramming (2020).mp3.json', 'Lecture 10_ Potpourri (2020).mp3.json', 'Lecture 9_ Security and Cryptography (2020).mp3.json', 'Lecture 7_ Debugging and Profiling (2020).mp3.json', 'Lecture 11_ Q_A (2020).mp3.json', 'Lecture 2_ Shell Tools and Scripting (2020).mp3.json']


In [86]:
with open(os.path.join(DATA_PATH, TRANSCRIPTS[0]), 'r') as f:
    content = json.load(f)


In [87]:
content['segments']

[{'start': 0.379,
  'end': 0.841,
  'text': ' Ah, OK.',
  'words': [{'word': 'Ah,', 'start': 0.379, 'end': 0.459, 'score': 0.706},
   {'word': 'OK.', 'start': 0.761, 'end': 0.841, 'score': 0.273}],
  'id': '0_Lecture 3: Editors (vim) (2020).mp3'},
 {'start': 1.675,
  'end': 2.455,
  'text': ' Cool.',
  'words': [{'word': 'Cool.', 'start': 1.675, 'end': 1.855, 'score': 0.444}],
  'id': '1_Lecture 3: Editors (vim) (2020).mp3'},
 {'start': 2.455,
  'end': 6.818,
  'text': 'So welcome to the third lecture of the missing semester of your CS education.',
  'words': [{'word': 'So', 'start': 2.455, 'end': 2.595, 'score': 0.962},
   {'word': 'welcome', 'start': 2.676, 'end': 2.976, 'score': 0.779},
   {'word': 'to', 'start': 2.996, 'end': 3.076, 'score': 0.514},
   {'word': 'the', 'start': 3.096, 'end': 3.176, 'score': 0.815},
   {'word': 'third', 'start': 3.216, 'end': 3.416, 'score': 0.82},
   {'word': 'lecture', 'start': 3.476, 'end': 3.836, 'score': 0.768},
   {'word': 'of', 'start': 4.016,

In [8]:
# load model
model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v4_MiniLM-L6') # base Marqo uses this

Downloading: 100%|██████████| 737/737 [00:00<00:00, 307kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 61.8kB/s]
Downloading: 100%|██████████| 7.34k/7.34k [00:00<00:00, 2.56MB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 205kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 61.8kB/s]
Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 198kB/s]
Downloading: 100%|██████████| 90.9M/90.9M [01:01<00:00, 1.49MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 18.6kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 34.2kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 763kB/s] 
Downloading: 100%|██████████| 535/535 [00:00<00:00, 289kB/s]
Downloading: 100%|██████████| 13.2k/13.2k [00:00<00:00, 3.50MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 498kB/s] 
Downloading: 100%|██████████| 349/349 [00:00<00:00, 28.6kB/s]


In [19]:
text = "Replace me by any text you'd like."
text_embbedding = model.encode(text)
print(text_embbedding[0:5])

[-0.07001531  0.06014673  0.1025539   0.02994502  0.06606302]




In [23]:
model_2 = SentenceTransformer("./model")
model_2.encode(text)[0:5]

array([-0.07001531,  0.06014673,  0.1025539 ,  0.02994502,  0.06606302],
      dtype=float32)

In [None]:

# def save_model(model_name: str):
#     """Loads any model from Hugginface model hub and saves it to disk."""
#     model = SentenceTransformer(model_name)
#     model.save("./model")
# model_name = 'flax-sentence-embeddings/all_datasets_v4_MiniLM-L6'
# save_model(model_name)


Create WEAVIATE schema

In [124]:
transcription_schema = {
    "classes": [{
            # name of the class
            "class": "Transcriptions",
            "vectorIndexType":"hnsw",
            "vectorizer": "none",

            # description of what the class represents
            "description": "Transcription class to store transcriptions of youtube videos",

            # class properties
            "properties": [
                {
                    "name": "title",
                    "dataType": ["text"],
                    "description": "title of the youtube video"
                },
                {
                    "name": "text",
                    "dataType": ["text"],
                    "description": "transcribed text of the youtube video"
                },
                {
                    "name": "start_time",
                    "dataType": ["number"],
                    "description": "start time of speech"
                },
                                    {
                    "name": "end_time",
                    "dataType": ["number"],
                    "description": "end time of speech"
                }
            ]
}]
}

In [125]:
client.get_meta()

{'hostname': 'http://[::]:8080', 'modules': {}, 'version': '1.20.2'}

In [126]:
 # cleanup from previous runs
client.schema.delete_all()

In [127]:
client.schema.create(transcription_schema)

In [106]:
client.schema.get()

{'classes': [{'class': 'Transcriptions',
   'description': 'Transcription class to store transcriptions of youtube videos',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'title of the youtube video',
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'title',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': 'transcribed text of the youtube video',
     'indexFilterable': True,
     'indexSearchable': True,
     'name': 'text',
     'tokenization': 'word'},
    {'dataType': ['number'],
     'description': 'start time of speech',
     'indexFilterable': True,
     'indexSearchable': False,
     'name': 'start_time'},
    {'dataType': ['number'],
     'description': 'end time of speech',
     'indexFilterable': True,


In [74]:
# Go through each JSON object
# get the text
# vectorize the text
# append the vector back to the object
# load into weaviate
import requests
# curl -X POST http://127.0.0.1:8200/embed -H "Content-Type: application/json" -d '{"transcription_document":"hello world"}'
embedding_api = "http://127.0.0.1:8200/embed"


In [89]:
import multiprocessing as mp
mp.cpu_count()

In [137]:
def process_item(item:dict) -> list:
    payload = {"transcription_document": item['text']}
    res = requests.post(embedding_api, json=payload, headers=headers)
    item['vector'] = res.json()['vectors']
    return item

In [92]:
num_processes = 8
with mp.Pool(processes=num_processes) as pool:
    updated_content = pool.map(process_item, content['segments'])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [133]:
updated_content[0].keys()

dict_keys(['start', 'end', 'text', 'words', 'id', 'vector'])

Uncomment below to run batch vector embedding

In [None]:
# #go through each document and create the embedding
# num_processes = 8

# with open(os.path.join(DATA_PATH, TRANSCRIPTS[0]), 'r') as f:
#     content = json.load(f)
#     with mp.Pool(processes=num_processes) as pool:
#         updated_content = pool.map(process_item, content['segments'])


UPSERT TO WEAVIATE

In [128]:
with open('schema.json', 'w') as outfile: 
    json.dump(transcription_schema, outfile)

In [129]:
# remove current schema from Weaviate, removes all the data too
client.schema.delete_all()
# import schema using file path
client.schema.create('schema.json')
# print schema
print(json.dumps(client.schema.get(), indent=2))

{
  "classes": [
    {
      "class": "Transcriptions",
      "description": "Transcription class to store transcriptions of youtube videos",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "multiTenancyConfig": {
        "enabled": false
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "description": "title of the youtube video",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "title",
          "tokenization": "word"
        },
        {
          "dataType": [
            "text"
          ],
          "description": "transcribed text of the youtube video",
          "indexFilterable": true,
          "indexSearchable": true,
          "name": "text",
          "tokeniza

In [166]:
from weaviate.batch import Batch # for the typing purposes
from weaviate.util import generate_uuid5 # old way was from weaviate.tools import generate_uuid

def add_transcription(batch: Batch, transcription_data: dict) -> str:
    # Create the transcription object
    transcription_object = {
        'title': transcription_data['id'],  # Assuming 'id' in the data corresponds to the title
        'text': transcription_data['text'],
        'start_time': transcription_data['start'],
        'end_time': transcription_data['end']
    }

    # Generate a unique UUID for the transcription
    transcription_id = generate_uuid5(transcription_data['id'])

    # Add the transcription object to the batch request
    batch.add_data_object(
        data_object=transcription_object,
        class_name='Transcriptions',
        uuid=transcription_id,
        vector=transcription_data['vector']
    )

    return transcription_id

In [167]:
from tqdm import trange
data = updated_content
with client.batch as batch:
    for i in trange(1, 100):
        article_id = add_transcription(batch, data[i])
        # submit the objects from the batch to weaviate
        batch.create_objects()
        # submit the reference from the batch to weaviate
        batch.create_references()

            multi-threading. Setting `batch_size` in `client.batch.configure()`  to an int value will enabled automatic
            batching. See:
            https://weaviate.io/developers/weaviate/current/restful-api-references/batch.html#example-request-1
  stacklevel=1,
100%|██████████| 99/99 [00:00<00:00, 176.43it/s]


In [171]:
class questionObj():
    def __init__(self, query:str):
        self.query = {"text":query}


In [192]:
query = "what is vim?"
question_obj = questionObj(query)
query_vector = process_item(question_obj.query)['vector']

In [194]:
result = client.query.get(
        "Transcriptions", ["text", "start_time","end_time"]
    ).with_near_vector({ # takes a dictionary of the query vector
        "vector": query_vector, 
        "certainty":0.7 # Threshold
    }
    ).with_limit(2).with_additional(['certainty']).do() # Additional parameters of probability returned
print(json.dumps(result, indent=4))

{
    "data": {
        "Get": {
            "Transcriptions": [
                {
                    "_additional": {
                        "certainty": 0.8640323281288147
                    },
                    "end_time": 87.508,
                    "start_time": 81.207,
                    "text": "And when you're learning a sophisticated tool, so today we're going to teach you Vim, which is one powerful editor that a lot of programmers use."
                },
                {
                    "_additional": {
                        "certainty": 0.8424165546894073
                    },
                    "end_time": 293.888,
                    "start_time": 289.025,
                    "text": " Okay, so one of the really cool ideas behind Vim is that Vim is a modal editor."
                }
            ]
        }
    }
}
