In [1]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Load the sentence-embedding model once at notebook scope; create_embedding()
# below reads this global rather than constructing a model on every call.
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

def create_embedding(texts):
    """Encode one string or a sequence of strings with the global ``model``.

    Args:
        texts: A single ``str`` (wrapped into a one-element list) or a
            sequence of strings that is passed to the model unchanged.

    Returns:
        The result of ``model.encode(..., batch_size=256)`` converted with
        ``.tolist()`` -- one entry per input text, even for a single string.
    """
    batch = [texts] if isinstance(texts, str) else texts
    return model.encode(batch, batch_size=256).tolist()

def _parse_transcript_line(line):
    """Split one ``[timestamp] text`` transcript line into ``(timestamp, text)``.

    Returns ``None`` when the line does not start with ``[`` (ignoring
    surrounding whitespace), has no closing ``]``, or has no text after the
    bracket.
    """
    # Slice the stripped line: the '[' check ignores leading whitespace, so the
    # offsets must too, otherwise '[' ends up inside the timestamp.
    stripped = line.strip()
    if not stripped.startswith('['):
        return None
    closing = stripped.find(']')
    if closing == -1:
        # No closing bracket: find() returns -1, which would otherwise yield a
        # bogus timestamp and embed the whole line (marker included) as text.
        return None
    text = stripped[closing + 1:].strip()
    if not text:
        return None
    return stripped[1:closing], text


# Sorted so chunk_id assignment is reproducible; os.listdir() returns entries
# in arbitrary order.
txt_files = sorted(os.listdir("transcriptions"))
my_dicts = []
chunk_id = 0

for txt_file in txt_files:
    if not txt_file.endswith('.txt'):
        continue

    with open(f"transcriptions/{txt_file}", encoding='utf-8') as f:
        # The first two lines are dropped -- presumably a per-file header;
        # TODO confirm against the transcript format.
        lines = f.readlines()[2:]

    chunks = []
    for line in lines:
        parsed = _parse_transcript_line(line)
        if parsed is None:
            continue
        timestamp, text = parsed
        chunks.append({'timestamp': timestamp, 'text': text})

    if not chunks:
        # Nothing to embed for this file; skip it instead of calling the model
        # with an empty batch.
        continue

    print(f"Creating embeddings for {txt_file}")
    embeddings = create_embedding([c['text'] for c in chunks])

    # Attach bookkeeping fields and the embedding to each chunk; chunk_id is a
    # running counter across all files.
    for chunk, embedding in zip(chunks, embeddings):
        chunk['chunk_id'] = chunk_id
        chunk['file'] = txt_file
        chunk['embedding'] = embedding
        chunk_id += 1
        my_dicts.append(chunk)

# When no chunk was collected, build an empty frame that still has the expected
# columns so the summary below does not raise KeyError.
df = (
    pd.DataFrame.from_records(my_dicts)
    if my_dicts
    else pd.DataFrame(columns=['timestamp', 'text', 'chunk_id', 'file', 'embedding'])
)
df.to_pickle("embeddings.pkl")
df.to_csv("embeddings.csv", index=False)

print(f"\nTotal chunks: {len(df)}")
print(df[['file', 'timestamp', 'text']].head())



Creating embeddings for video1_COMBINED.txt
Creating embeddings for video1_ENGLISH.txt
Creating embeddings for video1_HINDI.txt
Creating embeddings for video1_transcription.txt
Creating embeddings for video2_transcription.txt
Creating embeddings for video3_COMBINED.txt
Creating embeddings for video3_ENGLISH.txt
Creating embeddings for video3_HINDI.txt
Creating embeddings for video3_transcription.txt
Creating embeddings for video4_transcription.txt

Total chunks: 2352
                 file      timestamp  \
0  video1_ENGLISH.txt  40:00 - 40:07   
1  video1_ENGLISH.txt  40:07 - 40:13   
2  video1_ENGLISH.txt  40:13 - 40:24   
3  video1_ENGLISH.txt  40:24 - 40:34   
4  video1_ENGLISH.txt  40:34 - 40:38   

                                                text  
0  count. For that we can use the count method of...  
1  We will call it grade and to create tuple we w...  
2  grades. First C is coming then D is coming. Th...  
3       finally A. We can simply print grade.count .  
4  Save and

In [None]:
import os
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ===============================
# Load Model (ONCE)
# Instantiated at notebook scope so create_embedding() reuses this single
# instance for every file instead of constructing a model per call.
# ===============================
model = SentenceTransformer('BAAI/bge-small-en-v1.5')

def create_embedding(texts):
    """Return embeddings for ``texts`` as nested Python lists.

    Args:
        texts: Either a single ``str`` or a sequence of strings. A lone string
            is wrapped in a one-element list before encoding.

    Returns:
        ``model.encode(..., batch_size=256).tolist()`` -- one entry per input
        text; a single string therefore still yields a one-element outer list.
    """
    if isinstance(texts, str):
        return model.encode([texts], batch_size=256).tolist()
    return model.encode(texts, batch_size=256).tolist()

def detect_language(text):
    """Classify ``text`` as ``'hindi'`` or ``'english'`` by Devanagari share.

    Counts every character in the range U+0900..U+097F and, separately, every
    character for which ``str.isalpha()`` is true. The text is labelled
    ``'hindi'`` when the first count divided by the second exceeds 0.3.

    Note that the two counts use different criteria: the numerator includes
    Devanagari code points that are not letters (vowel signs, digits,
    punctuation), while the denominator counts letters only. The ratio can
    therefore exceed 1, and text with no alphabetic character at all is
    always labelled ``'english'``.

    Args:
        text: The string to classify.

    Returns:
        ``'hindi'`` or ``'english'``.
    """
    devanagari_count = 0
    letter_count = 0
    for ch in text:
        if '\u0900' <= ch <= '\u097F':
            devanagari_count += 1
        if ch.isalpha():
            letter_count += 1

    # No letters at all: avoid dividing by zero and fall back to English.
    if not letter_count:
        return 'english'
    if devanagari_count / letter_count > 0.3:
        return 'hindi'
    return 'english'


def _parse_transcript_line(line):
    """Split one ``[timestamp] text`` transcript line into ``(timestamp, text)``.

    Returns ``None`` when the line does not start with ``[`` (ignoring
    surrounding whitespace), has no closing ``]``, or has no text after the
    bracket.
    """
    # Slice the stripped line: the '[' check ignores leading whitespace, so the
    # offsets must too, otherwise '[' ends up inside the timestamp.
    stripped = line.strip()
    if not stripped.startswith('['):
        return None
    closing = stripped.find(']')
    if closing == -1:
        # No closing bracket: find() returns -1, which would otherwise yield a
        # bogus timestamp and embed the whole line (marker included) as text.
        return None
    text = stripped[closing + 1:].strip()
    if not text:
        return None
    return stripped[1:closing], text


folders = ["transcriptions", "txt","newjsons"]
my_dicts = []
chunk_id = 0

for folder in folders:
    if not os.path.exists(folder):
        print(f"Folder '{folder}' not found, skipping...")
        continue

    # Sorted so chunk_id assignment is reproducible; os.listdir() returns
    # entries in arbitrary order.
    for file in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file)
        chunks = []

        if file.endswith('.json'):
            with open(file_path, encoding='utf-8') as f:
                data = json.load(f)

            # Expects a top-level object with a 'chunks' list whose entries
            # carry 'text', 'start', 'end', 'number' and 'title'.
            # NOTE(review): 'start'/'end' must be numeric for the :.2f format
            # below -- confirm against the JSON producer.
            for entry in data.get('chunks', []):
                text = entry.get('text', '').strip()
                if text:
                    chunks.append({
                        'timestamp': f"{entry.get('start', 0):.2f} - {entry.get('end', 0):.2f}",
                        'text': text,
                        'number': entry.get('number', ''),
                        'title': entry.get('title', '')
                    })

        elif file.endswith('.txt'):
            with open(file_path, encoding='utf-8') as f:
                # The first two lines are dropped -- presumably a per-file
                # header; TODO confirm against the transcript format.
                lines = f.readlines()[2:]

            for line in lines:
                parsed = _parse_transcript_line(line)
                if parsed is None:
                    continue
                timestamp, text = parsed
                chunks.append({
                    'timestamp': timestamp,
                    'text': text,
                    'number': '',
                    'title': ''
                })

        # Unsupported extensions and files without usable text are skipped.
        if not chunks:
            continue

        print(f"Creating embeddings for {file}")
        embeddings = create_embedding([c['text'] for c in chunks])

        # Attach bookkeeping fields, the detected language and the embedding;
        # chunk_id is a running counter across all folders and files.
        for chunk, embedding in zip(chunks, embeddings):
            chunk['chunk_id'] = chunk_id
            chunk['file'] = file
            chunk['language'] = detect_language(chunk['text'])
            chunk['embedding'] = embedding
            my_dicts.append(chunk)
            chunk_id += 1

# Missing folders are skipped above, so it is possible to reach this point with
# no chunks at all. Build an empty frame that still has the expected columns so
# the summary below does not raise KeyError on 'language'.
df = (
    pd.DataFrame.from_records(my_dicts)
    if my_dicts
    else pd.DataFrame(columns=[
        'timestamp', 'text', 'number', 'title',
        'chunk_id', 'file', 'language', 'embedding',
    ])
)

df.to_pickle("embeddings.pkl")
df.to_csv("embeddings.csv", index=False)

print(f"\nTotal chunks: {len(df)}")
print(f"Languages: {df['language'].value_counts().to_dict()}")
print(df[['file', 'language', 'timestamp', 'text']].head())


# Second copy of the same, unmodified frame under the *_raw names.
df.to_pickle("embeddings_raw.pkl")
df.to_csv("embeddings_raw.csv", index=False)

Creating embeddings for video1_ENGLISH.txt
Creating embeddings for video1_HINDI.txt
Creating embeddings for video1_transcription.txt


KeyboardInterrupt: 