# Chunking for Video Script

In [11]:
import os
os.getcwd()
print(os.listdir())


['EmbeddingAssignment.ipynb', 'Labs', '.ipynb_checkpoints', 'api-keys', 'test_script', 'Video_Script_Chunking.ipynb']


In [15]:
import boto3
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.embeddings.bedrock import BedrockEmbeddings
from langchain.schema import Document
from dotenv import load_dotenv
from langchain_community.chat_models import BedrockChat
import os

load_dotenv()
aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region = os.getenv("AWS_DEFAULT_REGION")

# --------- 1. Setup AWS Credentials ---------
boto3_session = boto3.Session(
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    region_name=aws_region  # or your Bedrock-supported region
)

# --------- 2. Load and Preprocess Your Script File ---------
# If your uploaded .ipynb is misnamed and actually a .csv or text:
try:
    df = pd.read_csv("test_script", engine="python")
    all_text = " ".join(df.astype(str).values.flatten())
except Exception:
    with open("test_script", "r") as f:
        all_text = f.read()

# --------- 3. Chunk the Text ---------
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_text(all_text)

# --------- 4. Titan Embeddings from Bedrock ---------
embeddings = BedrockEmbeddings(
    client=boto3_session.client("bedrock-runtime"),
    model_id="amazon.titan-embed-text-v1"
)

# --------- 5. Store in FAISS ---------
documents = [Document(page_content=chunk) for chunk in chunks]
vectorstore = FAISS.from_documents(documents, embedding=embeddings)

# --------- 6. Claude LLM from Bedrock ---------
llm = BedrockChat(
    client=boto3_session.client("bedrock-runtime"),
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    model_kwargs={"temperature": 0.2, "max_tokens": 1024}
)

# --------- 7. Create RetrievalQA Chain ---------
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# --------- 8. Ask a Question ---------
question = "What does the video say about prompting best practices?"
result = qa_chain({"query": question})

# --------- 9. Display Result ---------
print("\nAnswer:\n", result["result"])
print("\nRelevant Script Sections:\n")
for doc in result["source_documents"]:
    print(doc.page_content)


  llm = BedrockChat(
  result = qa_chain({"query": question})



Answer:
 The video does not mention anything about prompting best practices. It is focused on providing tips for launching a dropshipping product, such as finding a product that solves a problem or follows a trend, using tools like Google Trends to identify popular topics, finding reliable suppliers, and ordering product samples. There is no discussion about prompting best practices in this context.

Relevant Script Sections:

Hey everyone, welcome back! In today’s video, I’m going to walk you through exactly how to launch your very first dropshipping product — step by step, no fluff.
Let’s start with product selection. The first thing you need to do is find a product that actually solves a real problem or hits on a trend. I like to use tools like Google Trends or even scroll through TikTok and Instagram to see what’s getting a lot of engagement. You want something people are already talking about — it makes your marketing job ten times easier.
So there you have it — that’s the basic 

In [16]:
def read_file(file_path):
    try:
        with open(file_path, 'r') as file:
            content = file.read()
            return content
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [17]:
import spacy

def fixed_size_chunking(text, chunk_size, overlap):
    nlp = spacy.load("en_core_web_md")
    doc = nlp(text)
    tokens = [token.text for token in doc]

    chunks = []
    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunk = tokens[start:end]
        chunks.append(" ".join(chunk))
        start += chunk_size - overlap  # move start forward with overlap

    return chunks

In [18]:
def semantic_embedding_chunk(text, threshold):
    """
    Splits text into semantic chunks using sentence embeddings.
    Uses spaCy for sentence segmentation and SentenceTransformer for generating embeddings.

    :param text: The full text to chunk.
    :param threshold: Cosine similarity threshold for adding a sentence to the current chunk.
    :return: A list of semantic chunks (each as a string).
    """
    # Sentence segmentation
    #doc = nlp(text)
    #sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    sentences = fixed_size_chunking(text, 100, 10)

    chunks = []
    current_chunk_sentences = []
    current_chunk_embedding = None

    for sentence in sentences:
        # Generate embedding for the current sentence
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)

        # If starting a new chunk, initialize it with the current sentence
        if current_chunk_embedding is None:
            current_chunk_sentences = [sentence]
            current_chunk_embedding = sentence_embedding
        else:
            # Compute cosine similarity between current sentence and the chunk embedding
            sim_score = util.cos_sim(sentence_embedding, current_chunk_embedding)
            if sim_score.item() >= threshold:
                # Add sentence to the current chunk and update the chunk's average embedding
                current_chunk_sentences.append(sentence)
                num_sents = len(current_chunk_sentences)
                current_chunk_embedding = ((current_chunk_embedding * (num_sents - 1)) + sentence_embedding) / num_sents
            else:
                # Finalize the current chunk and start a new one
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = [sentence]
                current_chunk_embedding = sentence_embedding

    # Append the final chunk if it exists
    if current_chunk_sentences:
        chunks.append(" ".join(current_chunk_sentences))

    return chunks

In [19]:
def semantic_embedding_chunk(text, threshold):
    """
    Splits text into semantic chunks using sentence embeddings.
    Uses spaCy for sentence segmentation and SentenceTransformer for generating embeddings.

    :param text: The full text to chunk.
    :param threshold: Cosine similarity threshold for adding a sentence to the current chunk.
    :return: A list of semantic chunks (each as a string).
    """
    # Sentence segmentation
    #doc = nlp(text)
    #sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    sentences = fixed_size_chunking(text, 100, 10)

    chunks = []
    current_chunk_sentences = []
    current_chunk_embedding = None

    for sentence in sentences:
        # Generate embedding for the current sentence
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)

        # If starting a new chunk, initialize it with the current sentence
        if current_chunk_embedding is None:
            current_chunk_sentences = [sentence]
            current_chunk_embedding = sentence_embedding
        else:
            # Compute cosine similarity between current sentence and the chunk embedding
            sim_score = util.cos_sim(sentence_embedding, current_chunk_embedding)
            if sim_score.item() >= threshold:
                # Add sentence to the current chunk and update the chunk's average embedding
                current_chunk_sentences.append(sentence)
                num_sents = len(current_chunk_sentences)
                current_chunk_embedding = ((current_chunk_embedding * (num_sents - 1)) + sentence_embedding) / num_sents
            else:
                # Finalize the current chunk and start a new one
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = [sentence]
                current_chunk_embedding = sentence_embedding

    # Append the final chunk if it exists
    if current_chunk_sentences:
        chunks.append(" ".join(current_chunk_sentences))

    return chunks

In [22]:
import spacy
from sentence_transformers import SentenceTransformer, util

file_path = "./test_script"
home_care_content = read_file(file_path)

nlp = spacy.load("en_core_web_md")
model = SentenceTransformer("all-MiniLM-L6-v2")

semantic_chunks = semantic_embedding_chunk(home_care_content, threshold=0.5)
for i, chunk in enumerate(semantic_chunks):
    print(f"Chunk {i+1}:\n{chunk}\n{'-'*60}")

Chunk 1:
Hey everyone , welcome back ! In today ’s video , I ’m going to walk you through exactly how to launch your very first dropshipping product — step by step , no fluff . 

 Let ’s start with product selection . The first thing you need to do is find a product that actually solves a real problem or hits on a trend . I like to use tools like Google Trends or even scroll through TikTok and Instagram to see what ’s getting a lot of engagement . You want something people are already talking about — . You want something people are already talking about — it makes your marketing job ten times easier . 

 Once you ’ve picked a potential product , it ’s time to find a reliable supplier . Most people head straight to AliExpress , which is fine to start , but I also recommend checking platforms like Spocket or Zendrop because they often offer faster shipping and better communication . Always , always order a sample — you want to know exactly what your customer is getting . 

 Now let ’s ta