# Chunking for Video Script

In [1]:
import os
os.getcwd()
print(os.listdir())


['ttest.py', 'Video_Script_Chunking.ipynb', 'requiremnets.txt', 'discordQASummerizer.ipynb', 'README.md', '.gitignore', 'discordQASummerizer.py', '.env', '.git', 'CHAPTER 8 - Organic (MIKE).txt']


In [3]:
import boto3
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.embeddings.bedrock import BedrockEmbeddings
from langchain.schema import Document
from dotenv import load_dotenv
from langchain_community.chat_models import BedrockChat
import os

load_dotenv()
aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region = os.getenv("AWS_DEFAULT_REGION")

# --------- 1. Setup AWS Credentials ---------
boto3_session = boto3.Session(
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_key,
    region_name=aws_region  # or your Bedrock-supported region
)

# --------- 2. Load and Preprocess Your Script File ---------
# If your uploaded .ipynb is misnamed and actually a .csv or text:
try:
    df = pd.read_csv("CHAPTER 8 - Organic (MIKE).txt", engine="python")
    all_text = " ".join(df.astype(str).values.flatten())
except Exception:
    with open("CHAPTER 8 - Organic (MIKE).txt", "r") as f:
        all_text = f.read()

# --------- 3. Chunk the Text ---------
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_text(all_text)

# --------- 4. Titan Embeddings from Bedrock ---------
embeddings = BedrockEmbeddings(
    client=boto3_session.client("bedrock-runtime"),
    model_id="amazon.titan-embed-text-v1"
)

# --------- 5. Store in FAISS ---------
documents = [Document(page_content=chunk) for chunk in chunks]
vectorstore = FAISS.from_documents(documents, embedding=embeddings)

# --------- 6. Claude LLM from Bedrock ---------
llm = BedrockChat(
    client=boto3_session.client("bedrock-runtime"),
    model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    model_kwargs={"temperature": 0.2, "max_tokens": 1024}
)

# --------- 7. Create RetrievalQA Chain ---------
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# --------- 8. Ask a Question ---------
question = "What makes a viral video convert?"
result = qa_chain({"query": question})

# --------- 9. Display Result ---------
print("\nAnswer:\n", result["result"])
print("\nRelevant Script Sections:\n")
for doc in result["source_documents"]:
    print(doc.page_content)


  embeddings = BedrockEmbeddings(
  llm = BedrockChat(
  result = qa_chain({"query": question})



Answer:
 Based on the context provided, it seems that for a viral video to "convert" (presumably meaning to lead to some desired action like making a purchase or signing up), the key is to evoke an emotional reaction from the viewer.

The examples mention that viral videos often strike an emotional chord, whether that's making people laugh, feel angry/outraged, confused, etc. Evoking those strong emotions gets viewers engaged and compels them to like, comment, share the video, and potentially take a next step like buying a product or service.

However, the context doesn't provide specific details on what types of emotional responses are most effective for driving conversions versus just getting views and engagement. It likely depends on the specific goal and call-to-action of the video campaign.

Relevant Script Sections:

Examples of viral videos that don't convert and do convert
of how organic works and where we will be posting it's time to talk about what actually makes a video go 

In [4]:
def read_file(file_path):
    try:
        with open(file_path, 'r') as file:
            content = file.read()
            return content
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [5]:
import spacy

def fixed_size_chunking(text, chunk_size, overlap):
    nlp = spacy.load("en_core_web_md")
    doc = nlp(text)
    tokens = [token.text for token in doc]

    chunks = []
    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunk = tokens[start:end]
        chunks.append(" ".join(chunk))
        start += chunk_size - overlap  # move start forward with overlap

    return chunks

In [10]:
def semantic_embedding_chunk(text, threshold):
    """
    Splits text into semantic chunks using sentence embeddings.
    Uses spaCy for sentence segmentation and SentenceTransformer for generating embeddings.

    :param text: The full text to chunk.
    :param threshold: Cosine similarity threshold for adding a sentence to the current chunk.
    :return: A list of semantic chunks (each as a string).
    """
    # Sentence segmentation
    #doc = nlp(text)
    #sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    sentences = fixed_size_chunking(text, 100, 10)

    chunks = []
    current_chunk_sentences = []
    current_chunk_embedding = None

    for sentence in sentences:
        # Generate embedding for the current sentence
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)

        # If starting a new chunk, initialize it with the current sentence
        if current_chunk_embedding is None:
            current_chunk_sentences = [sentence]
            current_chunk_embedding = sentence_embedding
        else:
            # Compute cosine similarity between current sentence and the chunk embedding
            sim_score = util.cos_sim(sentence_embedding, current_chunk_embedding)
            if sim_score.item() >= threshold:
                # Add sentence to the current chunk and update the chunk's average embedding
                current_chunk_sentences.append(sentence)
                num_sents = len(current_chunk_sentences)
                current_chunk_embedding = ((current_chunk_embedding * (num_sents - 1)) + sentence_embedding) / num_sents
            else:
                # Finalize the current chunk and start a new one
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = [sentence]
                current_chunk_embedding = sentence_embedding

    # Append the final chunk if it exists
    if current_chunk_sentences:
        chunks.append(" ".join(current_chunk_sentences))

    return chunks

In [11]:
def semantic_embedding_chunk(text, threshold):
    """
    Splits text into semantic chunks using sentence embeddings.
    Uses spaCy for sentence segmentation and SentenceTransformer for generating embeddings.

    :param text: The full text to chunk.
    :param threshold: Cosine similarity threshold for adding a sentence to the current chunk.
    :return: A list of semantic chunks (each as a string).
    """
    # Sentence segmentation
    #doc = nlp(text)
    #sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    sentences = fixed_size_chunking(text, 100, 10)

    chunks = []
    current_chunk_sentences = []
    current_chunk_embedding = None

    for sentence in sentences:
        # Generate embedding for the current sentence
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)

        # If starting a new chunk, initialize it with the current sentence
        if current_chunk_embedding is None:
            current_chunk_sentences = [sentence]
            current_chunk_embedding = sentence_embedding
        else:
            # Compute cosine similarity between current sentence and the chunk embedding
            sim_score = util.cos_sim(sentence_embedding, current_chunk_embedding)
            if sim_score.item() >= threshold:
                # Add sentence to the current chunk and update the chunk's average embedding
                current_chunk_sentences.append(sentence)
                num_sents = len(current_chunk_sentences)
                current_chunk_embedding = ((current_chunk_embedding * (num_sents - 1)) + sentence_embedding) / num_sents
            else:
                # Finalize the current chunk and start a new one
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = [sentence]
                current_chunk_embedding = sentence_embedding

    # Append the final chunk if it exists
    if current_chunk_sentences:
        chunks.append(" ".join(current_chunk_sentences))

    return chunks

In [1]:
def paragraph_chunking(text):
    # Simple split by double line breaks (standard for paragraphs)
    paragraphs = [para.strip() for para in text.split('\n\n\n') if para.strip()]
    
    return paragraphs

In [2]:
# open file and read text from file
# Example usage
file_path = "studyDropshipping/CHAPTER 8 - Organic (MIKE).txt"
file_content = read_file(file_path)

if file_content is None:
    print("Unable to read data from file: ", file_path)

# Generate chunks
paragraph_chunks = paragraph_chunking(file_content)

# Display results
for i, chunk in enumerate(paragraph_chunks):
    print(f"\n--- Chunk {i + 1} ---\n{chunk}")

NameError: name 'read_file' is not defined