### Chunk the data

Scripts are chunked by INTERIOR/EXTERIOR scene headings. Some scripts don't provide this information; those scripts are chunked by character length instead.

Example from "A Quiet Place":
```json
{
    "title": "A Quiet Place",
    "id": "A Quiet Place_52", 
    "text": "EXT. RIVER - LATE AFTERNOON 43\nAn immediate and jarring sound of water rushes in as we are:\nTIGHT ON MOVING WATER over rocks.\n[NOTE: THE SOUND WILL CONTINUE THROUGHOUT THE ENTIRETY OF\nTHIS SCENE]\n...",
    "text_vector": [ -0.022776233032345772,.......]
}
```


In [5]:
import os
import re
import chardet
from langchain_text_splitters import RecursiveCharacterTextSplitter
import json

In [6]:
# Directory holding the movie script text files that are read and chunked.
script_directory = "../../scripts"
# Directory the per-movie chunk JSON files are written to.
chunk_dir = "../../data/chunks/"
# JSON file mapping each movie title to its script filename (null when there is no script).
movie_list_path = "../../data/movie_list.json"

In [1]:
def detect_encoding(script_path: str):  # Fallback helper: some script files could not be read with the default encoding.
    """
    Guess the text encoding of a file by handing its raw bytes to chardet.

    The whole file is read in binary mode and passed to ``chardet.detect``;
    only the ``'encoding'`` entry of the result is returned.

    :param script_path: Path of the script file to inspect.
    :type script_path: str
    :return: The encoding name reported by chardet, e.g. 'utf-8' or 'ISO-8859-1'.
        NOTE(review): chardet presumably reports None when it cannot decide -
        confirm against the chardet documentation.
    :rtype: str
    """
    with open(script_path, 'rb') as raw_file:  # binary mode: no decoding is attempted here
        guess = chardet.detect(raw_file.read())
    return guess['encoding']


def create_json(scenes, script_filename, title, output_dir=None):
    """
    Stores the chunked scenes in a JSON file.

    Each scene becomes one record ``{"title", "id", "text"}`` where the id is
    ``"<title>_<n>"`` with ``n`` counting from 1. The output file is named
    after the script file with its extension swapped for ``.json``.

    :param scenes: Chunked scenes (iterable of strings)
    :param script_filename: Name of the script file, e.g. ``"A Quiet Place.txt"``
    :param title: Movie title
    :param output_dir: Directory to write the JSON file into. Defaults to the
        module-level ``chunk_dir`` when omitted.
    :return: None
    """
    if output_dir is None:
        output_dir = chunk_dir

    chunk_json = [
        {"title": title, "id": f"{title}_{number}", "text": scene}
        for number, scene in enumerate(scenes, start=1)
    ]

    # Swap only the real extension. str.replace('.txt', '.json') would also
    # rewrite '.txt' in the middle of a name and would leave a name without
    # '.txt' untouched, i.e. write JSON under the script's own filename.
    base_name, _ = os.path.splitext(script_filename)
    chunk_path = os.path.join(output_dir, f"{base_name}.json")

    # Make sure the target directory exists so the first run does not fail.
    parent_dir = os.path.dirname(chunk_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(chunk_path, 'w', encoding='utf-8') as f:
        json.dump(chunk_json, f, indent=4)



In [1]:
# Get list of all movies (maps movie title -> script filename, or None when no script exists)
with open(movie_list_path, 'r', encoding='utf-8') as f:
    movie_list = json.load(f)

# Zero-width split point in front of a scene heading. The keyword must be a
# whole word (so "POINT", "NEXT", "INTO", "EXTRA" do not split a scene) and must
# not directly follow "/" (so a combined "INT./EXT." heading stays in one piece).
scene_pattern = re.compile(r'(?<![\w/])(?=(?:INT|EXT|INTERIOR|EXTERIOR)\b)')

# Fallback splitter for scripts without INT/EXT headings; built once, reused for every script.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)

# Iterate through each movie and get script
for title, script_file in movie_list.items():
    if script_file is None:  # If no script file is present
        continue
    script_path = os.path.join(script_directory, script_file)

    # Try UTF-8 first; only a decoding failure triggers encoding detection.
    # (A bare `except` would also swallow KeyboardInterrupt and missing-file errors.)
    try:
        with open(script_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()
    except UnicodeDecodeError:
        encoding = detect_encoding(script_path)
        with open(script_path, 'r', encoding=encoding) as f:
            raw_text = f.read()

    # Collapse whitespace around line breaks (indentation, trailing spaces, blank lines)
    # into a single newline, keeping the line break itself.
    text = re.sub(r'[^\S\n]*\n\s*', '\n', raw_text)
    # Collapse remaining runs of horizontal whitespace; newlines are left alone so
    # a line ending in spaces is not merged with the line that follows it.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)

    # Split scenes based on INT/EXT, dropping empty pieces (e.g. the empty string
    # produced when the script starts directly with a scene heading).
    scenes = [scene for scene in scene_pattern.split(text) if scene.strip()]

    # If no INT/EXT data is present, do character splitting
    if len(scenes) < 2:
        scenes = text_splitter.split_text(text)

    create_json(scenes, script_file, title)



KeyboardInterrupt

