In [None]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook's recorded run installed.
%pip install PyMuPDF==1.23.21


Collecting PyMuPDF
  Downloading PyMuPDF-1.23.21-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.21 PyMuPDFb-1.23.9


In [None]:
import re

import fitz
import pandas as pd

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The ``page.get_text()`` output of each page joined together with
        no separator added between pages.
    """
    # The context manager closes the document even if extraction raises
    # part-way through; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        # One join instead of repeated string concatenation inside a loop.
        return "".join(page.get_text() for page in doc)

def preprocess_text(text, header_prefix="Poor Things - Final Cut"):
    """Split raw text into lines and drop running-header lines.

    Args:
        text: The full text to split on ``'\\n'``.
        header_prefix: A string, or a tuple of strings, identifying header
            lines. Any line that starts with it is removed. The default keeps
            the behaviour this function had when the prefix was hard-coded.

    Returns:
        list[str]: The remaining lines in their original order. Lines are not
        stripped, and blank lines are kept.
    """
    return [
        line
        for line in text.split('\n')
        if not line.startswith(header_prefix)
    ]

# INT / EXT (optionally spelled INTERIOR / EXTERIOR) count as a scene-heading
# marker only when not embedded in a longer word. A plain substring test also
# fires on upper-case words such as "NEXT", "POINT" or "PRINT".
_SCENE_HEADING_RE = re.compile(r'(?<![A-Za-z])(?:INT|EXT)(?:ERIOR)?(?![A-Za-z])')


def parse_and_classify_screenplay(lines):
    """Classify screenplay lines into scene headings, dialogue and action.

    Consecutive lines of the same kind are merged into one element, joined
    with single spaces.

    Args:
        lines: Iterable of text lines (str), in reading order.

    Returns:
        list[dict]: One dict per element with the keys ``'Type'``
        (``'Scene Heading'``, ``'Dialogue'`` or ``'Action Description'``),
        ``'Scene'`` (the most recent heading line, or ``None`` before the
        first heading), ``'Character'`` and ``'Text'``.

    Classification rules, applied in order to each line:
        1. Blank lines and lines made only of digits (treated as page
           numbers) are skipped.
        2. A line containing a standalone INT/EXT token is a scene heading.
        3. Any other all-upper-case line is taken as a character cue; it
           produces no element of its own.
        4. Everything else is dialogue if a character cue is active,
           otherwise an action description.

    NOTE(review): the active character is only cleared by a scene heading,
    so action lines that follow dialogue inside the same scene are still
    labelled as that character's dialogue.
    """
    elements = []
    current_scene = None
    current_character = None
    buffer_parts = []   # Lines of the dialogue/action run being accumulated
    buffer_type = None  # Type that the accumulated run will be emitted as

    def flush():
        """Emit the accumulated run, if any, as one element and reset it."""
        if buffer_parts:
            elements.append({
                'Type': buffer_type,
                'Scene': current_scene,
                'Character': current_character,
                'Text': " ".join(buffer_parts),
            })
            buffer_parts.clear()

    for line in lines:
        stripped = line.strip()
        # Blank lines carry no text and would only inject stray spaces into
        # the merged run; bare numbers are assumed to be page numbers.
        if not stripped or stripped.isdigit():
            continue

        if _SCENE_HEADING_RE.search(line):
            flush()  # Attribute pending text to the scene that is ending
            current_scene = line
            current_character = None  # No speaker carries over between scenes
            elements.append({'Type': 'Scene Heading', 'Scene': current_scene, 'Character': None, 'Text': line})
            buffer_type = None
        elif line.isupper():
            flush()  # Pending text belongs to the previous speaker
            current_character = line
            buffer_type = 'Dialogue'
        else:
            # The speaker only changes in the branches above, both of which
            # flush first, so a run never mixes two types.
            buffer_type = 'Dialogue' if current_character else 'Action Description'
            buffer_parts.append(line)

    flush()  # Emit whatever is still pending at the end of the input
    return elements

# Main process: PDF -> raw text -> cleaned lines -> classified elements.
# Each intermediate result is kept in its own notebook-level variable so it
# can be inspected afterwards.
# The PDF path is relative, so it is resolved against the working directory.
pdf_path = 'Poor-Things-Read-The-Screenplay.pdf'
text = extract_text_from_pdf(pdf_path)
lines = preprocess_text(text)
elements = parse_and_classify_screenplay(lines)

# Convert to pandas DataFrame: one row per element dict, one column per key
# ('Type', 'Scene', 'Character', 'Text').
df = pd.DataFrame(elements)

# Export to CSV; index=False leaves the row index out of the file.
csv_path = 'screenplay_analysis_consolidated.csv'
df.to_csv(csv_path, index=False)

print(f"Exported to {csv_path}")


Exported to screenplay_analysis_consolidated.csv


In [None]:
import pandas as pd

# Expects `df`, the screenplay DataFrame built earlier in the notebook, with
# at least the 'Type' and 'Scene' columns.

# Running scene counter and the text of the most recent scene heading
current_scene_number = 0
last_scene = None

# Walk the two relevant columns in step, collecting one scene number per row.
# The counter advances only on a 'Scene Heading' row whose scene differs from
# the previous heading; rows before the first heading keep the number 0.
scene_numbers = []
for row_type, scene in zip(df['Type'], df['Scene']):
    if row_type == 'Scene Heading' and scene != last_scene:
        current_scene_number += 1
        last_scene = scene
    scene_numbers.append(current_scene_number)

# Attach the numbers in a single assignment rather than cell by cell
df['Scene Number'] = scene_numbers

# Put 'Scene Number' first, ahead of the original columns
df = df[['Scene Number', 'Type', 'Scene', 'Character', 'Text']]

# Write the numbered DataFrame to its own CSV file
csv_path_updated = 'screenplay_analysis_with_scene_numbers.csv'
df.to_csv(csv_path_updated, index=False)

print(f"Updated DataFrame exported to {csv_path_updated}")


Updated DataFrame exported to screenplay_analysis_with_scene_numbers.csv
