# Part 1: Data Preparation

In [1]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt

In [2]:
# english model
nlp = spacy.load("en_core_web_sm")

### Data cleaning

In [3]:
from data_process import extract_yoda_lines, clean_text
import re

In [4]:
yoda_raw = extract_yoda_lines('data/yoda-corpus.csv')
yoda_raw_df = pd.DataFrame(yoda_raw)
print(f"Raw rows: {len(yoda_raw_df):,}")

Raw rows: 100


In [5]:
yoda_df = yoda_raw_df.dropna(subset=["text"]).drop_duplicates(subset=["text"]).reset_index(drop=True)
print(f"After dropping nulls & duplicates: {len(yoda_df):,}")

After dropping nulls & duplicates: 99


In [6]:
# data cleaning

yoda_df["clean"] = yoda_df["text"].apply(clean_text)

yoda_df["token_count"] = yoda_df["clean"].str.split().apply(len)
yoda_df = yoda_df[yoda_df["token_count"] >= 2].reset_index(drop=True)
print(f"After cleaning & filtering short lines: {len(yoda_df):,}")


After cleaning & filtering short lines: 98


In [7]:
# Quick sanity check: sample few cleaned lines
display(yoda_df.sample(5)[["text","clean"]])

Unnamed: 0,text,clean
39,A prophecy . . . that misread could have been.,a prophecy . . . that misread could have been.
28,Premonitions . . . premonitions . . . Hmmmm ....,premonitions . . . premonitions . . . hmmmm . ...
87,"How to join the Force, he will train you. You...","how to join the force, he will train you. your..."
33,Train yourself to let go of everything you fe...,train yourself to let go of everything you fea...
70,"Faith in your new apprentice, misplaced may b...","faith in your new apprentice, misplaced may be..."


In [8]:
# Save cleaned version for reuse
# yoda_df.to_csv("data/yoda_clean.csv", index=False)

## Generate Normal English for Yoda speech

In [9]:
from chat_gpt_call import generate_normal_english_and_mapping
import asyncio

In [10]:
progress_counter = 0
progress_lock = asyncio.Lock()

async def generate_with_progress(yoda_sentence: str, idx: int, total: int):
    """
    Wrapper around generate_normal_english_and_mapping that prints progress.
    """
    global progress_counter
    try:
        result = await generate_normal_english_and_mapping(yoda_sentence)
    except Exception as e:
        result = e
    async with progress_lock:
        progress_counter += 1
        print(f"Processed {progress_counter}/{total} sentences (Task {idx})", end="\r")
    return result

In [11]:
async def process_all_yoda_sentences(yoda_df: pd.DataFrame) -> pd.DataFrame:
    tasks = []
    total = len(yoda_df)
    
    # Create a list of tasks with progress wrapper
    for idx, row in yoda_df.iterrows():
        yoda_sentence = row["text"]
        tasks.append(generate_with_progress(yoda_sentence, idx, total))
    
    # Run all tasks concurrently; capture exceptions by setting return_exceptions=True
    responses = await asyncio.gather(*tasks, return_exceptions=True)
    
    results = []
    for idx, res in enumerate(responses):
        yoda_sentence = yoda_df.iloc[idx]["text"]
        if isinstance(res, Exception):
            print(f"\nError processing row {idx}: {res}")
            results.append({
                "original": yoda_sentence,
                "normal_english": None,
                "mapping": None
            })
        else:
            results.append({
                "original": yoda_sentence,
                "normal_english": res.normal_english,
                "mapping": res.mapping
            })
    
    return pd.DataFrame(results)

In [15]:
# loop = asyncio.get_event_loop()
# normalized_df = loop.run_until_complete(process_all_yoda_sentences(yoda_df))

final_df1 = await process_all_yoda_sentences(yoda_df[:20])

Processed 50/20 sentences (Task 14)

In [14]:
final_df2 = await process_all_yoda_sentences(yoda_df[20:40])

Processed 30/20 sentences (Task 23)

In [16]:
final_df3 = await process_all_yoda_sentences(yoda_df[40:60])


Processed 70/20 sentences (Task 59)

In [17]:
final_df4 = await process_all_yoda_sentences(yoda_df[60:80])


Processed 90/20 sentences (Task 70)

In [18]:
final_df5 = await process_all_yoda_sentences(yoda_df[80:])

Processed 108/18 sentences (Task 82)

In [19]:
final_df = pd.concat([final_df1, final_df2, final_df3, final_df4, final_df5], ignore_index=True)

In [20]:
print(f"Final rows: {len(final_df):,}")

display(final_df.sample(5))

Final rows: 98


Unnamed: 0,original,normal_english,mapping
37,"Go, I will. Good relations with the Wookiees,...",I will go. I have good relations with the Wook...,"[1, 2, 0, 8, 9, 3, 4, 5, 6, 7]"
71,"Destroy you I will, just as Master Kenobi, yo...","I will destroy you, just as your apprentice wi...","[2, 3, 0, 1, 4, 5, 8, 9, 10, 11, 6, 7]"
62,"To fight this Lord Sidious, strong enough, yo...",You are not strong enough to fight this Lord S...,"[7, 8, 9, 5, 6, 0, 1, 2, 3, 4]"
96,"Remember, a Jedi's strength flows from the For...","Remember, a Jedi's strength flows from the For...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 9]"
11,"See through you, we can.",We can see through you.,"[3, 4, 0, 1, 2]"


In [41]:
yoda_df.iloc[12]["text"]

# find the max length of text and which row it is
max_index = yoda_df["text"].str.len().idxmax()
max_row = yoda_df.iloc[max_index]
print(f"Row with max length: {max_row['text']}")
print(f"Max length of text: {max_row['text'].str.len()}")
max_len = yoda_df["text"].str.len().max()
print(f"Max length of text: {max_len}")

Row with max length:  Death is a natural part of life. Rejoice for those around you who transform into the Force. Mourn them, do not. Miss them, do not. Attachment leads to jealousy. The shadow of greed, that is.


AttributeError: 'str' object has no attribute 'str'

# Save the Yoda speech to a file

In [23]:
import json

In [25]:
df_to_save = final_df.copy()
df_to_save['mapping'] = df_to_save['mapping'].apply(json.dumps)

df_to_save.to_csv("data/yoda_normalized.csv", index=False)

In [26]:
# to load:

In [None]:
import pandas as pd
import json

df_loaded = pd.read_csv("data/yoda_normalized.csv")

df_loaded['mapping'] = df_loaded['mapping'].apply(json.loads)

# Data Cleaning for Sentences

In [21]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt

# english model
nlp = spacy.load("en_core_web_sm")

from data_process import extract_yoda_lines, clean_text
import re

In [27]:
yoda_raw = extract_yoda_lines('data/yoda-corpus.csv')
yoda_raw_df = pd.DataFrame(yoda_raw)
print(f"Raw rows: {len(yoda_raw_df):,}")

yoda_df = yoda_raw_df.dropna(subset=["text"]).drop_duplicates(subset=["text"]).reset_index(drop=True)
print(f"After dropping nulls & duplicates: {len(yoda_df):,}")

Raw rows: 100
After dropping nulls & duplicates: 99


In [23]:
from data_process import split_into_sentences

In [28]:
yoda_df["clean"] = yoda_df["text"].apply(clean_text)

yoda_df["raw_sentences"] = yoda_df["clean"].apply(split_into_sentences)

yoda_sent_df = yoda_df.explode("raw_sentences").reset_index(drop=True)

yoda_sent_df["token_count"] = yoda_sent_df["raw_sentences"].str.split().apply(len)
yoda_sent_df = yoda_sent_df[yoda_sent_df["token_count"] >= 2].reset_index(drop=True)

yoda_sent_df["sentences"] = yoda_sent_df["raw_sentences"].apply(clean_text)

yoda_sent_df.rename(columns={"sentences": "sentence"}, inplace=True)

In [29]:
print(f"After splitting into sentences: {len(yoda_sent_df):,}")

display(yoda_sent_df)

After splitting into sentences: 178


Unnamed: 0,movie,scene,line,character,text,slug,component,clean,raw_sentences,token_count,sentence
0,1,129,1162,YODA,"The very Republic is threatened, if involved t...",INT. TEMPLE OF THE JEDI - COUNCIL CHAMBERS - DAY,character,"the very republic is threatened, if involved t...","the very republic is threatened, if involved t...",10,"the very republic is threatened, if involved t..."
1,1,129,1164,YODA,"Hard to see, the dark side is. Discover who th...",INT. TEMPLE OF THE JEDI - COUNCIL CHAMBERS - DAY,character,"hard to see, the dark side is. discover who th...","hard to see, the dark side is.",7,"hard to see, the dark side is."
2,1,129,1164,YODA,"Hard to see, the dark side is. Discover who th...",INT. TEMPLE OF THE JEDI - COUNCIL CHAMBERS - DAY,character,"hard to see, the dark side is. discover who th...","discover who this assassin is, we must.",7,"discover who this assassin is, we must."
3,1,129,1167,YODA,"With this Naboo queen you must stay, Qui-Gon. ...",INT. TEMPLE OF THE JEDI - COUNCIL CHAMBERS - DAY,character,"with this naboo queen you must stay, qui-gon. ...","with this naboo queen you must stay, qui-gon.",8,"with this naboo queen you must stay, qui-gon."
4,1,129,1167,YODA,"With this Naboo queen you must stay, Qui-Gon. ...",INT. TEMPLE OF THE JEDI - COUNCIL CHAMBERS - DAY,character,"with this naboo queen you must stay, qui-gon. ...",protect her.,2,protect her.
...,...,...,...,...,...,...,...,...,...,...,...
173,6,48,365,YODA,"Told you, did he?",50 INT YODA'S HOUSE,character,"told you, did he?","told you, did he?",4,"told you, did he?"
174,6,48,368,YODA,"Unexpected this is, and unfortunate...",50 INT YODA'S HOUSE,character,"unexpected this is, and unfortunate...","unexpected this is, and unfortunate...",5,"unexpected this is, and unfortunate..."
175,6,48,374,YODA,"Remember, a Jedi's strength flows from the For...",50 INT YODA'S HOUSE,character,"remember, a jedi's strength flows from the for...","remember, a jedi's strength flows from the force.",8,"remember, a jedi's strength flows from the force."
176,6,48,374,YODA,"Remember, a Jedi's strength flows from the For...",50 INT YODA'S HOUSE,character,"remember, a jedi's strength flows from the for...","but beware, anger.",3,"but beware, anger."


In [30]:
from chat_gpt_call import generate_normal_english_and_mapping
import asyncio

In [31]:
progress_counter = 0
progress_lock = asyncio.Lock()

async def generate_with_progress(yoda_sentence: str, idx: int, total: int):
    """
    Wrapper around generate_normal_english_and_mapping that prints progress.
    """
    global progress_counter
    try:
        result = await generate_normal_english_and_mapping(yoda_sentence)
    except Exception as e:
        result = e
    async with progress_lock:
        progress_counter += 1
        print(f"Processed {progress_counter}/{total} sentences (Task {idx})", end="\r")
    return result

In [32]:
async def process_all_yoda_sentences(yoda_df: pd.DataFrame) -> pd.DataFrame:
    tasks = []
    total = len(yoda_df)
    
    # Create a list of tasks with progress wrapper
    for idx, row in yoda_df.iterrows():
        yoda_sentence = row["sentence"]
        tasks.append(generate_with_progress(yoda_sentence, idx, total))
    
    # Run all tasks concurrently; capture exceptions by setting return_exceptions=True
    responses = await asyncio.gather(*tasks, return_exceptions=True)
    
    results = []
    for idx, res in enumerate(responses):
        yoda_sentence = yoda_df.iloc[idx]["sentence"]
        if isinstance(res, Exception):
            print(f"\nError processing row {idx}: {res}")
            results.append({
                "original": yoda_sentence,
                "normal_english": None,
                "mapping": None
            })
        else:
            results.append({
                "original": yoda_sentence,
                "normal_english": res.normal_english,
                "mapping": res.mapping
            })
    
    return pd.DataFrame(results)

In [33]:
final_dfs = []

for i in range(0, len(yoda_sent_df), 20):
    final_dfs.append(await process_all_yoda_sentences(yoda_sent_df[i:i+20]))

Processed 178/18 sentences (Task 177)

In [34]:
final_df = pd.concat(final_dfs, ignore_index=True)
print(f"Final rows: {len(final_df):,}")
display(final_df.sample(5))

Final rows: 178


Unnamed: 0,original,normal_english,mapping
104,dismantle the coded signal quickly.,Quickly dismantle the coded signal.,"[4, 0, 1, 2, 3]"
81,i agree.,I agree.,"[0, 1]"
119,"visit the new emperor, my task is.",My task is visit the new emperor.,"[4, 5, 6, 0, 1, 2, 3]"
13,how feel you?,You feel how?,"[2, 1, 0]"
60,train yourself to let go of everything you fea...,Train yourself to let go of everything you fea...,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"


In [35]:
import json

In [36]:
df_to_save = final_df.copy()
df_to_save['mapping'] = df_to_save['mapping'].apply(json.dumps)

df_to_save.to_csv("data/yoda_sentences_normalized.csv", index=False)