# Part 1: Analysis

In [1]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt

In [2]:
# Load spaCy's English pipeline (`en_core_web_sm`) and bind it to `nlp`.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

### Data cleaning

In [3]:
from data_process import extract_yoda_lines, clean_text
import re

In [4]:
# Read the corpus through the project helper `extract_yoda_lines` (from
# data_process) and wrap whatever it returns in a DataFrame.
# NOTE(review): later cells rely on a "text" column being present — confirm
# against the helper's return shape.
corpus_path = 'data/yoda-corpus.csv'
yoda_raw = extract_yoda_lines(corpus_path)
yoda_raw_df = pd.DataFrame(data=yoda_raw)
print(f"Raw rows: {len(yoda_raw_df):,}")

Raw rows: 100


In [5]:
# Build the working frame from the raw one without modifying `yoda_raw_df`:
#   1. drop rows whose "text" is missing,
#   2. drop rows whose "text" repeats an earlier row (pandas keeps the first),
#   3. renumber the index from 0.
yoda_df = (
    yoda_raw_df
    .dropna(subset=["text"])
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)
print(f"After dropping nulls & duplicates: {len(yoda_df):,}")

After dropping nulls & duplicates: 99


In [7]:
# Data cleaning: run every line through `clean_text` (imported from
# data_process), count whitespace-separated tokens in the cleaned string, and
# keep only lines with at least MIN_TOKENS tokens.
MIN_TOKENS = 2  # lines with fewer tokens than this are discarded

yoda_df = (
    yoda_df
    # .assign returns a new frame, so the frame produced by the previous cell
    # is not mutated in place.
    .assign(clean=lambda df: df["text"].apply(clean_text))
    # .str.len() is vectorised and yields NaN where the cleaned value is
    # missing, whereas .apply(len) would raise TypeError on such a row.
    .assign(token_count=lambda df: df["clean"].str.split().str.len())
    # NaN >= MIN_TOKENS evaluates to False, so rows without a usable cleaned
    # string are filtered out together with the short lines.
    .loc[lambda df: df["token_count"] >= MIN_TOKENS]
    .reset_index(drop=True)
)
print(f"After cleaning & filtering short lines: {len(yoda_df):,}")


After cleaning & filtering short lines: 98


In [8]:
# Quick sanity check: show a few original/cleaned pairs side by side.
# - random_state pins the draw so the displayed rows are the same on every
#   Restart & Run All (an unseeded sample changes each run).
# - min(...) keeps the cell from raising ValueError when fewer than 5 rows
#   survive the filtering above.
display(
    yoda_df.sample(n=min(5, len(yoda_df)), random_state=42)[["text", "clean"]]
)

Unnamed: 0,text,clean
57,Dismantle the coded signal quickly. That grou...,dismantle the coded signal quickly. that group...
2,"With this Naboo queen you must stay, Qui-Gon. ...","with this naboo queen you must stay, qui-gon. ..."
24,"Confer on you, the level of Jedi Knight the Co...","confer on you, the level of jedi knight the co..."
10,Afraid are you?,afraid are you?
27,Always two there are....no more...no less. A m...,always two there are....no more...no less. a m...


In [None]:
# Save cleaned version for reuse
# NOTE(review): the export below is commented out, so this cell currently does
# nothing. Uncomment it to write `yoda_df` to data/yoda_clean.csv (index=False
# leaves the row index out of the file).
# yoda_df.to_csv("data/yoda_clean.csv", index=False)