# Oscars Speeches – Data Exploration
Quick look at the cleaned dataset: schema, categories, distributions, and individual speech lookup.

In [2]:
import pandas as pd

# Load the cleaned speeches, report the table size, then show each column's dtype
df = pd.read_csv("../data/cleaned_speeches.csv")
n_rows, n_cols = df.shape
print(f"{n_rows} rows, {n_cols} columns")
df.dtypes

189 rows, 7 columns


year            int64
ceremony        int64
category          str
film_title        str
winner_raw        str
winner_clean      str
speech_clean      str
dtype: object

In [3]:
# Preview the first rows to sanity-check the columns and their contents
df.head()

Unnamed: 0,year,ceremony,category,film_title,winner_raw,winner_clean,speech_clean
0,1993,66,Actor in a Leading Role,Philadelphia,Tom Hanks,Tom Hanks,Here's what I know. I could not be standing h...
1,1993,66,Actor in a Supporting Role,The Fugitive,Tommy Lee Jones,Tommy Lee Jones,"My thanks to the Academy for the very finest, ..."
2,1993,66,Actress in a Leading Role,The Piano,Holly Hunter,Holly Hunter,I'm so overwhelmed. To be with that group of ...
3,1993,66,Actress in a Supporting Role,The Piano,Anna Paquin,Anna Paquin,I'd like to thank the Academy for the honor of...
4,1993,66,Directing,Schindler's List,Steven Spielberg,Steven Spielberg,"I actually, I have friends who have won this b..."


In [4]:
# Number of missing values in each column
df.isnull().sum()

year            0
ceremony        0
category        0
film_title      0
winner_raw      0
winner_clean    0
speech_raw      0
speech_clean    0
dtype: int64

## Categories

In [5]:
# How many distinct award categories exist, and how many rows fall in each
category_col = df["category"]
print("Unique categories:", category_col.nunique())
category_col.value_counts()

Unique categories: 8


category
Actor in a Leading Role          17
Actress in a Leading Role        17
Directing                        17
Best Picture                     17
Writing (Adapted Screenplay)     17
Writing (Original Screenplay)    17
Actor in a Supporting Role       16
Actress in a Supporting Role     16
Name: count, dtype: int64

## Year distribution

In [6]:
# Earliest and latest year present, then the number of rows per year in ascending year order
year_col = df["year"]
first_year, last_year = year_col.min(), year_col.max()
print(f"Year range: {first_year} – {last_year}")
year_col.value_counts().sort_index()

Year range: 2000 – 2016


year
2000    8
2001    8
2002    8
2003    8
2004    8
2005    8
2006    8
2007    8
2008    7
2009    8
2010    8
2011    8
2012    8
2013    8
2014    7
2015    8
2016    8
Name: count, dtype: int64

## Speech lengths

In [7]:
# Character count of each cleaned speech. This adds a `speech_len` column to
# `df` in place; the shortest/longest cell below sorts on that column, so this
# cell must run first.
df["speech_len"] = df["speech_clean"].str.len()
# Summary statistics (count, mean, quartiles, extremes) of the speech lengths
df["speech_len"].describe()

count     134.000000
mean     1333.738806
std       768.586721
min        21.000000
25%       871.500000
50%      1201.500000
75%      1602.500000
max      6853.000000
Name: speech_len, dtype: float64

In [8]:
# Shortest and longest speeches, ranked by the precomputed `speech_len` column.
# Each entry pairs the heading to print with the rows to list under it.
extremes = [
    ("=== 5 Shortest ===", df.nsmallest(5, "speech_len")),
    ("\n=== 5 Longest ===", df.nlargest(5, "speech_len")),
]
for heading, subset in extremes:
    print(heading)
    for _, row in subset.iterrows():
        print(f"  {row['year']} {row['category']} – {row['winner_clean']} ({row['speech_len']} chars)")

=== 5 Shortest ===
  2002 Directing – Roman Polanski (21 chars)
  2011 Writing (Original Screenplay) – Written by Woody Allen (21 chars)
  2004 Actor in a Supporting Role – Morgan Freeman (312 chars)
  2007 Writing (Adapted Screenplay) – Written for the screen by Joel Coen & Ethan Coen (344 chars)
  2007 Actress in a Leading Role – Marion Cotillard (381 chars)

=== 5 Longest ===
  2016 Best Picture – Adele Romanski, Dede Gardner and Jeremy Kleiner, Producers (6853 chars)
  2012 Best Picture – Grant Heslov, Ben Affleck and George Clooney, Producers (3253 chars)
  2001 Actress in a Leading Role – Halle Berry (2792 chars)
  2013 Actor in a Leading Role – Matthew McConaughey (2777 chars)
  2013 Actress in a Leading Role – Cate Blanchett (2735 chars)


## Browse by category

In [9]:
# Change the category here to browse
cat = "Actor in a Leading Role"
# Keep only rows of that category and the three identifying columns, ordered by year
df.loc[df["category"] == cat, ["year", "winner_clean", "film_title"]].sort_values("year")

Unnamed: 0,year,winner_clean,film_title
0,2000,Russell Crowe,Gladiator
8,2001,Denzel Washington,Training Day
16,2002,Adrien Brody,The Pianist
24,2003,Sean Penn,Mystic River
32,2004,Jamie Foxx,Ray
40,2005,Philip Seymour Hoffman,Capote
48,2006,Forest Whitaker,The Last King of Scotland
56,2007,Daniel Day-Lewis,There Will Be Blood
64,2008,Sean Penn,Milk
71,2009,Jeff Bridges,Crazy Heart


## Look up a specific speech
Filter by `winner`, `year`, and/or `category` (filters can be combined) to find a speech.

In [27]:
import textwrap

def show_speech(df, winner=None, year=None, category=None, width=80, index=None):
    """Filter and display matching speeches.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `year`, `category`, `winner_clean`, `film_title` and
        `speech_clean` columns.
    winner, category : str, optional
        Case-insensitive *literal* substring to look for in `winner_clean` /
        `category`. Matching is literal (not regex) so that values containing
        regex metacharacters, e.g. "Writing (Adapted Screenplay)", match as typed.
    year : optional
        Exact value to match against the `year` column.
    width : int
        Wrap width for the speech text and length of the separator rules.
    index : int, optional
        Position of the speech to print within the filtered results.

    Behavior
    --------
    If there's exactly one match, prints it directly (any `index` is ignored).
    If index is None and multiple matches, prints a numbered summary.
    If index is an int, prints that single speech in full.

    Returns None; all output goes to stdout.
    """
    mask = pd.Series(True, index=df.index)
    if winner:
        # regex=False: treat the query as plain text; na=False: rows with a
        # missing name simply don't match instead of poisoning the mask.
        mask &= df["winner_clean"].str.contains(winner, case=False, regex=False, na=False)
    if year:
        mask &= df["year"] == year
    if category:
        mask &= df["category"].str.contains(category, case=False, regex=False, na=False)
    # Renumber from 0 so `index` lines up with the [N] labels in summary mode.
    results = df[mask].reset_index(drop=True)
    if results.empty:
        print("No matches found.")
        return

    # Auto-select if there's only one match
    if len(results) == 1:
        index = 0

    if index is None:
        # Summary mode: list all matches
        for i, r in results.iterrows():
            print(f"  [{i}] {r['year']} | {r['category']} – {r['winner_clean']}")
        print(f"\n{len(results)} matches. Use index=N to read a speech.")
        return

    if index < 0 or index >= len(results):
        print(f"Index out of range. Use 0–{len(results) - 1}.")
        return

    r = results.iloc[index]
    print("=" * width)
    print(f"{r['year']} | {r['category']}")
    print(f"{r['winner_clean']} – {r['film_title']}")
    print("=" * width)
    speech = r["speech_clean"]
    if pd.isna(speech):
        # textwrap.fill() only accepts strings; a missing speech would raise.
        print("[no speech text]")
    else:
        print(textwrap.fill(str(speech), width=width))
    print()

In [19]:
# Step 1: see what's available — with several matches and no index,
# show_speech prints a numbered summary rather than a full speech
show_speech(df, year=2010)

  [0] 2010 | Actor in a Leading Role – Colin Firth
  [1] 2010 | Actor in a Supporting Role – Christian Bale
  [2] 2010 | Actress in a Leading Role – Natalie Portman
  [3] 2010 | Actress in a Supporting Role – Melissa Leo
  [4] 2010 | Directing – Tom Hooper
  [5] 2010 | Best Picture – Iain Canning, Emile Sherman and Gareth Unwin, Producers
  [6] 2010 | Writing (Adapted Screenplay) – Screenplay by Aaron Sorkin
  [7] 2010 | Writing (Original Screenplay) – Screenplay by David Seidler

8 matches. Use index=N to read a speech.


In [23]:
# Combine filters: year plus a winner-name substring; index=0 picks the first match
show_speech(df, year=2010, winner='Christian Bale', index=0)

2010 | Actor in a Supporting Role
Christian Bale – The Fighter
Bloody hell.  Wow.  What a roomful of talented and inspirational people and what
the hell am I doing here in the midst of you?  It's such an honor.  David O.
Russell, what a great spirit, you know, on the set.  Just, just fantastic. And
thank you so much, mate, for making the work that all of us actors did actually
mean something, you know?  I mean, that's the director's job of translating it
to the audience and making it mean something.  Thank you for that.  Thank you to
Pamela Martin, likewise, as our editor.  The just incredible work of every
actor.  Melissa – I'm not gonna drop the f-bomb like she did; I've done that
plenty before – Amy, Jack, Mark, man, you know the guy who just got this whole
thing going right from the get go.   Everybody in Lowell, all the actors from
there.  Dicky and Micky.  Where's my quacker?  Is he up there?  Dicky's up there
somewhere, mate. [Dicky Eklund stands up and waves from the audience.]

In [29]:
# Step 2: read a specific speech (change the index to flip through);
# index is the [N] position shown in the summary listing for the same filters
show_speech(df, year=2009, index=2)

2009 | Actress in a Leading Role
Sandra Bullock – The Blind Side
Did I really earn this or did I just wear you all down?  I would like to thank
the Academy for allowing me in the last month to have the most incredible ride,
with rooms full of artists that I see tonight and that I've worked with before
and I hope to work with in the future, who inspire me and blaze trails for us.
Four of them, that I've fallen deeply in love with, I share this night with and
I share this award with.  Gabby, I love you so much. You are exquisite.  You are
beyond words to me.  Carey, your grace and your elegance and your beauty and
your talent makes me sick.  Helen, I feel like we are family, through(?) family,
and I don't have the words to express just what I think of you.  And Meryl, you
know what I think of you, and you are such a good kisser.   I have so many
people to thank for my good fortune in this lifetime, and this is a once-in-a-
lifetime experience, I know.  To the family that allowed me to pl

In [28]:
# Category + year filter; when exactly one speech matches it is printed in full without an index
show_speech(df, category="Best Picture", year=2005)

2005 | Best Picture
Paul Haggis and Cathy Schulman, Producers – Crash
Thank you.  Oh my gosh.  Oh, thank you so, so much.  What an amazing night.
Thank you to all  the members of the Academy—   PAUL HAGGIS:  Can we thank them
by name, every single one?   CATHY SCHULMAN:  —for embracing our film about love
and about tolerance, about truth.  Thank you to the people all around the world
who have been touched by this message. And we are humbled by the other nominees
in this category; you have made this year one of the most breathtaking and
stunning, maverick years in American cinema, thank you.   We'd like to thank
Lions Gate.  Boy, did you do a job.  Jon Feltheimer and everyone in every office
of that building; and we would not be here today if it were not for Tom
Ortenberg and for Sarah Greenberg, thank you.  Thank you also to our financiers:
Andy Reimer, Jan Körbelin, Marina Grasic, Bob Yari. To our producers, our
partners:  Mark Harris and Bob Yari and Don Cheadle and Bobby Moresco, th

## Inspect labels
Look up all labels for a speech by searching on winner name, film title, category, or year.

In [4]:
import textwrap as _tw

# Labelled-speeches table that show_labels() searches (read once when this cell runs)
_labels_df = pd.read_csv("../data/speeches_with_labels.csv")
# Label columns that show_labels() prints for a selected speech, in display order
_LABEL_COLS = ["distinctiveness", "redacted_speech", "plot_hint", "golden_snippet", "snippet_grading"]

def show_labels(winner=None, film=None, category=None, year=None, index=None, width=90):
    """Look up labels for a speech. Searches by substring on winner/film/category.

    Parameters
    ----------
    winner, film, category : str, optional
        Case-insensitive *literal* substring to look for in `winner_clean`,
        `film_title` and `category` respectively. Matching is literal (not
        regex), so text such as "Writing (Original Screenplay)" matches as typed.
    year : optional
        Exact value to match against the `year` column.
    index : int, optional
        Position of the speech to print within the filtered results.
    width : int
        Wrap width for long text labels and length of the separator rules.

    Behavior
    --------
    If multiple matches, prints a numbered list. Use index=N to select one.
    If one match (or index given), prints all labels clearly; with exactly one
    match any `index` is ignored.

    Reads the module-level `_labels_df` table and `_LABEL_COLS` list.
    Returns None; all output goes to stdout.
    """
    mask = pd.Series(True, index=_labels_df.index)
    if winner:
        # regex=False keeps the promised plain-substring semantics.
        mask &= _labels_df["winner_clean"].str.contains(winner, case=False, regex=False, na=False)
    if film:
        mask &= _labels_df["film_title"].str.contains(film, case=False, regex=False, na=False)
    if category:
        mask &= _labels_df["category"].str.contains(category, case=False, regex=False, na=False)
    if year:
        mask &= _labels_df["year"] == year

    # Renumber from 0 so `index` lines up with the [N] labels in the listing.
    results = _labels_df[mask].reset_index(drop=True)
    if results.empty:
        print("No matches found.")
        return

    if len(results) == 1:
        index = 0

    if index is None:
        for i, r in results.iterrows():
            print(f"  [{i}] {r['year']} | {r['category']} | {r['film_title']} | {r['winner_clean']}")
        print(f"\n{len(results)} matches. Use index=N to select one.")
        return

    if index < 0 or index >= len(results):
        print(f"Index out of range. Use 0-{len(results) - 1}.")
        return

    r = results.iloc[index]
    print("=" * width)
    print(f"{r['year']} | {r['category']}")
    print(f"{r['winner_clean']} - {r['film_title']}")
    print("=" * width)

    for col in _LABEL_COLS:
        # .get() yields None when the column is absent, which pd.isna() also
        # treats as missing, so absent and empty labels print the same way.
        val = r.get(col)
        if pd.isna(val):
            print(f"\n{col}: [not labeled]")
        elif col in ("redacted_speech", "golden_snippet"):
            # Long free-text labels are wrapped onto their own lines.
            print(f"\n{col}:")
            print(_tw.fill(str(val), width=width))
        else:
            print(f"\n{col}: {val}")
    print()

In [5]:
# Search by film title substring (case-insensitive) — multiple matches show a
# numbered list; a single match prints its labels directly
show_labels(film="gravity")

2013 | Directing
Alfonso Cuarón - Gravity

distinctiveness: 4

redacted_speech:
"""[REDACT: ALFONSO CUARÓN]: Wow, thank you. Thanks to the Academy. Like any other human
endeavor making a film can be a transformative experience. And I want to thank "[REDACT:
Gravity]," because for many of us involved in this film it was definitely a transformative
experience. And it's good because it took so long that if not it would be like a waste of
time. And why it really sucks is that while for a lot of these people that transformation
was wisdom, for me it was just the color of my hair.  I want to share this with all these
wise people who made this movie happen. My amazing son and co-writer, Jonás [REDACT:
Cuarón]. Sandra Bullock, Sandy, you're "[REDACT: Gravity]." You are the soul, the heart,
of the film. You're the most amazing collaborator and one of the best people I ever met.
George Clooney, for your absolute trust. David Heyman, Chivo and Tim Webber, for making
this film happen. The wise guy