# TDD with Assert
A quick introduction to using `assert` in Jupyter notebooks to take advantage of Test-Driven Development (TDD).

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

The `assert` statement tests whether a condition is true. If it is not, Python raises an `AssertionError`. This makes it a useful tool for debugging code:

In [2]:
# `assert` is a statement, not a function, so it is written without call-style parentheses.
# A true condition passes silently.
assert 5 < 6
# A false condition raises AssertionError -- this failure is the intended demonstration.
assert 10 > 20

AssertionError: 

Importing and preparing the example dataset.

This dataset contains IMDb movie data from 2006 to 2016 and was downloaded from https://www.kaggle.com/PromptCloudHQ/imdb-data.

In [3]:
# Read the IMDb CSV export (expected next to this notebook) into a DataFrame
# and preview its first five rows.
imdb_df = pd.read_csv(filepath_or_buffer="imdb_movie_dataset.csv")
display(imdb_df.head(5))


Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [39]:
# Create a separate row for each (title, actor) pair.
# The split is done through .assign() on a fresh frame: assigning the column
# directly into a slice of imdb_df can trigger a SettingWithCopyWarning, which
# the global warnings filter would silently hide.
# NOTE(review): split(',') keeps the blank that follows each comma in the raw
# data, so every actor listed after the first keeps a leading space
# (' Vin Diesel' != 'Vin Diesel'). Stripping would change the results of the
# cells below, so the values are left untouched here -- confirm whether
# .str.strip() should be applied.
title_actor_df = (
    imdb_df[['Title', 'Actors']]
    .assign(Actors=lambda frame: frame['Actors'].str.split(','))
    .explode('Actors')  # one row per list element; the source row index is repeated
)

imdb_movies_df = imdb_df[['Title', 'Director', 'Year', 'Runtime (Minutes)', 'Rating', 'Revenue (Millions)']]

# Attach the movie information such as revenue, runtime etc. to every actor row.
# The join is aligned on the preserved source row index rather than on 'Title',
# so rows cannot be multiplied if two movies happen to share a title.
imdb_actors_df = (
    title_actor_df
    .join(imdb_movies_df.drop(columns='Title'))
    .reset_index(drop=True)
    .rename(columns={'Runtime (Minutes)': 'Runtime', 'Revenue (Millions)': 'Revenue'})
    .rename(columns=str.lower)
)

# Draw 5 random rows and redraw until at least one actor appears more than once,
# so the preview shows that an actor can have several movies.
sample = imdb_actors_df.sample(5)
while sample['actors'].is_unique:
    sample = imdb_actors_df.sample(5)

display(sample)


Unnamed: 0,title,actors,director,year,runtime,rating,revenue
1462,Star Trek Into Darkness,Zoe Saldana,J.J. Abrams,2013,132,7.8,228.76
3001,Ouija: Origin of Evil,Annalise Basso,Mike Flanagan,2016,99,6.1,34.9
1288,Fast Five,Vin Diesel,Justin Lin,2011,131,7.3,209.81
1312,The Last Witch Hunter,Vin Diesel,Breck Eisner,2015,106,6.0,27.36
3929,Miracles from Heaven,Martin Henderson,Patricia Riggen,2016,109,7.0,61.69


The task: Get a table with each actor's last movie.

In [40]:
# check values for one actor to define a testcase.
# Brad Pitt's last movie was 'Allied' which can be used for the testcase.
display(imdb_actors_df[imdb_actors_df['actors'] == 'Brad Pitt'].sort_values('year'))

Unnamed: 0,title,actors,director,year,runtime,rating,revenue
3547,Babel,Brad Pitt,Alejandro González Iñárritu,2006,143,7.5,34.3
2464,The Assassination of Jesse James by the Coward...,Brad Pitt,Andrew Dominik,2007,160,7.5,3.9
1712,The Curious Case of Benjamin Button,Brad Pitt,David Fincher,2008,166,7.8,127.49
308,Inglourious Basterds,Brad Pitt,Quentin Tarantino,2009,153,8.3,120.52
2348,Moneyball,Brad Pitt,Bennett Miller,2011,133,7.6,75.61
3551,The Tree of Life,Brad Pitt,Terrence Malick,2011,139,6.8,13.3
1740,World War Z,Brad Pitt,Marc Forster,2013,116,7.0,202.35
972,Fury,Brad Pitt,David Ayer,2014,134,7.6,85.71
284,Allied,Brad Pitt,Robert Zemeckis,2016,124,7.1,40.07


In [41]:
def brad_pitt_test(last_movies):
    """Check that a "last movie per actor" table is correct for Brad Pitt.

    Parameters
    ----------
    last_movies : pandas.DataFrame
        Table with at least the columns 'actors' and 'title'.

    Raises
    ------
    AssertionError
        If the table does not contain exactly one row for Brad Pitt, or if
        that row is not the movie 'Allied'.
    """
    brad_pitt_rows = last_movies[last_movies['actors'] == 'Brad Pitt']

    # check whether there's only one movie by Brad Pitt
    # (the comparison must sit outside len(): len(frame == 1) is just the row
    # count of a boolean frame and is truthy for any non-empty selection)
    assert len(brad_pitt_rows) == 1, \
        "expected exactly one row for Brad Pitt, found %d" % len(brad_pitt_rows)

    # check whether this movie is indeed his last movie (Allied)
    assert len(brad_pitt_rows[brad_pitt_rows['title'] == 'Allied']) == 1, \
        "Brad Pitt's last movie should be 'Allied'"

In [42]:
# Create table with last movies - first try.
# Each actor's movies get a dense rank by year; without an explicit direction
# the rank is ascending, so rank 1 marks the actor's *earliest* year. The test
# case at the end of this cell exposes that mistake.
year_rank = imdb_actors_df.groupby("actors")["year"].rank(method="dense")
imdb_actors_df['movie_number'] = year_rank
last_movies_df = imdb_actors_df.loc[imdb_actors_df['movie_number'] == 1]

# Show the results for Brad Pitt.
display(last_movies_df.loc[last_movies_df['actors'] == 'Brad Pitt'])

# Check the output with the predefined test case.
brad_pitt_test(last_movies_df)

Unnamed: 0,title,actors,director,year,runtime,rating,revenue,movie_number
3547,Babel,Brad Pitt,Alejandro González Iñárritu,2006,143,7.5,34.3,1.0


AssertionError: 

In [43]:
# Create table with last movies - second try.
# Ranking in descending order makes rank 1 the actor's most recent year.
year_rank_desc = imdb_actors_df.groupby("actors")["year"].rank(method="dense", ascending=False)
imdb_actors_df['movie_number_desc'] = year_rank_desc
last_movies_desc_df = imdb_actors_df.loc[imdb_actors_df['movie_number_desc'] == 1]

# Show the results for Brad Pitt.
display(last_movies_desc_df.loc[last_movies_desc_df['actors'] == 'Brad Pitt'])

# Check the output with the predefined test case.
brad_pitt_test(last_movies_desc_df)

Unnamed: 0,title,actors,director,year,runtime,rating,revenue,movie_number,movie_number_desc
284,Allied,Brad Pitt,Robert Zemeckis,2016,124,7.1,40.07,8.0,1.0


In [47]:
# Create table with last movies - extended version with additional filters.
# Rank each actor's movies by year, newest first (rank 1 = most recent year).
imdb_actors_df['movie_number_desc'] = (
    imdb_actors_df.groupby("actors")["year"].rank(method="dense", ascending=False)
)
last_movies_desc_df = imdb_actors_df.copy()

# For every row, look up the year of the same actor's next-older movie:
# in newest-first order, shift(-1) pulls the following row of the group up by one.
# The result is aligned back to the unsorted frame through the index.
last_movies_desc_df['prev_movie_year'] = (
    last_movies_desc_df
    .sort_values('year', ascending=False)
    .groupby(['actors'])['year']
    .shift(-1)
)
last_movies_desc_df['prev_year_diff'] = (
    last_movies_desc_df['prev_movie_year'] - last_movies_desc_df['year']
).abs()

# Keep only rows that are the actor's last movie, come more than one year
# after the previous movie, and were produced after 2015.
is_last_movie = last_movies_desc_df['movie_number_desc'] == 1
has_year_gap = last_movies_desc_df['prev_year_diff'] > 1
is_after_2015 = last_movies_desc_df['year'] > 2015
last_movies_desc_df = last_movies_desc_df[is_last_movie & has_year_gap & is_after_2015]

# Show the results for Brad Pitt.
display(last_movies_desc_df[last_movies_desc_df['actors'] == 'Brad Pitt'])

# Check the output with the predefined test case.
brad_pitt_test(last_movies_desc_df)

Unnamed: 0,title,actors,director,year,runtime,rating,revenue,movie_number,movie_number_desc,prev_movie_year,prev_year_diff
284,Allied,Brad Pitt,Robert Zemeckis,2016,124,7.1,40.07,8.0,1.0,2014.0,2.0
