# Analysis of Directors

## Top 20 Directors with the most movies in the IMDB TOP 250

In [3]:
import pandas as pd
# Analysis of directors: build one row per (movie, director) credit, then
# summarise rating, movie count and box office per director.
# Load the dataset
df = pd.read_csv("data/Top250Movies.csv")
# Preprocessing: the text "Not Available" is replaced with 0 everywhere, so a
# movie without a reported box office contributes 0 to a director's total.
df = df.replace("Not Available", 0)
# Cast to an explicit 64-bit integer. Plain `int` resolves to a 32-bit type on
# some platforms (e.g. Windows with NumPy < 2), and the largest grosses in this
# dataset exceed 2**31 - 1.
df["box_office"] = df["box_office"].astype("int64")
# Split the comma-separated "directors" field into one row per name while
# carrying the movie's name, rating and box office along in the index.
df = (
    df.set_index(["movie_name", "rating", "box_office"])["directors"]
    .str.split(",", expand=True)
    .stack()                            # long format; padding from uneven splits is dropped
    .reset_index(level=-1, drop=True)   # discard the position-within-list level
    .reset_index()
    .rename(columns={0: "director"})
)
# Aggregation
# Second pass: any split entry that is exactly "Not Available" also becomes 0.
df = df.replace("Not Available", 0)
tmp = df.groupby("director").agg(
    avg_rating=("rating", "mean"),
    movie_cnt=("movie_name", "count"),
    total_box_office=("box_office", "sum"),
)
# List the top 20 by number of movies
output = tmp.sort_values(by=["movie_cnt"], ascending=False)
output.head(20)

Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Steven Spielberg,8.371429,7,3254923011
Martin Scorsese,8.314286,7,1208286870
Stanley Kubrick,8.314286,7,196273199
Christopher Nolan,8.557143,7,4221515692
Akira Kurosawa,8.3,7,4749510
Alfred Hitchcock,8.316667,6,77131451
Quentin Tarantino,8.42,5,1145280602
Charles Chaplin,8.36,5,1555588
Billy Wilder,8.32,5,19296243
Hayao Miyazaki,8.325,4,763364320


## Top 20 Directors with the largest Box office in IMDB TOP 250

In [4]:
# Order the per-director summary by combined box office, highest first,
# and display the first 20 rows.
output = tmp.sort_values("total_box_office", ascending=False)
output.head(20)


Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Joe Russo,8.4,2,4851854139
Anthony Russo,8.4,2,4851854139
Christopher Nolan,8.557143,7,4221515692
Steven Spielberg,8.371429,7,3254923011
Peter Jackson,8.866667,3,2992606438
Pete Docter,8.2,3,2173654859
Jon Watts,8.2,1,1921847111
Lee Unkrich,8.35,2,1881653155
Lee Unkrich(co-director),8.15,2,1521345698
Joseph Kosinski,8.3,1,1488732821


## Top 20 Directors with the largest average rating in IMDB TOP 250

In [5]:
# Order the per-director summary by mean rating, highest first,
# and display the first 20 rows.
output = tmp.sort_values("avg_rating", ascending=False)
output.head(20)

Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Frank Darabont,8.95,2,315685878
Francis Ford Coppola,8.9,3,403184603
Peter Jackson,8.866667,3,2992606438
T.J. Gnanavel,8.8,1,0
Irvin Kershner,8.7,1,538375067
Lana Wachowski,8.7,1,467222728
Lilly Wachowski,8.7,1,467222728
Robert Zemeckis,8.65,2,1061563227
Fernando Meirelles,8.6,1,30680793
Masaki Kobayashi,8.6,1,0


# Analysis of Movie Stars

## Top 20 Movie Stars with the most movies in the IMDB TOP 250

In [6]:
# Build one row per (movie, cast member) credit, then summarise rating,
# movie count and box office per star.
# Load the dataset
df = pd.read_csv("data/Top250Movies.csv")
# Preprocessing: the text "Not Available" is replaced with 0 everywhere, so a
# movie without a reported box office contributes 0 to a star's total.
df = df.replace("Not Available", 0)
# Cast to an explicit 64-bit integer. Plain `int` resolves to a 32-bit type on
# some platforms (e.g. Windows with NumPy < 2), and the largest grosses in this
# dataset exceed 2**31 - 1.
df["box_office"] = df["box_office"].astype("int64")
# Split the comma-separated "casts" field into one row per name while carrying
# the movie's name, rating and box office along in the index.
df = (
    df.set_index(["movie_name", "rating", "box_office"])["casts"]
    .str.split(",", expand=True)
    .stack()                            # long format; padding from uneven splits is dropped
    .reset_index(level=-1, drop=True)   # discard the position-within-list level
    .reset_index()
    .rename(columns={0: "star"})
)
# Aggregation
# Second pass: any split entry that is exactly "Not Available" also becomes 0.
df = df.replace("Not Available", 0)
tmp = df.groupby("star").agg(
    avg_rating=("rating", "mean"),
    movie_cnt=("movie_name", "count"),
    total_box_office=("box_office", "sum"),
)
# List the top 20 by number of movies
output = tmp.sort_values(by=["movie_cnt"], ascending=False)
output.head(20)

Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
star,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Robert De Niro,8.377778,9,1579527223
Morgan Freeman,8.542857,7,3193226493
Harrison Ford,8.4,7,2799580320
John Ratzenberger,8.242857,7,4780445521
Leonardo DiCaprio,8.366667,6,2256086857
Christian Bale,8.4,6,3033797632
Michael Caine,8.583333,6,4181468614
Tom Hanks,8.45,6,2909130129
Robert Duvall,8.62,5,427474506
Takashi Shimura,8.34,5,570747


## Top 20 Movie Stars with the largest box office in IMDB TOP 250
#### Here we list only movie stars who have at least four movies in the IMDB TOP 250

In [7]:
# Keep stars with at least 4 listed movies, order them by combined box office
# (highest first), and display the first 20 rows.
output = (
    tmp.loc[tmp["movie_cnt"].ge(4)]
    .sort_values("total_box_office", ascending=False)
)
output.head(20)


Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
star,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mark Ruffalo,8.28,5,5319386805
John Ratzenberger,8.242857,7,4780445521
Michael Caine,8.583333,6,4181468614
Orlando Bloom,8.675,4,3646870453
Viggo Mortensen,8.7,4,3314359094
Morgan Freeman,8.542857,7,3193226493
Willem Dafoe,8.15,4,3175112892
Elijah Wood,8.725,4,3066643153
Christian Bale,8.4,6,3033797632
Tom Hanks,8.45,6,2909130129


## Top 20 Movie Stars with the largest average rating in IMDB TOP 250
#### Here we list only movie stars who have at least four movies in the IMDB TOP 250

In [8]:
# Keep stars with at least 4 listed movies, order them by mean rating
# (highest first), and display the first 20 rows.
output = (
    tmp.loc[tmp["movie_cnt"].ge(4)]
    .sort_values("avg_rating", ascending=False)
)
output.head(20)

Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
star,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Elijah Wood,8.725,4,3066643153
Al Pacino,8.7,4,551625256
Viggo Mortensen,8.7,4,3314359094
Orlando Bloom,8.675,4,3646870453
Robert Duvall,8.62,5,427474506
Michael Caine,8.583333,6,4181468614
Morgan Freeman,8.542857,7,3193226493
Gary Oldman,8.525,4,2480646210
Kenny Baker,8.5,4,1840946042
Tom Hanks,8.45,6,2909130129


# Analysis of Writers

## Top 20 Writers with the most movies in the IMDB TOP 250

In [9]:
# Build one row per (movie, writer) credit, then summarise rating,
# movie count and box office per writer.
# Load the dataset
df = pd.read_csv("data/Top250Movies.csv")
# Preprocessing: the text "Not Available" is replaced with 0 everywhere, so a
# movie without a reported box office contributes 0 to a writer's total.
df = df.replace("Not Available", 0)
# Cast to an explicit 64-bit integer. Plain `int` resolves to a 32-bit type on
# some platforms (e.g. Windows with NumPy < 2), and the largest grosses in this
# dataset exceed 2**31 - 1.
df["box_office"] = df["box_office"].astype("int64")
# Split the comma-separated "writers" field into one row per name while
# carrying the movie's name, rating and box office along in the index.
df = (
    df.set_index(["movie_name", "rating", "box_office"])["writers"]
    .str.split(",", expand=True)
    .stack()                            # long format; padding from uneven splits is dropped
    .reset_index(level=-1, drop=True)   # discard the position-within-list level
    .reset_index()
    .rename(columns={0: "writer"})
)
# Aggregation
# Second pass: any split entry that is exactly "Not Available" also becomes 0.
df = df.replace("Not Available", 0)
tmp = df.groupby("writer").agg(
    avg_rating=("rating", "mean"),
    movie_cnt=("movie_name", "count"),
    total_box_office=("box_office", "sum"),
)
# List the top 20 by number of movies
output = tmp.sort_values(by=["movie_cnt"], ascending=False)
output.head(20)

Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
writer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Stanley Kubrick,8.314286,7,196273199
Christopher Nolan,8.557143,7,4221515692
Akira Kurosawa,8.283333,6,4749510
Jonathan Nolan,8.58,5,3010994597
Billy Wilder,8.32,5,19296243
George Lucas,8.44,5,2652977028
Quentin Tarantino,8.42,5,1145280602
Charles Chaplin,8.36,5,1555588
Pete Docter,8.26,5,3089403335
Andrew Stanton,8.3,4,2924702537


## Top 20 Writers with the largest box office in IMDB TOP 250

In [10]:
# List the top 20 writers by combined box office.
# Rank from `tmp` (the full per-writer summary) rather than from `output`:
# `output` holds whatever the most recently executed cell left behind (possibly
# a filtered subset), so sorting it would make this table depend on the order
# in which cells were run.
output = tmp.sort_values(by=["total_box_office"], ascending=False)
output.head(20)



Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
writer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Stan Lee,8.333333,3,6773701250
Stephen McFeely,8.4,2,4851854139
Christopher Markus,8.4,2,4851854139
Christopher Nolan,8.557143,7,4221515692
Pete Docter,8.26,5,3089403335
Jonathan Nolan,8.58,5,3010994597
J.R.R. Tolkien,8.866667,3,2992606438
Fran Walsh,8.866667,3,2992606438
Philippa Boyens,8.866667,3,2992606438
Andrew Stanton,8.3,4,2924702537


## Top 20 Writers with the largest average rating in IMDB TOP 250
#### Here we list only writers who have at least two movies in the IMDB TOP 250

In [11]:
# Keep writers with at least 2 listed movies, order them by mean rating
# (highest first), and display the first 20 rows.
output = (
    tmp.loc[tmp["movie_cnt"].ge(2)]
    .sort_values("avg_rating", ascending=False)
)
output.head(20)

Unnamed: 0_level_0,avg_rating,movie_cnt,total_box_office
writer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mario Puzo,9.1,2,298303735
Frank Darabont,8.95,2,315685878
Francis Ford Coppola,8.9,3,403184603
Fran Walsh,8.866667,3,2992606438
J.R.R. Tolkien,8.866667,3,2992606438
Philippa Boyens,8.866667,3,2992606438
Roger Avary,8.6,2,216842406
Stephen King,8.6,4,415309096
Jonathan Nolan,8.58,5,3010994597
Christopher Nolan,8.557143,7,4221515692
