## Basic Info

In [2]:
import pandas as pd

In [3]:
# Load the tab-separated movie table. No header row is read from the file:
# the column names are supplied here and the ID column becomes the index.
df = pd.read_csv(
    "movie_data.txt",
    sep="\t",
    names=["ID", "title", "year", "rating", "votes", "duration", "genres"],
    index_col="ID",
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0_level_0,title,year,rating,votes,duration,genres
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption (1994),1994,9.2,619479,142 mins.,Crime|Drama
tt0110912,Pulp Fiction (1994),1994,9.0,490065,154 mins.,Crime|Thriller
tt0137523,Fight Club (1999),1999,8.8,458173,139 mins.,Drama|Mystery|Thriller
tt0133093,The Matrix (1999),1999,8.7,448114,136 mins.,Action|Adventure|Sci-Fi
tt1375666,Inception (2010),2010,8.9,385149,148 mins.,Action|Adventure|Sci-Fi|Thriller


In [5]:
# Preview the last eleven rows of the frame.
df.tail(n=11)

Unnamed: 0_level_0,title,year,rating,votes,duration,genres
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0101356,Another You (1991),1991,4.9,1359,98 mins.,Comedy|Crime
tt0421090,Zerophilia (2005),2005,6.3,1359,90 mins.,Comedy|Romance
tt0067227,The Merchant of Four Seasons (1971),1971,7.6,1359,88 mins.,Drama
tt0339727,Stateside (2004),2004,5.8,1358,97 mins.,Drama|Music|Romance
tt0218581,Scarlet Diva (2000),2000,5.2,1358,91 mins.,Drama
tt0118635,Aprile (1998),1998,6.7,1358,78 mins.,Comedy
tt0807721,Meduzot (2007),2007,7.0,1357,78 mins.,Drama
tt0339642,Daltry Calhoun (2005),2005,5.2,1357,100 mins.,Comedy|Drama|Music|Romance
tt0060880,The Quiller Memorandum (1966),1966,6.5,1356,104 mins.,Drama|Mystery|Thriller
tt0152836,Taal (1999),1999,6.5,1356,179 mins.,Musical|Romance


In [6]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, tt0111161 to tt0279977
Data columns (total 6 columns):
title       10000 non-null object
year        10000 non-null int64
rating      10000 non-null float64
votes       10000 non-null int64
duration    10000 non-null object
genres      9999 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 546.9+ KB


## Cleaning

In [7]:
# Discard every column that holds at least one missing value.
# NOTE(review): the info() summary reports a null only in `genres`
# (9999 of 10000 non-null), so this removes the entire genres column --
# confirm that is intended rather than dropping the one incomplete row.
df = df.dropna(axis="columns")
print(df.columns)

Index(['title', 'year', 'rating', 'votes', 'duration'], dtype='object')


In [8]:
# Convert duration strings such as "142 mins." to an integer number of
# seconds: take the leading number (minutes) and multiply by 60.
df["duration"] = df["duration"].str.split(" ").str[0].astype(int) * 60

# Remove only the trailing "(year)" suffix from each title. Splitting on the
# LAST "(" keeps titles that themselves contain parentheses intact (splitting
# on the first one would truncate them, possibly to an empty string), and
# strip() removes the space that preceded the suffix.
df["title"] = df["title"].str.rsplit("(", n=1).str[0].str.strip()

print(df.columns)
df.head()

Index(['title', 'year', 'rating', 'votes', 'duration'], dtype='object')


Unnamed: 0_level_0,title,year,rating,votes,duration
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0111161,The Shawshank Redemption,1994,9.2,619479,8520
tt0110912,Pulp Fiction,1994,9.0,490065,9240
tt0137523,Fight Club,1999,8.8,458173,8340
tt0133093,The Matrix,1999,8.7,448114,8160
tt1375666,Inception,2010,8.9,385149,8880


## Concatenating and Merging Data Frames

### Loading Movie Crew data

In [9]:
# Load the crew table, skipping the file's first line and naming the columns
# explicitly. Only ID and directors are kept; ID becomes the index.
movie_crew_df = pd.read_csv(
    "crew_data.tsv",
    sep="\t",
    skiprows=1,
    names=["ID", "directors", "writers"],
    usecols=["ID", "directors"],
    index_col=0,
)
movie_crew_df.columns

Index(['directors'], dtype='object')

In [10]:
# Preview the first five crew rows.
movie_crew_df.head(n=5)

Unnamed: 0_level_0,directors
ID,Unnamed: 1_level_1
tt0000001,nm0005690
tt0000002,nm0721526
tt0000003,nm0721526
tt0000004,nm0721526
tt0000005,nm0005690


### Loading Person Data

In [11]:
# Load the person table (ID and primary name only), skipping the file's
# first line, and give the columns the names used by the rest of the notebook.
# person_ID deliberately stays a regular column because the later merge joins
# on it via right_on="person_ID".
# (The former `person_df.set_index = 0` did not set an index; it overwrote the
# DataFrame's set_index method on this instance with the integer 0.)
person_df = pd.read_csv(
    "name_data.tsv",
    delimiter="\t",
    names=["nconst", "primaryName"],
    usecols=["nconst", "primaryName"],
    skiprows=1,
).rename(columns={"nconst": "person_ID", "primaryName": "name"})

In [12]:
# Preview the first five person rows.
person_df.head(n=5)

Unnamed: 0,person_ID,name
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman


### Merging Crew and Movie Data

In [13]:
# Attach the directors column by matching the movie-ID index of both frames.
# The default inner join keeps only movies present in both tables.
df = df.merge(
    movie_crew_df,
    how="inner",
    left_index=True,
    right_index=True,
)
df.head(n=5)

Unnamed: 0_level_0,title,year,rating,votes,duration,directors
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption,1994,9.2,619479,8520,nm0001104
tt0110912,Pulp Fiction,1994,9.0,490065,9240,nm0000233
tt0137523,Fight Club,1999,8.8,458173,8340,nm0000399
tt0133093,The Matrix,1999,8.7,448114,8160,"nm0905154,nm0905152"
tt1375666,Inception,2010,8.9,385149,8880,nm0634240


#### Dealing with multiple directors

In [14]:
# Keep only movies with a single director: a comma in `directors` marks a
# list of several person IDs. The comma is matched literally, and na=True
# treats a missing value as "multiple" so such rows are dropped, exactly as
# the previous `== False` comparison did.
has_several_directors = df["directors"].str.contains(",", regex=False, na=True)

# Filter and rename in one chained expression; this returns a new frame
# instead of renaming a filtered slice in place.
df = df[~has_several_directors].rename(columns={"directors": "director_id"})
df.head()

Unnamed: 0_level_0,title,year,rating,votes,duration,director_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption,1994,9.2,619479,8520,nm0001104
tt0110912,Pulp Fiction,1994,9.0,490065,9240,nm0000233
tt0137523,Fight Club,1999,8.8,458173,8340,nm0000399
tt1375666,Inception,2010,8.9,385149,8880,nm0634240
tt0109830,Forrest Gump,1994,8.7,368994,8520,nm0000709


### Merging with Person Data

In [15]:
# Look up each director's name by joining director_id against person_ID.
# NOTE(review): as the output below shows, the result carries a fresh integer
# index, so the movie-ID index is no longer present after this merge.
df = df.merge(
    person_df,
    left_on="director_id",
    right_on="person_ID",
)
df.head(n=5)

Unnamed: 0,title,year,rating,votes,duration,director_id,person_ID,name
0,The Shawshank Redemption,1994,9.2,619479,8520,nm0001104,nm0001104,Frank Darabont
1,The Green Mile,1999,8.4,243660,11340,nm0001104,nm0001104,Frank Darabont
2,The Mist,2007,7.4,90987,7560,nm0001104,nm0001104,Frank Darabont
3,The Majestic,2001,6.8,27241,9120,nm0001104,nm0001104,Frank Darabont
4,Pulp Fiction,1994,9.0,490065,9240,nm0000233,nm0000233,Quentin Tarantino


In [16]:
# The merged `name` column holds the director's name; label it accordingly.
df = df.rename(columns={"name": "director"})
df.head(n=5)

Unnamed: 0,title,year,rating,votes,duration,director_id,person_ID,director
0,The Shawshank Redemption,1994,9.2,619479,8520,nm0001104,nm0001104,Frank Darabont
1,The Green Mile,1999,8.4,243660,11340,nm0001104,nm0001104,Frank Darabont
2,The Mist,2007,7.4,90987,7560,nm0001104,nm0001104,Frank Darabont
3,The Majestic,2001,6.8,27241,9120,nm0001104,nm0001104,Frank Darabont
4,Pulp Fiction,1994,9.0,490065,9240,nm0000233,nm0000233,Quentin Tarantino


#### Extra clean up

In [17]:
# person_ID duplicates director_id after the merge, so remove it.
df = df.drop("person_ID", axis="columns")
df.head(n=5)

Unnamed: 0,title,year,rating,votes,duration,director_id,director
0,The Shawshank Redemption,1994,9.2,619479,8520,nm0001104,Frank Darabont
1,The Green Mile,1999,8.4,243660,11340,nm0001104,Frank Darabont
2,The Mist,2007,7.4,90987,7560,nm0001104,Frank Darabont
3,The Majestic,2001,6.8,27241,9120,nm0001104,Frank Darabont
4,Pulp Fiction,1994,9.0,490065,9240,nm0000233,Quentin Tarantino


## Data Exploration

### Ten longest movies

In [19]:
# Order movies from longest to shortest and show the top ten.
result = df.sort_values(by="duration", ascending=False)
result.head(n=10)

Unnamed: 0,title,year,rating,votes,duration,director_id,director
8060,Satantango,1994,8.5,2698,27000,nm0850601,Béla Tarr
7959,War and Peace,1967,7.8,2833,25620,nm0094083,Sergey Bondarchuk
5230,Gettysburg,1993,7.7,12093,15660,nm0561813,Ron Maxwell
1941,Hamlet,1996,7.7,19698,14520,nm0000110,Kenneth Branagh
6246,Love Exposure,2008,8.0,1922,14220,nm0814469,Sion Sono
8132,La Belle Noiseuse,1991,7.6,2621,14160,nm0729626,Jacques Rivette
6126,Ludwig,1972,7.6,1581,14100,nm0899581,Luchino Visconti
341,Once Upon a Time in America,1984,8.4,85533,13740,nm0001466,Sergio Leone
4881,Lagaan: Once Upon a Time in India,2001,8.1,14190,13440,nm0332950,Ashutosh Gowariker
3956,The Ten Commandments,1956,7.9,22789,13200,nm0001124,Cecil B. DeMille


### Best rated movies ordered by Rating and Title

In [20]:
# Highest rating first; ties are ordered alphabetically by title.
result = df.sort_values(
    by=["rating", "title"],
    ascending=[False, True],
)
result.head(n=10)

Unnamed: 0,title,year,rating,votes,duration,director_id,director
145,The Godfather,1972,9.2,474189,10500,nm0000338,Francis Ford Coppola
0,The Shawshank Redemption,1994,9.2,619479,8520,nm0001104,Frank Darabont
5636,Outrageous Class,1975,9.0,9823,5220,nm0251027,Ertem Egilmez
4,Pulp Fiction,1994,9.0,490065,9240,nm0000233,Quentin Tarantino
146,The Godfather: Part II,1974,9.0,291169,12000,nm0000338,Francis Ford Coppola
339,"The Good, the Bad and the Ugly",1966,9.0,195238,9660,nm0001466,Sergio Leone
631,12 Angry Men,1957,8.9,148155,5760,nm0001486,Sidney Lumet
18,Inception,2010,8.9,385149,8880,nm0634240,Christopher Nolan
215,One Flew Over the Cuckoo's Nest,1975,8.9,255503,7980,nm0001232,Milos Forman
52,Schindler's List,1993,8.9,325888,11700,nm0000229,Steven Spielberg


### Average Duration of a Movie

In [28]:
# `duration` was converted to seconds during cleaning, so the raw mean is in
# seconds and dividing by 60 expresses it in minutes.
result = df["duration"].mean()
print(result)
print(f"The average duration of a movie is {result/60}")

6242.946031746032
The average duration of a movie is 104.04910052910053


### Ten most productive directors

In [42]:
# Count the rows per director name and expose the tally as a `count` column.
# NOTE(review): grouping is by name, so two different people sharing a name
# would be counted together -- director_id may be the safer key; confirm.
grouped = (
    df.groupby("director")[["director"]]
    .count()
    .rename(columns={"director": "count"})
)
grouped.head(n=5)

Unnamed: 0_level_0,count
director,Unnamed: 1_level_1
A.R. Murugadoss,1
Aaron Norris,3
Aaron Schneider,1
Abbas Kiarostami,7
Abbas Tyrewala,1


In [43]:
# Directors with the most movies first; show the top ten.
result = grouped.sort_values(by="count", ascending=False)
result.head(n=10)

Unnamed: 0_level_0,count
director,Unnamed: 1_level_1
Woody Allen,40
Clint Eastwood,31
Sidney Lumet,27
Steven Spielberg,24
Robert Altman,24
Brian De Palma,23
John Huston,23
Joel Schumacher,22
Martin Scorsese,21
Blake Edwards,21


### How many movies were made in the 2000s

In [61]:
# Count movies whose year lies in 2000-2009 (both bounds inclusive).
result = df["year"].between(2000, 2009).sum()
print(f"The number of movies made in the 2000s is {result}")

The number of movies made in the 2000s is 3903


### Movies made by Akira Kurosawa ordered by year DESC

In [64]:
# Select Akira Kurosawa's movies, keep the three columns of interest and list
# them from most recent to oldest.
result = df.loc[
    df["director"] == "Akira Kurosawa",
    ["title", "year", "director"],
].sort_values(by="year", ascending=False)
result.head(n=20)

Unnamed: 0,title,year,director
868,Madadayo,1993,Akira Kurosawa
866,Rhapsody in August,1991,Akira Kurosawa
855,Ran,1985,Akira Kurosawa
860,Kagemusha,1980,Akira Kurosawa
862,Dersu Uzala,1975,Akira Kurosawa
867,Dodes'ka-den,1970,Akira Kurosawa
864,Red Beard,1965,Akira Kurosawa
863,High and Low,1963,Akira Kurosawa
861,Sanjuro,1962,Akira Kurosawa
856,Yojimbo,1961,Akira Kurosawa
