# Exploratory Data Analysis

In [18]:
# Import requirements
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [19]:
# Set defaults
%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

In [20]:
# Read the three CSV extracts from the local archive/ folder.
# low_memory=False makes pandas parse each file in one pass instead of in
# chunks, so column dtypes are inferred from the whole column.
read_opts = {"low_memory": False}
movies = pd.read_csv("archive/Movie_Movies.csv", **read_opts)
genres = pd.read_csv("archive/Movie_Genres.csv", **read_opts)
ratings = pd.read_csv("archive/Movie_AdditionalRating.csv", **read_opts)

In [21]:
# Peek at the first five rows to get a feel for the columns
movies.head(n=5)

Unnamed: 0,Awards,Country,DVD,Director,Language,Plot,Poster,Production,Rated,Released,Runtime,Title,Type,Website,Year,imdbID,imdbRating,imdbVotes
0,,USA,,Rose Cummings,English,Rachel constantly hears her baby cry from the ...,,,,26 Apr 2012,20 min,Baby's Breath,movie,,2012,tt2268369,,
1,,USA,,James Byrne,,The struggle against unfortunate circumstances...,,,,,9 min,Winter Trees,movie,,2008,tt1560760,,
2,,USA,,Dimitri Buchowetzki,,,,,,27 Mar 1926,50 min,The Crown of Lies,movie,,1926,tt0016750,,
3,,USA,,Julia Hechler,English,"A Gift introduces Samuel Green, Washington Sta...",,,,27 May 2013,2 min,A Gift,movie,,2013,tt3405286,,
4,,Sri Lanka,,Udara Siriruwan,Sinhalese,,,,,20 Mar 2014,23 min,Journey,movie,,2014,tt3816698,,


In [22]:
# Column overview of `movies`: non-null counts, dtypes and memory usage
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178687 entries, 0 to 178686
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Awards      20340 non-null   object 
 1   Country     165998 non-null  object 
 2   DVD         15949 non-null   object 
 3   Director    151353 non-null  object 
 4   Language    153196 non-null  object 
 5   Plot        97476 non-null   object 
 6   Poster      48414 non-null   object 
 7   Production  16446 non-null   object 
 8   Rated       23906 non-null   object 
 9   Released    116974 non-null  object 
 10  Runtime     119497 non-null  object 
 11  Title       178686 non-null  object 
 12  Type        178686 non-null  object 
 13  Website     6747 non-null    object 
 14  Year        178686 non-null  object 
 15  imdbID      178686 non-null  object 
 16  imdbRating  62073 non-null   float64
 17  imdbVotes   62029 non-null   object 
dtypes: float64(1), object(17)
memory usage: 24.5

In [23]:
# Summary statistics. describe() only covers numeric columns by default,
# so at this point only imdbRating (the sole float64 column) is summarised.
movies.describe()

Unnamed: 0,imdbRating
count,62073.0
mean,6.416236
std,1.360472
min,1.0
25%,5.6
50%,6.5
75%,7.3
max,10.0


In [24]:
# Number of missing values per column
movies.isna().sum()

Awards        158347
Country        12689
DVD           162738
Director       27334
Language       25491
Plot           81211
Poster        130273
Production    162241
Rated         154781
Released       61713
Runtime        59190
Title              1
Type               1
Website       171940
Year               1
imdbID             1
imdbRating    116614
imdbVotes     116658
dtype: int64

In [12]:
# Rows that have no recorded IMDb vote count
movies.loc[movies['imdbVotes'].isnull()]

Unnamed: 0,Awards,Country,DVD,Director,Language,Plot,Poster,Production,Rated,Released,Runtime,Title,Type,Website,Year,imdbID,imdbRating,imdbVotes
0,,USA,,Rose Cummings,English,Rachel constantly hears her baby cry from the ...,,,,26 Apr 2012,20 min,Baby's Breath,movie,,2012,tt2268369,,
1,,USA,,James Byrne,,The struggle against unfortunate circumstances...,,,,,9 min,Winter Trees,movie,,2008,tt1560760,,
2,,USA,,Dimitri Buchowetzki,,,,,,27 Mar 1926,50 min,The Crown of Lies,movie,,1926,tt0016750,,
3,,USA,,Julia Hechler,English,"A Gift introduces Samuel Green, Washington Sta...",,,,27 May 2013,2 min,A Gift,movie,,2013,tt3405286,,
4,,Sri Lanka,,Udara Siriruwan,Sinhalese,,,,,20 Mar 2014,23 min,Journey,movie,,2014,tt3816698,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177194,1 nomination.,USA,,Paula Saslow,Spanish,,https://images-na.ssl-images-amazon.com/images...,,,10 Jan 2016,,Diablo,movie,,2016,tt3813570,,
177199,,"China, South Korea",17 May 2016,Kim Daewoo,English,,,A24 Films,,,,The Witch,movie,http://thewitch-movie.com/,2016,tt5092276,,
177387,,USA,30 May 2017,Jess Carson,English,Quinn and Julie connect after their respective...,,Open Road Films,,,,Collide,movie,http://collidefilm.com/,2017,tt2834052,,
177395,,UK,,Iannis Aliferis,English,A career soldier returns to his loving and emp...,https://images-na.ssl-images-amazon.com/images...,Petit Film,,02 Dec 2017,5 min,Raw,movie,,2017,tt7650514,,


In [25]:
# Convert imdbVotes from strings with thousands separators (e.g. "1,929,977")
# to numbers. The comma stripping is skipped when the column is already numeric
# so that re-running this cell is harmless: the `.str` accessor raises
# AttributeError on a float column, which made the original cell run-once only.
movies['imdbVotes'] = pd.to_numeric(
    movies['imdbVotes']
    if pd.api.types.is_numeric_dtype(movies['imdbVotes'])
    else movies['imdbVotes'].str.replace(",", "", regex=False)
)

In [26]:
# Distribution of the vote counts
movies['imdbVotes'].describe()

count    6.202900e+04
mean     7.704287e+03
std      4.844430e+04
min      5.000000e+00
25%      1.200000e+01
50%      3.900000e+01
75%      2.630000e+02
max      1.929977e+06
Name: imdbVotes, dtype: float64

In [17]:
# Top 10 highest-rated titles among those with more than 1000 votes.
# DataFrame.filter() selects by row/column *label*, not by a condition on the
# values, and the previous expression 'imdbVotes' > 1000 compared a string
# literal with an int (TypeError). A boolean mask on the column is what is
# needed. This requires imdbVotes to be numeric, i.e. the conversion cell
# earlier in the notebook must have been run.
(movies
 .loc[movies['imdbVotes'] > 1000]
 .sort_values("imdbRating", ascending=False)
 .head(10))

In [11]:
# All titles ordered from best to worst IMDb rating; rows without a rating
# end up at the bottom of the result.
movies.sort_values(by="imdbRating", ascending=False)

Unnamed: 0,Awards,Country,DVD,Director,Language,Plot,Poster,Production,Rated,Released,Runtime,Title,Type,Website,Year,imdbID,imdbRating,imdbVotes
147179,,Greece,,Leonard Thimo,"Greek, English",,,,,01 Mar 2014,,The Twenty-Eight Hits for Laughs 4th Season,movie,,2014,tt3565880,10.0,7
39888,,USA,,,English,,https://images-na.ssl-images-amazon.com/images...,,,09 Dec 2016,6 min,Fate of Revenge,movie,,2016,tt6323164,10.0,6
90847,,USA,,Ray Ellingsen,English,Reach your full career potential by developing...,https://images-na.ssl-images-amazon.com/images...,,,01 Sep 2007,60 min,Skills for Actors: Voice Skills,movie,,2007,tt1260398,10.0,6
124395,1 win.,USA,,Lukas Hassel,English,The event on a young boy's birthday has conseq...,https://images-na.ssl-images-amazon.com/images...,,,10 Aug 2017,,"The Son, the Father",movie,,2017.0,tt6860566,10.0,6
30948,,Australia,,Neil Mansfield,,A pretty young lady rides her old blue bicycle...,,,,,6 min,The Girl in the Sun,movie,,2013,tt2705624,10.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177194,1 nomination.,USA,,Paula Saslow,Spanish,,https://images-na.ssl-images-amazon.com/images...,,,10 Jan 2016,,Diablo,movie,,2016,tt3813570,,
177199,,"China, South Korea",17 May 2016,Kim Daewoo,English,,,A24 Films,,,,The Witch,movie,http://thewitch-movie.com/,2016,tt5092276,,
177387,,USA,30 May 2017,Jess Carson,English,Quinn and Julie connect after their respective...,,Open Road Films,,,,Collide,movie,http://collidefilm.com/,2017,tt2834052,,
177395,,UK,,Iannis Aliferis,English,A career soldier returns to his loving and emp...,https://images-na.ssl-images-amazon.com/images...,Petit Film,,02 Dec 2017,5 min,Raw,movie,,2017,tt7650514,,


In [None]:
# Missing values per column
movies.isna().sum()

In [None]:
# The ten most frequent values in the Director column
movies['Director'].value_counts().head(10)

In [None]:
# Ten most frequent directors as a two-column table, rendered without the
# positional index.
director_counts = (
    movies['Director']
    .value_counts()
    .head(10)
    .to_frame()
    .reset_index()
)
# Assign both labels explicitly: the names produced by reset_index() differ
# between pandas versions.
director_counts.columns = ['Director', 'NumberOfMoviesProduced']
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
# Styler.hide(axis="index") is its replacement.
director_counts.style.hide(axis="index")

In [None]:
# Column overview of the additional-ratings table (non-null counts and dtypes)
ratings.info()

In [None]:
# Drop the 'Unnamed: 0' column (presumably the index column written out with
# the CSV; verify against ratings.info()). Rebinding with errors='ignore'
# instead of drop(..., inplace=True) keeps this cell re-runnable: a second run
# used to raise KeyError because the column was already gone.
ratings = ratings.drop(columns='Unnamed: 0', errors='ignore')

# Attach the movie metadata to every rating row. The left join on imdbID keeps
# all rows of `ratings`, including any whose imdbID has no match in `movies`.
movies_rating = (ratings
                  .set_index("imdbID")
                  .join(movies.set_index("imdbID"),
                        how="left")
                 )

In [None]:
# First five rows of the joined ratings + movie metadata table
movies_rating.head(n=5)


