In [1]:
# Import python libraries
import numpy as np
import pandas as pd
%matplotlib notebook

In [3]:
# Load the raw movie records from the CSV file, decoding it as UTF-8.
movie_csv_path = '../datasets/movie_dataset.csv'
dataset = pd.read_csv(movie_csv_path, encoding='utf-8')

In [33]:
# Show the frame as the cell's rich output (pandas elides the middle rows).
# NOTE(review): the saved output of this cell already contains the actor1/actor2/
# actor3 and GOB columns, which are only created by cells further down - the cell
# was executed out of order, so re-run the notebook top to bottom before trusting it.
dataset

Unnamed: 0,director_name,duration,gross,genres,movie_title,title_year,country,budget,imdb_score,movie_facebook_likes,actor1,actor2,actor3,GOB
0,Martin Scorsese,240.0,116866727.0,Biography|Comedy|Crime|Drama,The Wolf of Wall Street,2013,USA,100000000.0,8.2,138000,Leonardo DiCaprio,Matthew McConaughey,Jon Favreau,1.168667
1,Shane Black,195.0,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,USA,200000000.0,7.2,95000,Robert Downey Jr.,Jon Favreau,Don Cheadle,2.044961
2,Quentin Tarantino,187.0,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,USA,44000000.0,7.9,114000,Craig Stark,Jennifer Jason Leigh,Zoë Bell,1.229913
3,Kenneth Lonergan,186.0,46495.0,Drama,Margaret,2011,USA,14000000.0,6.5,0,Matt Damon,Kieran Culkin,John Gallagher Jr.,0.003321
4,Peter Jackson,186.0,258355354.0,Adventure|Fantasy,The Hobbit: The Desolation of Smaug,2013,USA,225000000.0,7.9,83000,Aidan Turner,Adam Brown,James Nesbitt,1.148246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Steve McQueen,134.0,56667870.0,Biography|Drama|History,12 Years a Slave,2013,USA,20000000.0,8.1,83000,Quvenzhané Wallis,Scoot McNairy,Taran Killam,2.833394
95,Richard J. Lewis,134.0,7501404.0,Comedy|Drama,Barney's Version,2010,CANADA,0.0,7.3,0,Mark Addy,Atom Egoyan,Paul Gross,0.000000
96,Paul Greengrass,134.0,107100855.0,Biography|Drama|Thriller,Captain Phillips,2013,USA,55000000.0,7.9,65000,Tom Hanks,Chris Mulkey,Michael Chernus,1.947288
97,David Ayer,134.0,85707116.0,Action|Drama|War,Fury,2014,USA,68000000.0,7.6,82000,Brad Pitt,Logan Lerman,Jim Parrack,1.260399


In [9]:
# List each distinct value of the 'language' column (first occurrence kept).
dataset.loc[:, 'language'].drop_duplicates(keep='first')

0    English
Name: language, dtype: object

In [10]:
# Remove the attributes considered useless for this analysis.
dataset.drop(columns=['color', 'language'], inplace=True)

In [12]:
# Handle text attributes: replace missing director names with an empty string.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# works on an intermediate object, emits a FutureWarning on pandas 2.x and leaves
# `dataset` unchanged once Copy-on-Write is enabled.
dataset['director_name'] = dataset['director_name'].fillna('')

In [13]:
# Handle numeric attributes: a missing gross is stored as 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# works on an intermediate object, emits a FutureWarning on pandas 2.x and leaves
# `dataset` unchanged once Copy-on-Write is enabled.
dataset['gross'] = dataset['gross'].fillna(0)

In [15]:
# A missing budget is stored as 0 (the GOB cell treats a zero budget specially).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# works on an intermediate object, emits a FutureWarning on pandas 2.x and leaves
# `dataset` unchanged once Copy-on-Write is enabled.
dataset['budget'] = dataset['budget'].fillna(0)

In [16]:
# Unify country names: compare and store them in upper case only.
country_upper = dataset['country'].str.upper()
dataset['country'] = country_upper


In [18]:
# Collapse the long spelling 'UNITED STATES' onto the short code 'USA';
# every other country value is left as it is.
is_united_states = dataset['country'] == 'UNITED STATES'
dataset.loc[is_united_states, 'country'] = 'USA'

In [20]:
# Bad data entry: blank out placeholder director names and remove the stray
# 'Â' character from movie titles.
placeholder_names = ['N/A', 'Nan', 'Null']
is_placeholder = dataset['director_name'].isin(placeholder_names)
dataset.loc[is_placeholder, 'director_name'] = ''
# 'Â' is matched literally, not as a regular expression.
dataset['movie_title'] = dataset['movie_title'].str.replace('Â', '', regex=False)


In [22]:
# Handling outliers, step 1: cast the columns that are compared against thresholds
# and divided in later cells to float.
for float_column in ('gross', 'duration', 'budget'):
    dataset[float_column] = dataset[float_column].astype(float)

In [28]:
# Replace implausible values with the sentinel 0:
#   duration   -> at most 10, or above 300
#   imdb_score -> zero or negative
#   title_year -> earlier than 2010
implausible_duration = (dataset['duration'] <= 10) | (dataset['duration'] > 300)
dataset.loc[implausible_duration, 'duration'] = 0
dataset.loc[dataset['imdb_score'] <= 0, 'imdb_score'] = 0
dataset.loc[dataset['title_year'] < 2010, 'title_year'] = 0

In [27]:
# Inspect the rows whose title_year is below 2010.
# NOTE(review): this cell's execution count (27) precedes the outlier cell above (28);
# its saved output shows years 202 and 205, which that cell overwrites with 0.
dataset.loc[dataset['title_year'].lt(2010)]

Unnamed: 0,director_name,duration,gross,genres,movie_title,title_year,country,budget,imdb_score,actors,movie_facebook_likes
5,,183.0,330249062.0,Action|Adventure|Sci-Fi,Batman v Superman: Dawn of Justice,202,USA,250000000.0,6.9,"Henry Cavill,Lauren Cohan,Alan D. Purwin",197000
81,Daniel Espinosa,137.0,1206135.0,Crime|Drama|Thriller,Child 44,205,CZECH REPUBLIC,50000000.0,6.4,"Tom Hardy,Fares Fares,Michael Nardone",18000


In [30]:
# Splitting actors: cut the comma-separated 'actors' string at the first two commas
# (n=2) and store the resulting parts in the columns actor1, actor2 and actor3.
actor_list = dataset['actors'].str.split(',', n=2, expand=True)
for part_index, actor_column in enumerate(('actor1', 'actor2', 'actor3')):
    dataset[actor_column] = actor_list[part_index]
# The combined column is removed once its parts have been copied out.
dataset.drop('actors', axis=1, inplace=True)

In [32]:
# Add a new metric GOB (Gross over Budget): gross / budget, or 0 when budget is 0.
# Computed column-wise instead of with a row-by-row apply(axis=1) lambda: the
# division runs once over the whole column, and `where` keeps the ratio only for
# rows whose budget is non-zero, writing 0 for the others (the same guard the
# lambda applied). A NaN budget still yields NaN, as before.
dataset['GOB'] = (dataset['gross'] / dataset['budget']).where(dataset['budget'] != 0, 0)


In [35]:
# Keep the 15 movies with the highest GOB, ordered from highest to lowest.
top_GOB = dataset.sort_values(by='GOB', ascending=False).iloc[:15]

# dataset['title_year'] = dataset['title_year'].apply(np.int64)
# dataset['duration'] = dataset['duration'].apply(np.int64)

Unnamed: 0,director_name,duration,gross,genres,movie_title,title_year,country,budget,imdb_score,movie_facebook_likes,actor1,actor2,actor3,GOB
43,Tate Taylor,146.0,169705587.0,Drama,The Help,2011,USA,25000000.0,8.1,75000,Emma Stone,Bryce Dallas Howard,Mike Vogel,6.788223
15,Richard Linklater,165.0,25359200.0,Drama,Boyhood,2014,USA,4000000.0,8.0,92000,Ellar Coltrane,Lorelei Linklater,Libby Villari,6.3398
14,F. Gary Gray,167.0,161029270.0,Biography|Crime|Drama|History|Music,Straight Outta Compton,2015,USA,28000000.0,7.9,76000,Aldis Hodge,Neil Brown Jr.,R. Marcos Taylor,5.751045
59,,142.0,407999255.0,Adventure|Drama|Sci-Fi|Thriller,The Hunger Games,2012,USA,78000000.0,7.3,140000,Jennifer Lawrence,Josh Hutcherson,Anthony Reynolds,5.23076
75,,138.0,150117807.0,Crime|Drama,American Hustle,2013,USA,40000000.0,7.3,63000,Jennifer Lawrence,Christian Bale,Bradley Cooper,3.752945
44,Francis Lawrence,146.0,424645577.0,Adventure|Sci-Fi|Thriller,The Hunger Games: Catching Fire,2013,USA,130000000.0,7.6,82000,Jennifer Lawrence,Josh Hutcherson,Sandra Ellis Lafferty,3.266504
76,Robert Zemeckis,138.0,93749203.0,Drama|Thriller,Flight,2012,USA,31000000.0,7.3,64000,Denzel Washington,Bruce Greenwood,Nadine Velazquez,3.024168
94,Steve McQueen,134.0,56667870.0,Biography|Drama|History,12 Years a Slave,2013,USA,20000000.0,8.1,83000,Quvenzhané Wallis,Scoot McNairy,Taran Killam,2.833394
8,Joss Whedon,173.0,623279547.0,Action|Adventure|Sci-Fi,The Avengers,2012,USA,220000000.0,8.1,123000,Chris Hemsworth,Robert Downey Jr.,Scarlett Johansson,2.833089
9,Joss Whedon,173.0,623279547.0,Action|Adventure|Sci-Fi,The Avengers,2012,USA,220000000.0,8.1,123000,Chris Hemsworth,Robert Downey Jr.,Scarlett Johansson,2.833089


In [None]:
# Persist the cleaned frame as CSV in the working directory; the row index is
# written as well (pandas' default), so it appears as the first column of the file.
output_csv_path = 'output_IMDB.csv'
dataset.to_csv(output_csv_path, index=True)