<b>Data Cleansing (IMDB Dataset)</b><br>
Faisal Sugangga (chomillera)<br>
IG: faisalsugangga<br>
Email: faisalsugangga@gmail.com

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from tabulate import tabulate as tbl

In [2]:
# Load the movie sample CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv('movie_sample_dataset.csv')

In [3]:
# Summarize columns, non-null counts and dtypes of the raw data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   color                 88 non-null     object 
 1   director_name         88 non-null     object 
 2   duration              99 non-null     int64  
 3   gross                 91 non-null     float64
 4   genres                98 non-null     object 
 5   movie_title           99 non-null     object 
 6   title_year            99 non-null     int64  
 7   language              99 non-null     object 
 8   country               99 non-null     object 
 9   budget                95 non-null     float64
 10  imdb_score            99 non-null     float64
 11  actors                99 non-null     object 
 12  movie_facebook_likes  99 non-null     int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 10.2+ KB


In [4]:
# Preview the first rows of the raw data.
dataset.head()

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,language,country,budget,imdb_score,actors,movie_facebook_likes
0,Color,Martin Scorsese,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf of Wall Street,2013,English,USA,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000
1,Color,Shane Black,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,English,USA,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000
2,color,Quentin Tarantino,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,English,USA,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000
3,Color,Kenneth Lonergan,186,46495.0,Drama,Margaret,2011,English,usa,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0
4,Color,Peter Jackson,186,258355354.0,Adventure|Fantasy,The Hobbit: The Desolation of Smaug,2013,English,USA,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000


# Melakukan drop pada kolom yang tidak diperlukan

In [5]:
# Inspect movie_facebook_likes (up to the first 100 rows) before deciding whether to drop it.
dataset['movie_facebook_likes'].head(100)

0     138000
1      95000
2     114000
3          0
4      83000
       ...  
94     83000
95         0
96     65000
97     82000
98     16000
Name: movie_facebook_likes, Length: 99, dtype: int64

In [6]:
# Inspect color (up to the first 100 rows) before deciding whether to drop it.
dataset['color'].head(100)

0      Color
1      Color
2     color 
3      Color
4      Color
       ...  
94     Color
95     Color
96     Color
97     Color
98     Color
Name: color, Length: 99, dtype: object

In [7]:
# Inspect actors (up to the first 100 rows) before deciding whether to drop it.
dataset['actors'].head(100)

0     Leonardo DiCaprio,Matthew McConaughey,Jon Favreau
1             Robert Downey Jr.,Jon Favreau,Don Cheadle
2             Craig Stark,Jennifer Jason Leigh,Zoë Bell
3           Matt Damon,Kieran Culkin,John Gallagher Jr.
4                 Aidan Turner,Adam Brown,James Nesbitt
                            ...                        
94         Quvenzhané Wallis,Scoot McNairy,Taran Killam
95                     Mark Addy,Atom Egoyan,Paul Gross
96               Tom Hanks,Chris Mulkey,Michael Chernus
97                   Brad Pitt,Logan Lerman,Jim Parrack
98        Johnny Cannizzaro,Steve Schirripa,Scott Vance
Name: actors, Length: 99, dtype: object

In [8]:
# Build the working frame without the three columns that are not needed;
# the raw `dataset` frame itself is left unchanged.
unused_columns = ['movie_facebook_likes', 'color', 'actors']
dataset_drop = dataset.drop(columns=unused_columns)

In [9]:
# Confirm the remaining columns after the drop.
dataset_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   director_name  88 non-null     object 
 1   duration       99 non-null     int64  
 2   gross          91 non-null     float64
 3   genres         98 non-null     object 
 4   movie_title    99 non-null     object 
 5   title_year     99 non-null     int64  
 6   language       99 non-null     object 
 7   country        99 non-null     object 
 8   budget         95 non-null     float64
 9   imdb_score     99 non-null     float64
dtypes: float64(3), int64(2), object(5)
memory usage: 7.9+ KB


# Mengecek data N/A dan memutuskan apa yang akan dilakukan terhadap data yang N/A

In [10]:
# Count missing (NaN) values per column.
dataset_drop.isna().sum()

director_name    11
duration          0
gross             8
genres            1
movie_title       0
title_year        0
language          0
country           0
budget            4
imdb_score        0
dtype: int64

## Kolom director_name

In [11]:
# List the distinct director_name values to spot missing or placeholder entries.
dataset_drop['director_name'].unique()

array(['Martin Scorsese', 'Shane Black', 'Quentin Tarantino',
       'Kenneth Lonergan', 'Peter Jackson', nan, 'Edward Hall',
       'Joss Whedon', 'Tom Tykwer', 'Null', 'Christopher Spencer',
       'Christopher Nolan', 'F. Gary Gray', 'Richard Linklater',
       'Michael Bay', 'Tom Hooper', 'Kathryn Bigelow', 'Ridley Scott',
       'Denis Villeneuve', 'Gnana Rajasekaran', 'Marc Webb', 'Nan',
       'Mike Leigh', 'Gore Verbinski', 'David Fincher', 'Bryan Singer',
       'Jay Oliva', 'Paul Thomas Anderson', 'Sam Mendes',
       'Michael Patrick King', 'Tate Taylor', 'Francis Lawrence',
       'Steven Spielberg', 'Guillaume Canet', 'Adam McKay', 'Zack Snyder',
       'Baz Luhrmann', 'Timur Bekmambetov', 'Justin Chadwick',
       'Oliver Stone', 'David Dobkin', 'Ryan Murphy', 'James Wan',
       'Derek Cianfrance', "Gavin O'Connor", 'Gary Ross',
       'Terrence Malick', 'Robert Zemeckis', 'Darren Aronofsky',
       'James Mangold', 'Daniel Espinosa', 'Walter Salles',
       'Angelina Jo

In [12]:
# Replace missing director names with a single-space placeholder string.
dataset_drop['director_name'] = dataset_drop['director_name'].fillna(' ')

In [13]:
# Re-check the distinct values after fillna.
dataset_drop['director_name'].unique()
# The literal strings 'Nan' and 'Null' apparently still appear among the values.

array(['Martin Scorsese', 'Shane Black', 'Quentin Tarantino',
       'Kenneth Lonergan', 'Peter Jackson', ' ', 'Edward Hall',
       'Joss Whedon', 'Tom Tykwer', 'Null', 'Christopher Spencer',
       'Christopher Nolan', 'F. Gary Gray', 'Richard Linklater',
       'Michael Bay', 'Tom Hooper', 'Kathryn Bigelow', 'Ridley Scott',
       'Denis Villeneuve', 'Gnana Rajasekaran', 'Marc Webb', 'Nan',
       'Mike Leigh', 'Gore Verbinski', 'David Fincher', 'Bryan Singer',
       'Jay Oliva', 'Paul Thomas Anderson', 'Sam Mendes',
       'Michael Patrick King', 'Tate Taylor', 'Francis Lawrence',
       'Steven Spielberg', 'Guillaume Canet', 'Adam McKay', 'Zack Snyder',
       'Baz Luhrmann', 'Timur Bekmambetov', 'Justin Chadwick',
       'Oliver Stone', 'David Dobkin', 'Ryan Murphy', 'James Wan',
       'Derek Cianfrance', "Gavin O'Connor", 'Gary Ross',
       'Terrence Malick', 'Robert Zemeckis', 'Darren Aronofsky',
       'James Mangold', 'Daniel Espinosa', 'Walter Salles',
       'Angelina Jo

In [14]:
# Re-count missing values per column after filling director_name.
dataset_drop.isna().sum()

director_name    0
duration         0
gross            8
genres           1
movie_title      0
title_year       0
language         0
country          0
budget           4
imdb_score       0
dtype: int64

In [15]:
# To be thorough, also map the literal placeholder strings 'Null' and 'Nan'
# to the same single-space marker; they are ordinary strings, so fillna left
# them in place.
placeholder_strings = ['Null', 'Nan']
dataset_drop['director_name'] = dataset_drop['director_name'].replace(placeholder_strings, ' ')

In [16]:
# Verify that 'Null' and 'Nan' no longer appear among the director names.
dataset_drop['director_name'].unique()

array(['Martin Scorsese', 'Shane Black', 'Quentin Tarantino',
       'Kenneth Lonergan', 'Peter Jackson', ' ', 'Edward Hall',
       'Joss Whedon', 'Tom Tykwer', 'Christopher Spencer',
       'Christopher Nolan', 'F. Gary Gray', 'Richard Linklater',
       'Michael Bay', 'Tom Hooper', 'Kathryn Bigelow', 'Ridley Scott',
       'Denis Villeneuve', 'Gnana Rajasekaran', 'Marc Webb', 'Mike Leigh',
       'Gore Verbinski', 'David Fincher', 'Bryan Singer', 'Jay Oliva',
       'Paul Thomas Anderson', 'Sam Mendes', 'Michael Patrick King',
       'Tate Taylor', 'Francis Lawrence', 'Steven Spielberg',
       'Guillaume Canet', 'Adam McKay', 'Zack Snyder', 'Baz Luhrmann',
       'Timur Bekmambetov', 'Justin Chadwick', 'Oliver Stone',
       'David Dobkin', 'Ryan Murphy', 'James Wan', 'Derek Cianfrance',
       "Gavin O'Connor", 'Gary Ross', 'Terrence Malick',
       'Robert Zemeckis', 'Darren Aronofsky', 'James Mangold',
       'Daniel Espinosa', 'Walter Salles', 'Angelina Jolie Pitt',
       'Set

## Kolom duration

In [17]:
# Check the duration column; it has no missing values, so go straight to looking for outliers.
fig = px.box(dataset_drop, y='duration')
fig.show()

In [18]:
# List the distinct duration values to see the suspicious ones explicitly.
dataset_drop['duration'].unique()

array([240, 195, 187, 186, 183, -50, 180, 173, 172, 158, 170, 169, 167,
       165, 580, 164, 157, 156, 154, 153, 151, 150, 650, 149, 148, 147,
       146, 144, 143, 142, 141, 140, 139, 138, 137, 136, 135, 134,   5],
      dtype=int64)

In [19]:
# Convert every negative value to positive, because a duration cannot be negative.
# NOTE(review): this assumes a negative duration is only a sign error — confirm against the data source.
# Only the sign is changed; values such as 5, 580 and 650 remain in the column afterwards.
dataset_drop['duration'] = dataset_drop['duration'].abs()

In [20]:
# Verify that no negative duration remains.
dataset_drop['duration'].unique()

array([240, 195, 187, 186, 183,  50, 180, 173, 172, 158, 170, 169, 167,
       165, 580, 164, 157, 156, 154, 153, 151, 150, 650, 149, 148, 147,
       146, 144, 143, 142, 141, 140, 139, 138, 137, 136, 135, 134,   5],
      dtype=int64)

## Kolom gross

In [21]:
# Check the gross column.
dataset_drop['gross'].unique()

array([1.16866727e+08, 4.08992272e+08, 5.41161910e+07, 4.64950000e+04,
       2.58355354e+08, 3.30249062e+08, 3.03001229e+08,            nan,
       6.23279547e+08, 2.70985800e+07, 1.02515793e+08, 5.96961760e+07,
       1.87991439e+08, 1.61029270e+08, 2.53592000e+07, 1.62804648e+08,
       2.45428137e+08, 4.48130642e+08, 2.55108370e+08, 1.48775460e+08,
       9.57207160e+07, 1.05219735e+08, 1.83635922e+08, 3.52358779e+08,
       6.09628780e+07, 2.62030663e+08, 2.28430993e+08, 6.50070450e+07,
       1.82204440e+08, 3.95850000e+06, 8.92899100e+07, 1.67735396e+08,
       2.33914986e+08, 2.92568851e+08, 8.09331800e+06, 2.00074175e+08,
       4.07197282e+08, 9.53289370e+07, 1.69705587e+08, 4.24645577e+08,
       7.98833590e+07, 5.28224180e+07, 4.12290000e+04, 1.63772740e+07,
       1.54985087e+08, 2.17531200e+06, 2.91021565e+08, 3.04360277e+08,
       1.44812796e+08, 7.23060650e+07, 2.02853933e+08, 4.07999255e+08,
       4.58991599e+08, 8.32474800e+06, 4.73075500e+07, 4.71050850e+07,
      

In [22]:
# Replace missing values in the gross column with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning on recent pandas and does not update `dataset_drop`
# when Copy-on-Write is enabled.
dataset_drop['gross'] = dataset_drop['gross'].fillna(0)

In [23]:
# Verify that NaN in gross has been replaced by 0.
dataset_drop['gross'].unique()

array([1.16866727e+08, 4.08992272e+08, 5.41161910e+07, 4.64950000e+04,
       2.58355354e+08, 3.30249062e+08, 3.03001229e+08, 0.00000000e+00,
       6.23279547e+08, 2.70985800e+07, 1.02515793e+08, 5.96961760e+07,
       1.87991439e+08, 1.61029270e+08, 2.53592000e+07, 1.62804648e+08,
       2.45428137e+08, 4.48130642e+08, 2.55108370e+08, 1.48775460e+08,
       9.57207160e+07, 1.05219735e+08, 1.83635922e+08, 3.52358779e+08,
       6.09628780e+07, 2.62030663e+08, 2.28430993e+08, 6.50070450e+07,
       1.82204440e+08, 3.95850000e+06, 8.92899100e+07, 1.67735396e+08,
       2.33914986e+08, 2.92568851e+08, 8.09331800e+06, 2.00074175e+08,
       4.07197282e+08, 9.53289370e+07, 1.69705587e+08, 4.24645577e+08,
       7.98833590e+07, 5.28224180e+07, 4.12290000e+04, 1.63772740e+07,
       1.54985087e+08, 2.17531200e+06, 2.91021565e+08, 3.04360277e+08,
       1.44812796e+08, 7.23060650e+07, 2.02853933e+08, 4.07999255e+08,
       4.58991599e+08, 8.32474800e+06, 4.73075500e+07, 4.71050850e+07,
      

## Kolom genres

In [24]:
# Check the genres column.
dataset_drop['genres'].unique()

array(['Biography|Comedy|Crime|Drama', 'Action|Adventure|Sci-Fi',
       'Crime|Drama|Mystery|Thriller|Western', 'Drama',
       'Adventure|Fantasy', 'Drama|Romance', 'Drama|Sci-Fi',
       'Crime|Drama|Mystery|Thriller', nan, 'Adventure|Drama|Sci-Fi',
       'Biography|Crime|Drama|History|Music', 'Drama|Western',
       'Action|Thriller', 'Drama|Musical|Romance',
       'Drama|History|Thriller', 'Action|Adventure|Drama|History',
       'Adventure|Drama|Thriller|Western', 'Biography|Drama|History',
       'Action|Adventure|Fantasy', 'Action|Adventure|Drama',
       'Biography|Drama|History|War', 'Action|Adventure|Western',
       'Action|Adventure|Fantasy|Sci-Fi|Thriller',
       'Action|Animation|Crime|Sci-Fi|Thriller',
       'Action|Adventure|Sci-Fi|Thriller',
       'Comedy|Crime|Drama|Mystery|Romance', 'Action|Adventure|Thriller',
       'Comedy|Drama|Romance', 'Adventure|Sci-Fi|Thriller', 'Drama|War',
       'Action|Drama|Thriller|War', 'Crime|Drama|Thriller', 'Comedy',
       'A

In [25]:
# Lift the row-display limit so the whole genres column is shown.
# Note: set_option changes a global pandas option, so it also affects every later cell.
pd.set_option('display.max_rows', None)
dataset_drop['genres']

0                 Biography|Comedy|Crime|Drama
1                      Action|Adventure|Sci-Fi
2         Crime|Drama|Mystery|Thriller|Western
3                                        Drama
4                            Adventure|Fantasy
5                      Action|Adventure|Sci-Fi
6                            Adventure|Fantasy
7                                Drama|Romance
8                      Action|Adventure|Sci-Fi
9                      Action|Adventure|Sci-Fi
10                                Drama|Sci-Fi
11                Crime|Drama|Mystery|Thriller
12                                         NaN
13                      Adventure|Drama|Sci-Fi
14         Biography|Crime|Drama|History|Music
15                                       Drama
16                               Drama|Western
17                     Action|Adventure|Sci-Fi
18                             Action|Thriller
19                           Adventure|Fantasy
20                       Drama|Musical|Romance
21           

In [26]:
# Replace NaN in the genres column with "Other".
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning on recent pandas and does not update `dataset_drop`
# when Copy-on-Write is enabled.
dataset_drop['genres'] = dataset_drop['genres'].fillna('Other')

In [27]:
# Show the whole genres column again to verify that NaN became "Other".
# display.max_rows was already set to None in an earlier cell; it is set again here.
pd.set_option('display.max_rows', None)
dataset_drop['genres']

0                 Biography|Comedy|Crime|Drama
1                      Action|Adventure|Sci-Fi
2         Crime|Drama|Mystery|Thriller|Western
3                                        Drama
4                            Adventure|Fantasy
5                      Action|Adventure|Sci-Fi
6                            Adventure|Fantasy
7                                Drama|Romance
8                      Action|Adventure|Sci-Fi
9                      Action|Adventure|Sci-Fi
10                                Drama|Sci-Fi
11                Crime|Drama|Mystery|Thriller
12                                       Other
13                      Adventure|Drama|Sci-Fi
14         Biography|Crime|Drama|History|Music
15                                       Drama
16                               Drama|Western
17                     Action|Adventure|Sci-Fi
18                             Action|Thriller
19                           Adventure|Fantasy
20                       Drama|Musical|Romance
21           

## Mengecek kolom movie_title

In [28]:
# List the distinct movie titles.
dataset_drop['movie_title'].unique()

array(['The Wolf of Wall Street', 'Iron Man 3', 'The Hateful Eight',
       'Margaret', 'The Hobbit: The Desolation of Smaug',
       'Batman v Superman: Dawn of Justice',
       'The Hobbit: An Unexpected Journey', 'Restless', 'The Avengers',
       'Cloud Atlas', 'The Girl with the Dragon Tattoo', 'Son of God',
       'Interstellar', 'Straight Outta Compton', 'Boyhood',
       'Django Unchained', 'Transformers: Age of Extinction',
       'The Dark Knight Rises',
       'The Hobbit: The Battle of the Five Armies', 'Les Misérables',
       'Zero Dark Thirty', 'Robin Hood', 'The Revenant',
       'Transformers: Dark of the Moon', 'Prisoners', 'Ramanujan',
       'The Amazing Spider-Man', 'The Martian', 'Exodus: Gods and Kings',
       'Lincoln', 'Mr. Turner', 'The Lone Ranger', 'Gone Girl',
       'X-Men: Days of Future Past',
       'Batman: The Dark Knight Returns, Part 2', 'Inception',
       'Inherent Vice', 'Spectre', 'Captain America: Civil War',
       'Sex and the City 2', 'The 

In [29]:
# Show the whole movie_title column row by row.
# display.max_rows was already set to None in an earlier cell; it is set again here.
pd.set_option('display.max_rows', None)
dataset_drop['movie_title']

0                         The Wolf of Wall Street
1                                      Iron Man 3
2                               The Hateful Eight
3                                        Margaret
4             The Hobbit: The Desolation of Smaug
5              Batman v Superman: Dawn of Justice
6               The Hobbit: An Unexpected Journey
7                                        Restless
8                                    The Avengers
9                                    The Avengers
10                                    Cloud Atlas
11                The Girl with the Dragon Tattoo
12                                     Son of God
13                                   Interstellar
14                         Straight Outta Compton
15                                        Boyhood
16                               Django Unchained
17                Transformers: Age of Extinction
18                          The Dark Knight Rises
19      The Hobbit: The Battle of the Five Armies


## Mengecek kolom title_year

In [30]:
# Box plot of title_year to look for outlying years.
fig = px.box(dataset_drop, y='title_year')
fig.show()

In [31]:
# List the distinct title_year values.
dataset_drop['title_year'].unique()

array([2013, 2015, 2011,  202, 2012, 2014, 2010, 2016,  205], dtype=int64)

In [32]:
# There are odd values, 202 and 205; we assume they are typos for 2002 and 2005,
# so those values are replaced here.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning on recent pandas and does not update `dataset_drop`
# when Copy-on-Write is enabled.
dataset_drop['title_year'] = dataset_drop['title_year'].replace({202: 2002, 205: 2005})


In [33]:
# Verify that 202 and 205 were replaced.
dataset_drop['title_year'].unique()

array([2013, 2015, 2011, 2002, 2012, 2014, 2010, 2016, 2005], dtype=int64)

## Mengecek kolom language

In [34]:
# List the distinct language values.
dataset_drop['language'].unique()

array(['English'], dtype=object)

## Mengecek kolom country

In [35]:
# List the distinct country values to spot inconsistent spellings.
dataset_drop['country'].unique()

array(['USA', 'usa', 'UK', 'Germany', 'New Zealand', 'India',
       'United States', 'France', 'Australia', 'Czech Republic',
       'Kyrgyzstan', 'Canada'], dtype=object)

In [37]:
# USA, usa and United States must be merged; here all of them are changed to USA.
# This step upper-cases every country value (which merges 'usa' into 'USA');
# 'UNITED STATES' is mapped to 'USA' in a later cell.
dataset_drop['country'] = dataset_drop['country'].str.upper()

In [38]:
# Check the distinct country values after upper-casing.
dataset_drop['country'].unique()

array(['USA', 'UK', 'GERMANY', 'NEW ZEALAND', 'INDIA', 'UNITED STATES',
       'FRANCE', 'AUSTRALIA', 'CZECH REPUBLIC', 'KYRGYZSTAN', 'CANADA'],
      dtype=object)

In [40]:
# Merge the remaining spelling 'UNITED STATES' into 'USA'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning on recent pandas and does not update `dataset_drop`
# when Copy-on-Write is enabled.
dataset_drop['country'] = dataset_drop['country'].replace({'UNITED STATES': 'USA'})

In [41]:
# Verify that 'UNITED STATES' no longer appears.
dataset_drop['country'].unique()

array(['USA', 'UK', 'GERMANY', 'NEW ZEALAND', 'INDIA', 'FRANCE',
       'AUSTRALIA', 'CZECH REPUBLIC', 'KYRGYZSTAN', 'CANADA'],
      dtype=object)

## Mengecek kolom budget

In [42]:
# List the distinct budget values, including NaN.
dataset_drop['budget'].unique()

array([1.000e+08, 2.000e+08, 4.400e+07, 1.400e+07, 2.250e+08, 2.500e+08,
       1.800e+08,       nan, 2.200e+08, 1.020e+08, 9.000e+07, 2.200e+07,
       1.650e+08, 2.800e+07, 4.000e+06, 2.100e+08, 6.100e+07, 4.000e+07,
       1.350e+08, 1.950e+08, 4.600e+07, 2.300e+08, 1.080e+08, 1.400e+08,
       6.500e+07, 2.150e+08, 3.500e+06, 1.600e+08, 2.000e+07, 2.450e+08,
       2.500e+07, 1.300e+08, 6.600e+07, 5.000e+07, 2.550e+07, 3.200e+07,
       1.780e+08, 1.050e+08, 1.735e+04, 7.800e+07, 3.500e+07, 4.500e+07,
       6.000e+07, 1.900e+08, 1.500e+07, 8.500e+07, 3.000e+07, 3.100e+07,
       1.250e+08, 8.000e+07, 1.200e+08, 1.700e+08, 7.500e+07, 7.000e+07,
       1.400e+06, 5.500e+07, 6.800e+07])

In [43]:
# Replace missing values in the budget column with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# raises a FutureWarning on recent pandas and does not update `dataset_drop`
# when Copy-on-Write is enabled.
dataset_drop['budget'] = dataset_drop['budget'].fillna(0)

In [44]:
# Verify that NaN in budget has been replaced by 0.
dataset_drop['budget'].unique()

array([1.000e+08, 2.000e+08, 4.400e+07, 1.400e+07, 2.250e+08, 2.500e+08,
       1.800e+08, 0.000e+00, 2.200e+08, 1.020e+08, 9.000e+07, 2.200e+07,
       1.650e+08, 2.800e+07, 4.000e+06, 2.100e+08, 6.100e+07, 4.000e+07,
       1.350e+08, 1.950e+08, 4.600e+07, 2.300e+08, 1.080e+08, 1.400e+08,
       6.500e+07, 2.150e+08, 3.500e+06, 1.600e+08, 2.000e+07, 2.450e+08,
       2.500e+07, 1.300e+08, 6.600e+07, 5.000e+07, 2.550e+07, 3.200e+07,
       1.780e+08, 1.050e+08, 1.735e+04, 7.800e+07, 3.500e+07, 4.500e+07,
       6.000e+07, 1.900e+08, 1.500e+07, 8.500e+07, 3.000e+07, 3.100e+07,
       1.250e+08, 8.000e+07, 1.200e+08, 1.700e+08, 7.500e+07, 7.000e+07,
       1.400e+06, 5.500e+07, 6.800e+07])

In [46]:
# Box plot of budget to look for outliers (the 0 placeholders filled in above are included).
fig = px.box(dataset_drop, y='budget')
fig.show()

## Mengecek kolom imdb_score

In [47]:
# List the distinct imdb_score values.
dataset_drop['imdb_score'].unique()

array([ 8.2,  7.2,  7.9,  6.5,  6.9,  8.1, -7.5,  7.8,  5.6,  8.6,  8. ,
        8.5,  5.7,  7.5,  7.6,  7.4,  6.7,  6.3,  7. ,  6.1,  6.8,  8.4,
        8.8,  4.3,  7.1,  7.3,  3. ,  6. ,  5.8,  5.3,  6.4,  6.6, -1.2,
        5.9,  8.7])

In [49]:
# There are negative values, whereas the lowest possible rating should be 0;
# as before, abs() is used to remove the negative sign.
# NOTE(review): this assumes a negative score is only a sign error — confirm against the data source.

dataset_drop['imdb_score'] = dataset_drop['imdb_score'].abs()

In [50]:
# Verify that no negative imdb_score remains.
dataset_drop['imdb_score'].unique()

array([8.2, 7.2, 7.9, 6.5, 6.9, 8.1, 7.5, 7.8, 5.6, 8.6, 8. , 8.5, 5.7,
       7.6, 7.4, 6.7, 6.3, 7. , 6.1, 6.8, 8.4, 8.8, 4.3, 7.1, 7.3, 3. ,
       6. , 5.8, 5.3, 6.4, 6.6, 1.2, 5.9, 8.7])

# Mengecek dataset baru yang sudah dicleansing

In [51]:
# Render the whole cleaned frame as a text grid: column labels become the
# header row and each DataFrame row becomes one table row.
header = list(dataset_drop.columns)
table = dataset_drop.to_numpy().tolist()
rendered_grid = tbl(table, headers=header, tablefmt='fancy_grid')
print(rendered_grid)

╒══════════════════════╤════════════╤══════════════════╤══════════════════════════════════════════╤═════════════════════════════════════════════╤══════════════╤════════════╤════════════════╤══════════════╤══════════════╕
│ director_name        │   duration │            gross │ genres                                   │ movie_title                                 │   title_year │ language   │ country        │       budget │   imdb_score │
╞══════════════════════╪════════════╪══════════════════╪══════════════════════════════════════════╪═════════════════════════════════════════════╪══════════════╪════════════╪════════════════╪══════════════╪══════════════╡
│ Martin Scorsese      │        240 │      1.16867e+08 │ Biography|Comedy|Crime|Drama             │ The Wolf of Wall Street                     │         2013 │ English    │ USA            │     1e+08    │          8.2 │
├──────────────────────┼────────────┼──────────────────┼──────────────────────────────────────────┼─────────────────

In [55]:
# List the column names of the cleaned frame.
dataset_drop.columns.tolist()

['director_name',
 'duration',
 'gross',
 'genres',
 'movie_title',
 'title_year',
 'language',
 'country',
 'budget',
 'imdb_score']

# Melakukan EDA

In [56]:
# Count the number of films per year as a two-column frame
# (title_year, count), ordered by descending count as value_counts returns it.
year_counts = dataset_drop['title_year'].value_counts()
films_per_year = year_counts.rename_axis('title_year').reset_index(name='count')

# Build the bar chart: one bar per year, bar height = number of films.
axis_labels = {'count': 'Jumlah Film', 'title_year': 'Tahun'}
fig = px.bar(
    films_per_year,
    x='title_year',
    y='count',
    labels=axis_labels,
    title='Jumlah Film per Tahun',
)

# Display the chart.
fig.show()
