### Prepare Dataset

In [7]:
import pandas as pd

# Source data: https://www.kaggle.com/datasets/utsh0dey/25k-movie-dataset
# The mapping keys must match the raw CSV headers verbatim (note the spellings
# 'Generes' and 'Plot Kyeword'); the values are the snake_case names used in
# the rest of the notebook. Columns not listed here keep their original names.
movie_df = pd.read_csv('datasets/IMDB_dataset_25k.csv').rename(
    columns={
        'movie title': 'title',
        'Run Time': 'time',
        'Rating': 'rating',
        'User Rating': 'user_rating',
        'Generes': 'genres',
        'Overview': 'overview',
        'Plot Kyeword': 'keywords',
        'Director': 'director',
        'Top 5 Casts': 'top_5_casts',
        'Writer': 'writer',
    }
)

# Preview the first rows to confirm the renamed columns.
movie_df.head()

Unnamed: 0,title,time,rating,user_rating,genres,overview,keywords,director,top_5_casts,writer,year,path
0,Top Gun: Maverick,"$170,000,000 (estimated)",8.6,187K,"['Action', 'Drama']",After more than thirty years of service as one...,"['fighter jet', 'sequel', 'u.s. navy', 'fighte...",Joseph Kosinski,"['Jack Epps Jr.', 'Peter Craig', 'Tom Cruise',...",Jim Cash,-2022,/title/tt1745960/
1,Jurassic World Dominion,2 hours 27 minutes,6.0,56K,"['Action', 'Adventure', 'Sci-Fi']",Four years after the destruction of Isla Nubla...,"['dinosaur', 'jurassic park', 'tyrannosaurus r...",Colin Trevorrow,"['Colin Trevorrow', 'Derek Connolly', 'Chris P...",Emily Carmichael,-2022,/title/tt8041270/
2,Top Gun,"$15,000,000 (estimated)",6.9,380K,"['Action', 'Drama']",As students at the United States Navy's elite ...,"['pilot', 'male camaraderie', 'u.s. navy', 'gr...",Tony Scott,"['Jack Epps Jr.', 'Ehud Yonay', 'Tom Cruise', ...",Jim Cash,-1986,/title/tt0092099/
3,Lightyear,"$71,101,257",5.2,32K,"['Animation', 'Action', 'Adventure']",While spending years attempting to return home...,"['galaxy', 'spaceship', 'robot', 'rocket', 'sp...",Angus MacLane,"['Jason Headley', 'Matthew Aldrich', 'Chris Ev...",Angus MacLane,-2022,/title/tt10298810/
4,Spiderhead,not-released,5.4,23K,"['Action', 'Crime', 'Drama']","In the near future, convicts are offered the c...","['discover', 'medical', 'test', 'reality', 'fi...",Joseph Kosinski,"['Rhett Reese', 'Paul Wernick', 'Chris Hemswor...",George Saunders,-2022,/title/tt9783600/


In [8]:
# Count the missing (NaN) values in each column.
movie_df.isna().sum()

title            0
time             0
rating           0
user_rating      0
genres           0
overview       244
keywords         0
director         0
top_5_casts      0
writer           0
year           778
path             0
dtype: int64

In [9]:
# Keep only the movies that have an overview.
# The explicit .copy() makes the result an independent frame, so assigning to
# its columns later does not raise pandas' SettingWithCopyWarning.
movie_df = movie_df[movie_df['overview'].notna()].copy()

# Re-count the missing values per column to confirm 'overview' is now complete.
movie_df.isna().sum()

title            0
time             0
rating           0
user_rating      0
genres           0
overview         0
keywords         0
director         0
top_5_casts      0
writer           0
year           746
path             0
dtype: int64

In [10]:
# Drop the 'time' column: in the raw data it mixes runtimes with unrelated
# values (budget figures, 'not-released'), so it is ambiguous and not useful.
# Also strip the leading '-' characters from the 'year' strings; missing years
# stay NaN.
#
# A new frame is built instead of mutating in place:
#   * errors='ignore' lets this cell be re-run after 'time' is already gone
#     (an in-place drop would raise KeyError on the second run);
#   * .assign() avoids writing into a filtered slice of the original frame,
#     which would trigger pandas' SettingWithCopyWarning.
movie_df = (
    movie_df
    .drop(columns=['time'], errors='ignore')
    .assign(year=lambda frame: frame['year'].str.lstrip('-'))
)

In [11]:
# Display the cleaned frame ('time' dropped, leading '-' stripped from 'year').
movie_df

Unnamed: 0,title,rating,user_rating,genres,overview,keywords,director,top_5_casts,writer,year,path
0,Top Gun: Maverick,8.6,187K,"['Action', 'Drama']",After more than thirty years of service as one...,"['fighter jet', 'sequel', 'u.s. navy', 'fighte...",Joseph Kosinski,"['Jack Epps Jr.', 'Peter Craig', 'Tom Cruise',...",Jim Cash,2022,/title/tt1745960/
1,Jurassic World Dominion,6,56K,"['Action', 'Adventure', 'Sci-Fi']",Four years after the destruction of Isla Nubla...,"['dinosaur', 'jurassic park', 'tyrannosaurus r...",Colin Trevorrow,"['Colin Trevorrow', 'Derek Connolly', 'Chris P...",Emily Carmichael,2022,/title/tt8041270/
2,Top Gun,6.9,380K,"['Action', 'Drama']",As students at the United States Navy's elite ...,"['pilot', 'male camaraderie', 'u.s. navy', 'gr...",Tony Scott,"['Jack Epps Jr.', 'Ehud Yonay', 'Tom Cruise', ...",Jim Cash,1986,/title/tt0092099/
3,Lightyear,5.2,32K,"['Animation', 'Action', 'Adventure']",While spending years attempting to return home...,"['galaxy', 'spaceship', 'robot', 'rocket', 'sp...",Angus MacLane,"['Jason Headley', 'Matthew Aldrich', 'Chris Ev...",Angus MacLane,2022,/title/tt10298810/
4,Spiderhead,5.4,23K,"['Action', 'Crime', 'Drama']","In the near future, convicts are offered the c...","['discover', 'medical', 'test', 'reality', 'fi...",Joseph Kosinski,"['Rhett Reese', 'Paul Wernick', 'Chris Hemswor...",George Saunders,2022,/title/tt9783600/
...,...,...,...,...,...,...,...,...,...,...,...
24397,Delicatessen,7.6,85K,"['Comedy', 'Crime']",Post-apocalyptic surrealist black comedy about...,"['surrealist', 'black comedy', 'human meat', '...",Marc Caro,"['Jean-Pierre Jeunet', 'Marc Caro', 'Gilles Ad...",Jean-Pierre Jeunet,1991,/title/tt0101700/
24398,Bitch Ass,5.5,52,"['Crime', 'Horror']",A gang initiation goes wrong when a group of f...,[],Bill Posley,"['Bill Posley', 'Teon Kelley', 'Tunde Laleye',...",Jonathan Colomb,2022,/title/tt13991504/
24399,Bullwhip,5.1,398,"['Crime', 'Romance', 'Western']","In order to avoid the hangman's noose, a cowbo...","['taming of the shrew', 'fur trader', 'busines...",Harmon Jones,"['Guy Madison', 'Rhonda Fleming', 'James Griff...",Adele Buffington,1958,/title/tt0051438/
24400,The Freshman,6.4,20K,"['Comedy', 'Crime']",An N.Y.C. film school student accepts a job wi...,"['endangered species', 'fish out of water', 'g...",Andrew Bergman,"['Marlon Brando', 'Matthew Broderick', 'Bruno ...",Andrew Bergman,1990,/title/tt0099615/


In [12]:
# Write the cleaned dataset to disk as CSV.
# NOTE: the row index is written too (pandas' default, spelled out here), so
# the file's first column is the unnamed original row index.
movie_df.to_csv(path_or_buf='datasets/IMDB_cleaned.csv', index=True)