In [170]:
import pandas as pd
import numpy as np

In [171]:
# Load the raw IMDb export from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="imdb-movies-dataset.csv")
# Bare expression: the notebook renders a truncated preview of the frame.
df

Unnamed: 0,Poster,Title,Year,Certificate,Duration (min),Genre,Rating,Metascore,Director,Cast,Votes,Description,Review Count,Review Title,Review
0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023.0,R,115.0,"Comedy, Drama, Romance",6.4,67.0,Michael Showalter,"Anne Hathaway, Nicholas Galitzine, Ella Rubin,...",28744,"Solène, a 40-year-old single mom, begins an un...",166,Hypocrisy as an idea,"This film, as well as the reaction to it, is a..."
1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023.0,PG-13,145.0,"Action, Adventure, Sci-Fi",7.3,66.0,Wes Ball,"Owen Teague, Freya Allan, Kevin Durand, Peter ...",22248,"Many years after the reign of Caesar, a young ...",183,A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a..."
2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023.0,PG-13,97.0,"Biography, Comedy, History",5.5,42.0,Jerry Seinfeld,"Isaac Bae, Jerry Seinfeld, Chris Rickett, Rach...",18401,"In 1963 Michigan, business rivals Kellogg's an...",333,not funny,Pretty much the worst criticism you can lay on...
3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023.0,PG-13,126.0,"Action, Comedy, Drama",7.3,73.0,David Leitch,"Ryan Gosling, Emily Blunt, Aaron Taylor-Johnso...",38953,A down-and-out stuntman must find the missing ...,384,Everything you needed and more!,Just got out of the Austin premier at SXSW and...
4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023.0,R,131.0,"Drama, Romance, Sport",7.7,82.0,Luca Guadagnino,"Zendaya, Mike Faist, Josh O'Connor, Darnell Ap...",32517,"Tashi, a former tennis prodigy turned coach, t...",194,"Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,https://m.media-amazon.com/images/M/MV5BMzg5MW...,The Greatest Show on Earth,2020.0,U,152.0,"Drama, Family, Romance",6.5,76.0,Cecil B. DeMille,"James Stewart, Charlton Heston, Betty Hutton, ...",16078,"The dramatic lives of trapeze artists, a clown...",128,"Hey, doesn't anyone remember Last Emperor?",It constantly amazes me that people carp that ...
9996,https://m.media-amazon.com/images/M/MV5BYzA0ZG...,Berserk: Ougon Jidai-hen I - Haou no Tamago,2020.0,,76.0,"Animation, Action, Adventure",7.5,,Toshiyuki Kubooka,"Hiroaki Iwanaga, Carrie Keranen, Takahiro Saku...",14300,A lone sellsword named Guts gets recruited int...,12,Masterfully directed climatic epic saga,Few stories can capture your mind and soul in ...
9997,https://m.media-amazon.com/images/M/MV5BM2U1Mj...,Is-slottet,2020.0,,78.0,"Mystery, Drama",6.5,,Per Blom,"Line Storesund, Hilde Nyeggen Martinsen, Meret...",740,A couple of twelve-year-old Norwegian girls st...,4,Beautiful Film,"This film might not be to everyone's taste, it..."
9998,https://m.media-amazon.com/images/M/MV5BMTAwOD...,Loving Pablo,2020.0,A,123.0,"Biography, Crime, Drama",6.4,42.0,Fernando León de Aranoa,"Javier Bardem, Penélope Cruz, Peter Sarsgaard,...",22447,A journalist strikes up a romantic relationshi...,84,That film should be in Spanish,Why anyone (the director?) made Spanish actors...


In [172]:
# Print each column's dtype and non-null count to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Poster          10000 non-null  object 
 1   Title           10000 non-null  object 
 2   Year            9850 non-null   float64
 3   Certificate     7370 non-null   object 
 4   Duration (min)  9664 non-null   float64
 5   Genre           9993 non-null   object 
 6   Rating          9596 non-null   float64
 7   Metascore       7555 non-null   float64
 8   Director        9995 non-null   object 
 9   Cast            9961 non-null   object 
 10  Votes           9596 non-null   object 
 11  Description     10000 non-null  object 
 12  Review Count    9999 non-null   object 
 13  Review Title    9483 non-null   object 
 14  Review          9484 non-null   object 
dtypes: float64(4), object(11)
memory usage: 1.1+ MB


# Preprocessing needed on each column

- drop poster, metascore
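- movies missing ratings are also missing votes, so drop those movies, as they make up less than 5% of the data
- also drop rows missing Duration (min), Cast or Review Count (Duration and Review Count must be complete before the integer conversions below)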
- rename Duration (min) to Duration
- rename Review Count to Review_count
- rename Review Title to Review_title
- fill missing years with 0, then convert year to int16 to reduce memory usage
- convert duration to int16
- remove "," from Votes and convert into int32
- remove "," from Review Count and convert into int32
- fill certificate, genre, review_title, review null values with N/A

In [173]:
# Remove the Poster and Metascore columns, as listed in the preprocessing plan.
df.drop(["Poster", "Metascore"], axis="columns", inplace=True)

In [174]:
# Keep only the rows that have a value in every one of these columns
# (the original column names are used because the rename happens afterwards).
df.dropna(axis="index", subset=["Rating", "Duration (min)", "Cast", "Review Count"], inplace=True)

In [175]:
# Shorten the three column names that contain spaces or units.
df.rename(
    mapper={"Duration (min)": "Duration", "Review Count": "Review_count", "Review Title": "Review_title"},
    axis="columns",
    inplace=True,
)

In [176]:
# Missing release years become 0 so the column can be cast to an integer dtype.
# The result is assigned back rather than calling fillna(inplace=True) on the
# df["Year"] selection: that chained form emits a FutureWarning on pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled, which would make the
# integer cast that follows fail on the remaining NaN values.
# NOTE(review): the 0 placeholder is visible in summary statistics (min Year = 0).
df["Year"] = df["Year"].fillna(0)

In [177]:
# Store the year as a 16-bit integer to reduce memory usage.
df["Year"] = df["Year"].astype("int16")

In [178]:
# Store the running time (minutes) as a 16-bit integer.
df["Duration"] = df["Duration"].astype("int16")

In [179]:
# Votes arrive as text with "," thousands separators: strip them (literal
# match, not a regex) and store the counts as 32-bit integers.
votes_text = df["Votes"].str.replace(",", "", regex=False)
df["Votes"] = votes_text.astype("int32")

In [180]:
# Review counts arrive as text with "," thousands separators: strip them
# (literal match, not a regex) and store the counts as 32-bit integers.
review_count_text = df["Review_count"].str.replace(",", "", regex=False)
df["Review_count"] = review_count_text.astype("int32")

In [181]:
# Replace missing values in the descriptive text columns with an "N/A" placeholder.
# Genre is included to match the preprocessing plan; the raw file contains rows
# with a missing genre, so it is filled explicitly rather than relying on those
# rows having been removed by the earlier row drop.
df.fillna({
    "Certificate": "N/A",
    "Genre": "N/A",
    "Review_title": "N/A",
    "Review": "N/A"
}, inplace=True)

In [182]:
# Renumber the rows 0..n-1 after the row drops, discarding the old labels.
df.index = pd.RangeIndex(len(df))

In [183]:
# Re-check dtypes and non-null counts after cleaning; every column should now be fully populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9579 entries, 0 to 9578
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         9579 non-null   object 
 1   Year          9579 non-null   int16  
 2   Certificate   9579 non-null   object 
 3   Duration      9579 non-null   int16  
 4   Genre         9579 non-null   object 
 5   Rating        9579 non-null   float64
 6   Director      9579 non-null   object 
 7   Cast          9579 non-null   object 
 8   Votes         9579 non-null   int32  
 9   Description   9579 non-null   object 
 10  Review_count  9579 non-null   int32  
 11  Review_title  9579 non-null   object 
 12  Review        9579 non-null   object 
dtypes: float64(1), int16(2), int32(2), object(8)
memory usage: 785.9+ KB


In [184]:
# Summary statistics for the numeric columns, rounded to whole numbers.
# Year statistics include the rows whose missing year was filled with 0.
df.describe().round(decimals=0)

Unnamed: 0,Year,Duration,Rating,Votes,Review_count
count,9579.0,9579.0,9579.0,9579.0,9579.0
mean,1977.0,109.0,6.0,96513.0,278.0
std,241.0,23.0,1.0,182126.0,467.0
min,0.0,45.0,1.0,6.0,0.0
25%,1997.0,95.0,6.0,11413.0,68.0
50%,2011.0,105.0,6.0,37229.0,147.0
75%,2021.0,118.0,7.0,100799.0,311.0
max,2025.0,746.0,10.0,2894940.0,10078.0


In [185]:
# Write the cleaned frame next to the notebook, without the row index column.
df.to_csv(path_or_buf="Cleaned_movies_data.csv", index=False)

### Based on
- Overall
- Artist
- Director
- Genre
- Year wise -> multiselect if possible