## Importing the important libraries

In [96]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
colors = ['#235E72']

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [97]:
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,Movie Name,IMDb Rating,Popularity,Release Date,Box Office,Runtime,Genre,Director,Cast,Production Company,Country,Language
0,The Kerala Story Bangla,6.5,,,,12 minutes,"['Short', 'Drama']",['Satyajit Das'],[],[],,['Bengali']
1,Generation Z,,,,,,"['Horror', 'Comedy']",[],"['Chris Reilly', 'Ellora Torchia', 'Viola Pret...","['All3Media International', 'The Forge', 'Zwei...",,[]
2,Criminal Justice: Adhura Sach,7.7,,"August 26, 2022 (India)",,45 minutes,"['Crime', 'Drama', 'Mystery']",[],"['Pankaj Tripathi', 'Swastika Mukherjee', 'Pur...","['Applause Entertainment Ltd.', 'BBC Studios']",India,['Hindi']
3,Madhil Mel Kaadhal,,,,,,['Romance'],['Anjana Ali Khan'],"['Mugen Rao', 'Divya Bharathi', 'Sakshi Agarwal']",[],,['Tamil']
4,Saawariya,5.2,,"November 9, 2007 (India)","$18,525,631",2 hours 22 minutes,"['Drama', 'Musical', 'Romance']",['Sanjay Leela Bhansali'],"['Ranbir Kapoor', 'Sonam Kapoor', 'Salman Khan']","['SPE Films', 'SLB Films Pvt. Ltd.']",,['Hindi']


In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10004 entries, 0 to 10003
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Movie Name          9920 non-null   object 
 1   IMDb Rating         8671 non-null   float64
 2   Popularity          187 non-null    object 
 3   Release Date        6249 non-null   object 
 4   Box Office          2570 non-null   object 
 5   Runtime             7699 non-null   object 
 6   Genre               10004 non-null  object 
 7   Director            10004 non-null  object 
 8   Cast                10004 non-null  object 
 9   Production Company  10004 non-null  object 
 10  Country             5998 non-null   object 
 11  Language            10004 non-null  object 
dtypes: float64(1), object(11)
memory usage: 938.0+ KB


## Data Cleaning

In [99]:
# Droping the columns that have many null values or are not relevant to the analysis
df = df.drop(['Popularity', 'Production Company', 'Box Office', 'Language', 'Country'], axis=1)

In [100]:
#Drop rows with missing values in IMDB rating
df = df.dropna(subset=['IMDb Rating'])

In [101]:
### Extracting the year of release column from date of release and filling the missing values with mode
df['Release Date'].fillna('July 2023', inplace=True)
years = df['Release Date'].apply(lambda x: (x.split('(')[0])).apply(lambda x: x.split()[-1] if len(x.split(','))==1 else x.split(',')[1]) 
df['Year'] = years
df['Year'] = df['Year'].astype(int)

In [102]:
df.drop('Release Date', axis=1, inplace=True)

In [103]:
# Replacing cast and director columns with the first entry in the list
df['Cast'] = df['Cast'].apply(lambda x: x[1:-1].split(', ')[0] if x != '[]' else np.NaN)
df.dropna(subset=['Cast'], inplace=True)

df['Director'] = df['Director'].apply(lambda x: x[1:-1].split(',')[0] if x!='[]' else np.NaN)
df.dropna(subset=['Director'], inplace=True)

In [104]:
## Cleaning the runtime column by replacing it with number of minutes
df['Runtime'] = df['Runtime'].fillna('120 minutes')
df['Runtime'] = df['Runtime'].apply(lambda x: int(x.split()[0]) if 'hours' not in x else (int(x.split()[0])*60 + int(x.split()[2]) if len(x.split())>2 else int(x.split()[0])*60))

In [105]:
## Opening the Genre column and filling the missing values with mode
df['Genre'] = df['Genre'].apply(lambda x: x[1:-1].split(','))
df = df.explode('Genre')
df['Genre'].fillna(df['Genre'].mode()[0], inplace=True)
df['Genre'] = df['Genre'].apply(lambda x: x.lstrip().rstrip())
df.head()

Unnamed: 0,Movie Name,IMDb Rating,Runtime,Genre,Director,Cast,Year
4,Saawariya,5.2,142,'Drama','Sanjay Leela Bhansali','Ranbir Kapoor',2007
4,Saawariya,5.2,142,'Musical','Sanjay Leela Bhansali','Ranbir Kapoor',2007
4,Saawariya,5.2,142,'Romance','Sanjay Leela Bhansali','Ranbir Kapoor',2007
5,Teri Baaton Mein Aisa Uljha Jiya,7.1,141,'Comedy','Amit Joshi','Shahid Kapoor',2024
5,Teri Baaton Mein Aisa Uljha Jiya,7.1,141,'Drama','Amit Joshi','Shahid Kapoor',2024


# Exploratory Data Analysis

### Descriptive analysis

In [106]:
df.describe()

Unnamed: 0,IMDb Rating,Runtime,Year
count,14801.0,14801.0,14801.0
mean,6.37388,118.789676,2013.286805
std,1.423202,57.803212,13.350599
min,1.2,1.0,1949.0
25%,5.4,121.0,2009.0
50%,6.6,138.0,2019.0
75%,7.4,153.0,2023.0
max,9.9,321.0,2024.0


In [107]:
# Find the row with the highest rating
max_rating_row = df[df['IMDb Rating'] == df['IMDb Rating'].max()]
movie_highest_rating = max_rating_row['Movie Name'].values[0]

print("Movie with the highest rating:", movie_highest_rating)
print("Rating: ", df['IMDb Rating'].max())
print('\n', '='*100, '\n')


# Find the row with the lowest rating
min_rating_row = df[df['IMDb Rating'] == df['IMDb Rating'].min()]
movie_lowest_rating = min_rating_row['Movie Name'].values[0]

print("Movie with the highest rating:", movie_lowest_rating)
print("Rating: ", df['IMDb Rating'].min())

Movie with the highest rating: Gotya
Rating:  9.9


Movie with the highest rating: Sadak 2
Rating:  1.2


In [108]:
# Group the dataset by the 'Director' column and count the number of movies each director has directed
director_counts = df['Director'].value_counts()

# Find the director with the highest number of movies directed
most_prolific_director = director_counts.idxmax()
num_movies_directed = director_counts.max()

print("Director with the most movies directed:", most_prolific_director)
print("Number of movies directed by", most_prolific_director, ":", num_movies_directed)
print('\n', '='*100, '\n')


# Group the dataset by the 'Director' column and count the number of movies each director has directed
director_counts = df['Director'].value_counts()

# Find the director with the lowest number of movies directed
least_prolific_director = director_counts.idxmin()
num_movies_directed = director_counts.min()

print("Director with the most movies directed:", least_prolific_director)
print("Number of movies directed by", least_prolific_director, ":", num_movies_directed)

Director with the most movies directed: 'Priyadarshan'
Number of movies directed by 'Priyadarshan' : 97


Director with the most movies directed: 'J.K. Bharavi'
Number of movies directed by 'J.K. Bharavi' : 1


### Univariate Analysis

In [109]:
fig_year = px.histogram(df, x = 'Year', histnorm='probability density', nbins = 30, color_discrete_sequence = colors)
fig_year.update_traces(selector=dict(type='histogram'))
fig_year.update_layout(title='Distribution of Year', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Year', yaxis_title='Probability Density', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), bargap=0.02, plot_bgcolor = 'white')
fig_year.show()

In [110]:
fig_duration = px.histogram(df, x = 'Runtime', histnorm='probability density', nbins = 40, color_discrete_sequence = colors)
fig_duration.update_traces(selector=dict(type='histogram'))
fig_duration.update_layout(title='Distribution of Duration', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration', yaxis_title='Probability Density', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), bargap=0.02, plot_bgcolor = 'white')
fig_duration.show()

In [111]:
fig_rating = px.histogram(df, x = 'IMDb Rating', histnorm='probability density', nbins = 40, color_discrete_sequence = colors)
fig_rating.update_traces(selector=dict(type='histogram'))
fig_rating.update_layout(title='Distribution of Rating', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Rating', yaxis_title='Probability Density', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), bargap=0.02, plot_bgcolor = 'white')
fig_rating.show()

### Bivariate Analysis

In [112]:
year_avg_rating = df.groupby('Year')['IMDb Rating'].mean().reset_index()

top_5_years = year_avg_rating.nlargest(10, 'IMDb Rating')
fig = px.bar(top_5_years, x='Year', y='IMDb Rating', title='Top 10 Years by Average Rating', color = "IMDb Rating", color_continuous_scale = "darkmint")
fig.update_xaxes(type='category')  
fig.update_layout(xaxis_title='Year', yaxis_title='Average Rating', plot_bgcolor = 'white')
fig.show()

In [113]:
# Group data by Year and calculate the average rating
average_rating_by_year = df.groupby('Year')['IMDb Rating'].mean().reset_index()

# Create the line plot with Plotly Express
fig = px.line(average_rating_by_year, x='Year', y='IMDb Rating', color_discrete_sequence=['#559C9E'])
fig.update_layout(title='Are there any trends in ratings across year?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Year', yaxis_title='Rating', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig.show()

In [114]:
# Group data by Year and calculate the average rating
average_rating_by_year = df.groupby(['Year', 'Genre'])['IMDb Rating'].mean().reset_index()

# Get the top 3 genres
top_3_genres = df['Genre'].value_counts().head(3).index

# Filter the data to include only the top 3 genres
average_rating_by_year = average_rating_by_year[average_rating_by_year['Genre'].isin(top_3_genres)]

# Create the line plot with Plotly Express
fig = px.line(average_rating_by_year, x='Year', y='IMDb Rating', color = "Genre", color_discrete_sequence=['#559C9E', '#0B1F26', '#00CC96'])

# Customize the layout
fig.update_layout(title='Average Rating by Year for Top 3 Genres', xaxis_title='Year', yaxis_title='Average Rating', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor='white')

# Show the plot
fig.show()

In [115]:
fig_dur_rat = px.scatter(df, x = 'Runtime', y = 'IMDb Rating', trendline='ols', color = "IMDb Rating", color_continuous_scale = "darkmint")
fig_dur_rat.update_layout(title='Does length of movie have any impact on rating?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration of Movie in Minutes', yaxis_title='Rating of a movie', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig_dur_rat.show()