In [3]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [4]:
df = pd.read_csv('/content/imdb_top_1000.csv')

**Project 8: IMDb Movie Ratings & Sentiment Analysis**

In [5]:
df.head(2)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411


Data Cleaning and EDA

In [6]:
df.shape

(1000, 16)

In [7]:
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [8]:
# Check missing values
df.isnull().sum()

Unnamed: 0,0
Poster_Link,0
Series_Title,0
Released_Year,0
Certificate,101
Runtime,0
Genre,0
IMDB_Rating,0
Overview,0
Meta_score,157
Director,0


In [9]:
df.dtypes

Unnamed: 0,0
Poster_Link,object
Series_Title,object
Released_Year,object
Certificate,object
Runtime,object
Genre,object
IMDB_Rating,float64
Overview,object
Meta_score,float64
Director,object


In [10]:
# Change the datatype on the Gross column to float
df['Gross'] = df['Gross'].str.replace(',', '').astype(float)

In [11]:
# Inpute mean of the column in each of the missing values in df
df['Meta_score'] = df['Meta_score'].fillna(df['Meta_score'].mean())


In [12]:
# Imputing mean for the Gross column
df['Gross'] = df['Gross'].fillna(df['Gross'].mean())

In [13]:
# Taking the unique features of the certificate column
df['Certificate'].unique()

array(['A', 'UA', 'U', 'PG-13', 'R', nan, 'PG', 'G', 'Passed', 'TV-14',
       '16', 'TV-MA', 'Unrated', 'GP', 'Approved', 'TV-PG', 'U/A'],
      dtype=object)

In [14]:
## Replace nan with PG certificate
df['Certificate'] = df['Certificate'].fillna('PG')

# Replace '16' with 'PG-13'
df['Certificate'] = df['Certificate'].replace('16', 'PG-13')

In [15]:
# Check missing values again in Certificate
df['Certificate'].isnull().sum()

np.int64(0)

In [16]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


**Questions**

In [17]:
# Genres with the highest rating
df.groupby('Genre')['IMDB_Rating'].mean().sort_values(ascending=False)

Unnamed: 0_level_0,IMDB_Rating
Genre,Unnamed: 1_level_1
"Animation, Drama, War",8.50
"Action, Sci-Fi",8.40
"Drama, Musical",8.40
"Drama, Mystery, War",8.35
Western,8.35
...,...
"Action, Adventure, Mystery",7.60
"Action, Adventure, Family",7.60
"Action, Adventure, Crime",7.60
"Animation, Drama, Romance",7.60


Animation, Drama and war is the genre with the highest rating

In [18]:
# Effect of Runtime on Rating
df.groupby('Runtime')['IMDB_Rating'].mean().sort_values(ascending=False)

Unnamed: 0_level_0,IMDB_Rating
Runtime,Unnamed: 1_level_1
175 min,9.20
195 min,8.90
207 min,8.60
179 min,8.45
229 min,8.40
...,...
157 min,7.75
242 min,7.70
79 min,7.70
71 min,7.65


Movies with high runtime attracts lesser Ratings. Optimum runtime attacks the highest rating and movies that are too short likewise has reduced rating

Sentiment Analysis on overview


In [19]:
# Create a new dataFrame with Overviews and Genres
df_new = df[['Overview', 'Genre']]

In [20]:
df_new

Unnamed: 0,Overview,Genre
0,Two imprisoned men bond over a number of years...,Drama
1,An organized crime dynasty's aging patriarch t...,"Crime, Drama"
2,When the menace known as the Joker wreaks havo...,"Action, Crime, Drama"
3,The early life and career of Vito Corleone in ...,"Crime, Drama"
4,A jury holdout attempts to prevent a miscarria...,"Crime, Drama"
...,...,...
995,A young New York socialite becomes interested ...,"Comedy, Drama, Romance"
996,Sprawling epic covering the life of a Texas ca...,"Drama, Western"
997,"In Hawaii in 1941, a private is cruelly punish...","Drama, Romance, War"
998,Several survivors of a torpedoed merchant ship...,"Drama, War"


In [21]:
# Take the value_count of Genres
df_new['Genre'].value_counts()

Unnamed: 0_level_0,count
Genre,Unnamed: 1_level_1
Drama,85
"Drama, Romance",37
"Comedy, Drama",35
"Comedy, Drama, Romance",31
"Action, Crime, Drama",30
...,...
"Action, Adventure, Family",1
"Action, Crime, Mystery",1
"Animation, Drama, Romance",1
"Drama, War, Western",1
