# **Top 10000 Popular Movies Dataset**

### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Reading csv file as df

In [None]:
# Load the TMDB export from the current working directory.
df = pd.read_csv('./popular_10000_movies_tmdb.csv')
# Preview the first five rows (5 is also head()'s default).
df.head(5)

### Shape of DataFrame

In [None]:
# (number of rows, number of columns) of the freshly loaded DataFrame
df.shape

### List of columns in DataFrame

In [None]:
# Column labels of the freshly loaded DataFrame
df.columns

In [None]:
# Look at the tagline stored on the row labelled 0.
df.loc[0, 'tagline']

### Dropping useless columns from DataFrame

In [None]:
# Remove the id, overview and tagline columns from df in place.
df.drop(columns=['id', 'overview', 'tagline'], inplace=True)

### List of final columns

In [None]:
# Column labels that remain after the drop above
df.columns

### Checking for null values in DataFrame

In [None]:
# Number of missing values in each column (isna is an alias of isnull).
df.isna().sum()

### Dropping rows with null values from DataFrame

In [None]:
# Discard, in place, every row that has at least one missing value.
df.dropna(axis='index', inplace=True)

### Unique languages present in "original_language" column

In [None]:
# Distinct values found in the original_language column
df['original_language'].unique()

### Rows with "xx" as original_language

In [None]:
# Rows whose original_language is the literal string 'xx'.
df[df['original_language'].eq('xx')]

### Dropping rows with "xx" original_language

In [None]:
# Keep only the rows whose original_language is not 'xx'.
df = df[df['original_language'] != 'xx']

### Rows with "sh" as original_language

In [None]:
# Rows whose original_language is the literal string 'sh'.
df[df['original_language'].eq('sh')]

### Dropping rows with "sh" original_language

In [None]:
# Keep only the rows whose original_language is not 'sh'.
df = df[df['original_language'] != 'sh']

### No. of duplicate entries in the DataFrame

In [None]:
# Number of rows that exactly repeat an earlier row (all columns compared)
df.duplicated().sum()

### No. of movies with unique titles

In [None]:
# Number of distinct titles
df['title'].nunique()

### No. of movies with same title

In [None]:
# Number of rows whose title already appeared on an earlier row
# (the first occurrence of each title is not counted)
df['title'].duplicated().sum()

### Rows with duplicate movie titles

In [None]:
# Every row whose title occurs more than once (first occurrences included),
# ordered by title so the repeats sit next to each other.
df[df['title'].duplicated(keep=False)].sort_values('title')

### Rows with empty genres column

In [None]:
# Rows whose genres string is exactly two characters long
# (presumably the literal '[]', i.e. an empty list - confirm against the data).
df[df['genres'].str.len().eq(2)]

### Dropping rows with empty genres column

In [None]:
# Keep only rows whose genres string is not two characters long.
df = df[df['genres'].str.len() != 2]

### Rows with empty production_companies column

In [None]:
# Rows whose production_companies string is exactly two characters long
# (presumably the literal '[]' - confirm against the data).
df[df['production_companies'].str.len().eq(2)]

### Dropping rows with empty production_companies column

In [None]:
# Keep only rows whose production_companies string is not two characters long.
df = df[df['production_companies'].str.len() != 2]

### Count of zeros in each column

In [None]:
# Per-column count of cells that are equal to 0.
df.eq(0).sum()

### Rows with zero vote_average or vote_count or runtime values

In [None]:
# Rows where at least one of vote_average, vote_count or runtime is 0.
df[df[['vote_average', 'vote_count', 'runtime']].eq(0).any(axis=1)]

### Dropping rows with zero vote_average or vote_count or runtime values

In [None]:
# Keep only rows where vote_average, vote_count and runtime are all non-zero.
df = df[df[['vote_average', 'vote_count', 'runtime']].ne(0).all(axis=1)]

### Descriptive Statistics of DataFrame

In [None]:
# Summary statistics with describe()'s default column selection;
# pass include='object' instead to summarise the string columns.
df.describe()

### Adding profit column in DataFrame

In [None]:
# profit = revenue - budget, computed row by row.
df['profit'] = df['revenue'].sub(df['budget'])

In [None]:
# First five rows, now including the new profit column
df.head()

### Saving our DataFrame as csv file

In [None]:
# Write the cleaned DataFrame to movies-2.csv in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('movies-2.csv', index=False)

### Top 10 Movies of Every Category

In [None]:
def _top10(column, ascending=False):
    """Return the first ten rows of df after sorting it by *column*."""
    return df.sort_values(column, ascending=ascending).head(10)


# Ten largest values of each measure.
max_budget = _top10('budget')
max_revenue = _top10('revenue')
max_profit = _top10('profit')
max_popularity = _top10('popularity')
max_vc = _top10('vote_count')
max_va = _top10('vote_average')

# Ten smallest profits; 'loss' is the profit with its sign flipped.
max_loss = _top10('profit', ascending=True)
max_loss['loss'] = -max_loss['profit']

### Visualization of Top 10 Movies of Every Category

In [None]:
# One horizontal bar chart per ranking, stacked vertically.
# Each entry: (top-10 frame, value column, colormap name, title, x-axis label).
money_panels = [
    (max_budget, 'budget', 'tab20', 'Top 10 highest budget movies', 'Budget'),
    (max_revenue, 'revenue', 'tab10', 'Top 10 highest revenue movies', 'Revenue'),
    (max_profit, 'profit', 'tab20c', 'Top 10 most profitable movies', 'Profit'),
    (max_loss, 'loss', 'tab20', 'Top 10 most loss making movies', 'Loss'),
]

fig, axs = plt.subplots(4, 1, figsize=(10, 28))

for ax, (frame, column, cmap_name, heading, x_label) in zip(axs, money_panels):
    col_map = plt.get_cmap(cmap_name)
    ax.barh(frame['title'], frame[column], color=col_map.colors)
    ax.set_title(heading)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Movies')
    # Write each bar's value at the end of the bar.
    for movie, amount in zip(frame['title'], frame[column]):
        ax.text(amount, movie, amount)

plt.show()

In [None]:
# One horizontal bar chart per ranking, stacked vertically.
# Each entry: (top-10 frame, value column, colormap name, title, x-axis label).
vote_panels = [
    (max_popularity, 'popularity', 'tab20', 'Top 10 most popular movies', 'Popularity'),
    (max_vc, 'vote_count', 'tab10', 'Top 10 highest vote count movies', 'Vote Count'),
    (max_va, 'vote_average', 'tab20c', 'Top 10 highest vote average movies', 'Vote Average'),
]

fig, axs = plt.subplots(3, 1, figsize=(10, 21))

for ax, (frame, column, cmap_name, heading, x_label) in zip(axs, vote_panels):
    col_map = plt.get_cmap(cmap_name)
    ax.barh(frame['title'], frame[column], color=col_map.colors)
    ax.set_title(heading)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Movies')
    # Write each bar's value at the end of the bar.
    for movie, amount in zip(frame['title'], frame[column]):
        ax.text(amount, movie, amount)

plt.show()

### Count of Movies wrt Language in Dataset

In [None]:
# Number of rows per original_language value, most frequent first
df['original_language'].value_counts()

### No. of all unique genres in Dataset

In [None]:
import ast

genres_col = df['genres']
# Collect every genre name that appears anywhere in the column.
unique_genres = set()
for genres_list in genres_col:
    # Each cell holds the text form of a Python list. literal_eval parses
    # literals only, so unexpected CSV content raises instead of being
    # executed as code (which eval would do).
    unique_genres.update(ast.literal_eval(genres_list))

len(unique_genres)

### 5 Most Popular Movies of Every Genre

In [None]:
# sorted() makes the print order reproducible; iterating the set directly
# can give a different order from one interpreter session to the next.
for genre in sorted(unique_genres):
    # regex=False: treat the genre name as plain text, not as a pattern.
    in_genre = df['genres'].str.contains(genre, regex=False)
    movies = df[in_genre].sort_values('popularity', ascending=False).head(5)
    print(genre)
    for rank, title in enumerate(movies['title'], start=1):
        print(rank, title)
    print('\n')

### 5 Most Profitable Movies of Every Genre

In [None]:
# sorted() makes the print order reproducible; iterating the set directly
# can give a different order from one interpreter session to the next.
for genre in sorted(unique_genres):
    # regex=False: treat the genre name as plain text, not as a pattern.
    in_genre = df['genres'].str.contains(genre, regex=False)
    movies = df[in_genre].sort_values('profit', ascending=False).head(5)
    print(genre)
    for rank, title in enumerate(movies['title'], start=1):
        print(rank, title)
    print('\n')

### Top 5 production companies with maximum movie count

In [None]:
import ast
from collections import Counter

companies_column = df['production_companies']

# Tally how many rows list each production company.
company_counts = Counter()
for companies_list in companies_column:
    # Each cell holds the text form of a Python list. literal_eval parses
    # literals only, so unexpected CSV content raises instead of being
    # executed as code (which eval would do).
    company_counts.update(ast.literal_eval(companies_list))

# (company, count) pairs, highest count first; ties keep first-seen order.
sorted_companies = company_counts.most_common()

top_5_companies = sorted_companies[:5]

for company, count in top_5_companies:
    print(company, ': ', count)

### Most popular movie in each 5-year period, with its popularity

In [None]:
# Convert the release_date column to datetime so the year can be extracted.
df['release_date'] = pd.to_datetime(df['release_date'])

# First year of the five-year window each film falls in (e.g. 2013 -> 2010).
df['release_hd'] = (df['release_date'].dt.year // 5) * 5

# Pick the single most popular row of every half-decade. idxmax returns the
# row label of the maximum per group, so every column (release_hd included)
# is still present on the selected rows. groupby(...).apply(...) is avoided
# because newer pandas versions exclude the grouping column from the frames
# passed to apply, which would break the r['release_hd'] lookups below.
top_labels = df.groupby('release_hd')['popularity'].idxmax()
popular_movies = df.loc[top_labels]

for _, r in popular_movies.iterrows():
    print(f"Year: {r['release_hd']}-{r['release_hd']+4}")
    print("Movie:", r['title'])
    print("Popularity:", r['popularity'])
    print("\n")

### Average Popularity by Year

In [None]:
# Release year as its own column; later cells group on it.
df['year'] = df['release_date'].dt.year
# Mean popularity of the films released in each year.
avg_pop = df['popularity'].groupby(df['year']).mean()

### Visualization of Categories by Year

In [None]:
# All six panels plot a yearly mean, so group once and reuse the grouping.
# Relies on the 'year' column that an earlier cell adds to df.
by_year = df.groupby('year')

# Each entry: (column, panel title, y-axis label, line colour).
# A colour of None leaves Matplotlib's default line colour in place.
trend_panels = [
    ('popularity', 'Average Popularity with time', 'Average Popularity', None),
    ('vote_count', 'Average Vote Count with time', 'Vote Count', 'green'),
    ('vote_average', 'Vote Average with time', 'Vote Average', 'red'),
    ('budget', 'Average Budget with time', 'Budget', 'orange'),
    ('revenue', 'Revenue with time', 'Revenue', 'blue'),
    ('profit', 'Profit with time', 'Profit', 'violet'),
]
decade_ticks = np.arange(1900, 2030, step=10)

fig, axs = plt.subplots(3, 2, figsize=(15, 21))

# axs.flat walks the grid row by row: [0,0], [0,1], [1,0], ...
for ax, (column, heading, y_label, line_colour) in zip(axs.flat, trend_panels):
    line_kwargs = {} if line_colour is None else {'color': line_colour}
    ax.plot(by_year[column].mean(), **line_kwargs)
    ax.set_title(heading)
    ax.set_xlabel('Year')
    ax.set_ylabel(y_label)
    ax.set_xticks(decade_ticks)
    # set_xticks only applies text properties such as rotation when labels
    # are passed too; without labels the keyword is ignored or rejected,
    # depending on the Matplotlib version. tick_params rotates the labels.
    ax.tick_params(axis='x', labelrotation=45)

plt.show()

### Movie Count with time

In [None]:
# Number of rows per release year, in chronological order.
movie_count = df['year'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(12,10))
plt.plot(movie_count, color='yellow')
plt.title('Movies Count with time')
plt.xlabel('Year')
plt.ylabel('Movies Count')
plt.xticks(np.arange(1900, 2030, step=10),rotation=45)
# Annotate the last five years with their counts. tail(5) simply yields
# fewer points when the data covers fewer than five years, where indexing
# with [-5 + i] would raise IndexError.
for year, count in movie_count.tail(5).items():
    ax.text(year, count, count)
plt.show()

### Visualization of the rise and drop in popularity of different genres over time

In [None]:
# One figure per genre: mean popularity of that genre's films by release year.
# sorted() makes the figure order reproducible; iterating the set directly
# can give a different order from one interpreter session to the next.
for genre in sorted(unique_genres):
    # regex=False: treat the genre name as plain text, not as a pattern.
    movies = df[df['genres'].str.contains(genre, regex=False)]
    avg_popularity = movies.groupby('year')['popularity'].mean()
    plt.plot(avg_popularity.index, avg_popularity.values)
    plt.title('Rise or Drop in Popularity of '+str(genre)+' Movies over Time')
    plt.xlabel('Year')
    plt.ylabel('Popularity')
    plt.xticks(np.arange(1900, 2030, step=5),rotation=90)
    plt.grid(axis='y')
    plt.show()

### Visualization of the rise and drop in popularity of different languages over time

In [None]:
# Languages to chart, as they are expected to appear in original_language
# (NOTE(review): assumes the column holds full language names - confirm).
unique_lang = {"English","Japanese","French","Chinese","German","Russian"}

# One figure per language: mean popularity of its films by release year.
# sorted() makes the figure order reproducible; iterating the set directly
# can give a different order from one interpreter session to the next.
for language in sorted(unique_lang):
    # regex=False: treat the language name as plain text, not as a pattern.
    movies = df[df['original_language'].str.contains(language, regex=False)]
    avg_popularity = movies.groupby('year')['popularity'].mean()
    plt.plot(avg_popularity.index, avg_popularity.values, color='red')
    plt.title('Rise or Drop in Popularity in '+str(language)+' Movies over Time')
    plt.xlabel('Year')
    plt.ylabel('Popularity')
    plt.xticks(np.arange(1900, 2030, step=5),rotation=90)
    plt.grid(axis='y')
    plt.show()