<a href="https://colab.research.google.com/github/dkalenov/TMDB-Movie-Data-Analysis/blob/main/TMDB_Movie_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Functions for data processing and EDA

In [2]:
# Function for displaying basic information about the dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def basic_data_preprocessing(dataframe):
    # Convert all column titles to lowercase with "_" between words
    dataframe.columns = [x.lower().replace(' ', '_') for x in dataframe.columns.values]

    # Basic information about the dataset
    rows_num, columns_num = dataframe.shape
    print(f'Number of records: {rows_num}')
    print(f'Number of columns: {columns_num}\n')
    print('-' * 75)

    # Dataset information
    print("\nDataset Information:")
    display(dataframe.info())
    print()
    print('-' * 75)

    # Checking for missing values
    missing_values = dataframe.isnull().sum()
    if missing_values.sum() > 0:
        print("Missing values:")
        display(pd.DataFrame(missing_values, columns=['Missing Count']))
        display(dataframe[dataframe.isna().any(axis=1)])
    else:
        print("No missing values.")
    print('-' * 75)

    # Checking for zero-filled columns
    zero_columns = dataframe.columns[(dataframe == 0.0).any()]
    zero_count = dataframe[zero_columns].apply(lambda x: x.value_counts().get(0, 0))
    if zero_count.sum() != 0:
        print('Rows with value 0 found:')
        display(pd.DataFrame(zero_count, columns=['Zero Count']))
    else:
        print('No rows with value 0.')
    print('-' * 75)

    # Checking for duplicates
    if dataframe.duplicated().sum() > 0:
        print("Duplicate data found. Number of duplicates:", dataframe.duplicated().sum())
        display(dataframe[dataframe.duplicated(keep=False)])
    else:
        print("No duplicates found.")
    print('-' * 75)

    # Checking for negative values in the dataframe
    # Excluding columns with string data type (object type)
    dataframe_digits = dataframe.select_dtypes(exclude=['object'])
    if (dataframe_digits < 0).sum().sum() > 0:
        print("Negative values found:")
        display(pd.DataFrame((dataframe_digits < 0).sum(), columns=['Negative Value Count']))
    else:
        print('No negative values found.')
    print('-' * 75)

    # Statistical description of the data
    print("\nStatistical Description of the Data:\n")
    display(dataframe.describe())
    print('-' * 75)

In [3]:
# Function for compression of the data type of columns in a pandas DataFrame

import pandas as pd
import numpy as np

def data_type_compression(dataframe):

    for column in dataframe.columns:
        unique_values = dataframe[column].unique()

        # Check if the column has two unique values (potentially a boolean column)
        # Uncomment the code below if you want to convert such columns to boolean type
        # if len(unique_values) == 2:
        #   dataframe[column] = dataframe[column].astype(bool)

        # Check if the column has an integer data type
        if dataframe[column].dtype == int:
            max_value = np.max(dataframe[column])

            # Assign the appropriate integer type based on the maximum value in the column
            if max_value <= np.iinfo(np.int8).max:
                dataframe[column] = dataframe[column].astype(np.int8)
            elif max_value <= np.iinfo(np.int16).max:
                dataframe[column] = dataframe[column].astype(np.int16)
            elif max_value <= np.iinfo(np.int32).max:
                dataframe[column] = dataframe[column].astype(np.int32)
            else:
                dataframe[column] = dataframe[column].astype(np.int64)

        # Check if the column has a float data type
        elif dataframe[column].dtype == float:
            max_value = np.max(dataframe[column])

            # Assign the appropriate float type based on the maximum value in the column
            if max_value <= np.finfo(np.float16).max:
                dataframe[column] = dataframe[column].astype(np.float16)
            elif max_value <= np.finfo(np.float32).max:
                dataframe[column] = dataframe[column].astype(np.float32)
            else:
                dataframe[column] = dataframe[column].astype(np.float64)

In [5]:
# Function for visualizing the distribution of different features by a target feature

import matplotlib.pyplot as plt
import seaborn as sns

def data_distribution_by_target(dataframe, target_feature):
    for column in dataframe.columns:
        if column != target_feature:
            # Create a new figure with size (8, 6)
            plt.figure(figsize=(8, 6))

            # Set style for grid (white with no grid lines)
            sns.set_style("whitegrid", {'axes.grid': False})

            # Categorical feature: barplot
            if dataframe[column].dtype == 'object':
                #plt.xticks(rotation=45, ha='right')
                plot = sns.barplot(x=column, y=target_feature, data=dataframe)

            # Date/time feature: lineplot
            elif dataframe[column].dtype == 'datetime64[ns]':
                plt.xticks(rotation=45, ha='right')
                plot = sns.lineplot(x=column, y=target_feature, data=dataframe)

            # Boolean feature: barplot
            elif dataframe[column].dtype == 'bool':
                plt.xticks([0, 1], ['False', 'True'])
                #plt.xticks(rotation=45, ha='right')
                plot = sns.barplot(x=column, y=target_feature, data=dataframe)

            # Numeric feature: histogram with kernel density estimation (kde)
            elif dataframe[column].dtype in ['float64', 'float32', 'float16']:
                # Determine the number of bins for the histogram
                n_bins = min(30, len(dataframe[column].unique()))
                # Calculate the bin width and range
                binwidth = (dataframe[column].max() - dataframe[column].min()) / n_bins
                binrange = (dataframe[column].min(), dataframe[column].max())
                # Create a histogram with KDE
                sns.histplot(data=dataframe, x=column, bins=n_bins, binwidth=binwidth, binrange=binrange, kde=True, cumulative=False)
                plt.axvline(x=dataframe[column].mean(), color='r', linestyle='--', linewidth=2)   # Add a vertical line at the mean value
                #plt.xticks(rotation=45, ha='right')

            # Other feature types: scatterplot with regression line or countplot
            else:
                if dataframe[column].nunique() > 6:
                    # Create a scatterplot with regression line
                    plot = sns.scatterplot(data=dataframe, x=column, y=target_feature, alpha=0.7)
                    sns.regplot(data=dataframe, x=column, y=target_feature, scatter=False, lowess=True, line_kws={"color": "C1"})
                    # plt.xticks(rotation=45, ha='right')
                    plt.axvline(x=dataframe[column].mean(), color='r', linestyle='--', linewidth=2)
                else:
                    # Create a countplot
                    plot = sns.countplot(data=dataframe, x=column, y=target_feature)

            plot.set_title(f'Data Distribution by {target_feature}: {column}')  # Set title
            plot.set_xlabel(column)  # Set x-axis label
            plot.set_ylabel(target_feature)  # Set y-axis label
            plt.tight_layout() # Adjust plot layout
            plt.show()

## EDA

In [9]:
import sqlite3
import pandas as pd
#import ydata_profiling
import matplotlib.pyplot as plt

In [10]:
import pandas as pd

url = 'https://raw.githubusercontent.com/dkalenov/TMDB-Movie-Data-Analysis/main/movies_tmdb.csv'
data = pd.read_csv(url)
data.head()

Unnamed: 0.1,Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
0,0,385687,Fast X,2023-05-17,"['Action', 'Crime', 'Thriller']",English,7.4,1347.0,8363.473,Over many missions and against impossible odds...,340000000.0,"['Universal Pictures', 'Original Film', 'One R...",652000000.0,142.0,The end of the road begins.
1,1,603692,John Wick: Chapter 4,2023-03-22,"['Action', 'Thriller', 'Crime']",English,7.9,2896.0,4210.313,"With the price on his head ever increasing, Jo...",90000000.0,"['Thunder Road', '87Eleven', 'Summit Entertain...",431769200.0,170.0,"No way back, one way out."
2,2,502356,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Family', 'Adventure', 'Fantasy'...",English,7.8,4628.0,3394.458,"While working underground to fix a water main,...",100000000.0,"['Universal Pictures', 'Illumination', 'Ninten...",1308767000.0,92.0,
3,3,569094,Spider-Man: Across the Spider-Verse,2023-05-31,"['Action', 'Adventure', 'Animation', 'Science ...",English,8.8,1160.0,2859.047,"After reuniting with Gwen Stacy, Brooklynâ€™s ...",100000000.0,"['Columbia Pictures', 'Sony Pictures Animation...",313522200.0,140.0,It's how you wear the mask that matters
4,4,536437,Hypnotic,2023-05-11,"['Mystery', 'Thriller', 'Science Fiction']",English,6.5,154.0,2654.854,A detective becomes entangled in a mystery inv...,70000000.0,"['Studio 8', 'Solstice Productions', 'Ingeniou...",0.0,94.0,Control is an illusion.


In [11]:
df = data.copy()

In [12]:
basic_data_preprocessing(df)

Number of records: 10001
Number of columns: 15

---------------------------------------------------------------------------

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unnamed:_0            10001 non-null  object 
 1   id                    10001 non-null  int64  
 2   title                 10001 non-null  object 
 3   release_date          9978 non-null   object 
 4   genres                10001 non-null  object 
 5   original_language     10000 non-null  object 
 6   vote_average          10000 non-null  float64
 7   vote_count            10000 non-null  float64
 8   popularity            10000 non-null  float64
 9   overview              9924 non-null   object 
 10  budget                9999 non-null   float64
 11  production_companies  9999 non-null   object 
 12  revenue               9999

None


---------------------------------------------------------------------------
Missing values:


Unnamed: 0,Missing Count
unnamed:_0,0
id,0
title,0
release_date,23
genres,0
original_language,1
vote_average,1
vote_count,1
popularity,1
overview,77


Unnamed: 0,unnamed:_0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
2,2,502356,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Family', 'Adventure', 'Fantasy'...",English,7.8,4628.0,3394.458,"While working underground to fix a water main,...",100000000.0,"['Universal Pictures', 'Illumination', 'Ninten...",1.308767e+09,92.0,
12,12,1010581,My Fault,2023-06-08,"['Romance', 'Drama']",Spanish,8.3,313.0,1170.670,"Noah must leave her city, boyfriend, and frien...",0.0,"['Pokeepsie Films', 'Amazon Studios']",0.000000e+00,117.0,
19,19,1098110,Blood & Gold,2023-04-21,"['Action', 'Drama', 'War']",German,6.7,164.0,957.200,"At the end of World War II, a German soldier i...",0.0,['Rat Pack Filmproduktion'],0.000000e+00,100.0,
21,21,605886,To Catch a Killer,2023-04-06,"['Action', 'Crime', 'Thriller', 'Mystery']",English,6.9,258.0,920.656,Baltimore. New Year's Eve. A talented but trou...,0.0,"['FilmNation Entertainment', 'RainMaker Films']",2.002210e+05,119.0,
23,23,1115710,The Mount 2,2023-05-12,['Horror'],English,4.5,4.0,916.637,"A year after the incident at the Mount, the po...",0.0,[],0.000000e+00,81.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9975,9974,1129975,Kiss - Hellfest 2023,2023-06-15,['Music'],French,0.0,0.0,12.320,,0.0,[],0.000000e+00,0.0,
9977,9976,110160,Laurence Anyways,2012-05-18,"['Drama', 'Romance']",French,7.7,784.0,12.319,The story of an impossible love between a woma...,9500000.0,"['MK2 Films', 'Lyla Films']",0.000000e+00,168.0,
9979,9978,39858,Entrails of a Beautiful Woman,1986-09-23,"['Horror', 'Science Fiction']",Japanese,4.8,20.0,12.316,A nurse investigates a sex trafficking ring af...,0.0,"['June Theater', 'Nikkatsu Corporation']",0.000000e+00,68.0,
9985,9984,285213,The Pirates,2014-08-06,"['Action', 'Adventure', 'Comedy', 'History', '...",Korean,7.0,175.0,12.309,"At the cusp of the founding of Joseon Dynasty,...",13000000.0,"['Lotte Entertainment', 'Harimao Pictures', 'C...",6.440000e+07,130.0,


---------------------------------------------------------------------------
Rows with value 0 found:


Unnamed: 0,Zero Count
id,1
vote_average,290
vote_count,289
budget,4649
revenue,4395
runtime,175


---------------------------------------------------------------------------
No duplicates found.
---------------------------------------------------------------------------
No negative values found.
---------------------------------------------------------------------------

Statistical Description of the Data:



Unnamed: 0,id,vote_average,vote_count,popularity,budget,revenue,runtime
count,10001.0,10000.0,10000.0,10000.0,9999.0,9999.0,9999.0
mean,300146.5,6.31909,1558.2948,33.54234,19938000.0,60843370.0,100.809581
std,337990.1,1.460605,2887.861217,126.495621,38705820.0,155307300.0,27.850165
min,0.0,0.0,0.0,12.297,0.0,0.0,0.0
25%,11504.0,5.9,143.0,14.599,0.0,0.0,90.0
50%,118406.0,6.6,519.0,18.5825,1250000.0,2133452.0,100.0
75%,537915.0,7.1,1584.0,28.0145,23000000.0,51062300.0,115.0
max,1136631.0,10.0,33822.0,8363.473,460000000.0,2923706000.0,366.0


---------------------------------------------------------------------------


In [13]:
# compress int and float data
data_type_compression(df)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unnamed:_0            10001 non-null  object 
 1   id                    10001 non-null  int32  
 2   title                 10001 non-null  object 
 3   release_date          9978 non-null   object 
 4   genres                10001 non-null  object 
 5   original_language     10000 non-null  object 
 6   vote_average          10000 non-null  float16
 7   vote_count            10000 non-null  float16
 8   popularity            10000 non-null  float16
 9   overview              9924 non-null   object 
 10  budget                9999 non-null   float32
 11  production_companies  9999 non-null   object 
 12  revenue               9999 non-null   float32
 13  runtime               9999 non-null   float16
 14  tagline               7383 non-null   object 
dtypes: float16(4), floa

In [18]:
# Remove columns unnecessary for analysis

df.drop(['unnamed:_0','overview','tagline', 'original_language'], axis=1, inplace=True)