In [1]:
import pandas as pd 
import numpy as np 
import re 
import matplotlib.pyplot as plt 
from bs4 import BeautifulSoup 
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.express as px

In [2]:
df = pd.read_csv("../data/TMDB_movie_dataset_v11.csv", encoding='utf-8')

In [3]:
df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [4]:
df.dtypes

id                        int64
title                    object
vote_average            float64
vote_count                int64
status                   object
release_date             object
revenue                   int64
runtime                   int64
adult                      bool
backdrop_path            object
budget                    int64
homepage                 object
imdb_id                  object
original_language        object
original_title           object
overview                 object
popularity              float64
poster_path              object
tagline                  object
genres                   object
production_companies     object
production_countries     object
spoken_languages         object
keywords                 object
dtype: object

In [5]:
#drop duplicate movies
print(df["id"].is_unique)
df = df.drop_duplicates(subset="id", keep="first")
print(df["id"].is_unique)


False
True


In [6]:
#keep only the relevant columns
df_relevant = df[['title', 'release_date', 'vote_average','revenue','overview']].copy(deep = True)

In [7]:
df_relevant['release_date'] = pd.to_datetime(df_relevant['release_date'], format='%Y-%m-%d')
print(df_relevant['release_date'].isna().sum())
# Add a new column 'year' with the year extracted from the date
df_relevant['year'] = df_relevant['release_date'].dt.year
df_relevant['month'] = df_relevant['release_date'].dt.month

print(df_relevant['year'].isna().sum())
print(df_relevant['month'].isna().sum())

df_relevant['revenue'] = df_relevant['revenue'].replace(0, np.nan)

df_relevant['vote_average'] = df_relevant['vote_average'].replace(0, np.nan)


185133
185133
185133


In [8]:
df_relevant.head(5)

Unnamed: 0,title,release_date,vote_average,revenue,overview,year,month
0,Inception,2010-07-15,8.364,825532800.0,"Cobb, a skilled thief who commits corporate es...",2010.0,7.0
1,Interstellar,2014-11-05,8.417,701729200.0,The adventures of a group of explorers who mak...,2014.0,11.0
2,The Dark Knight,2008-07-16,8.512,1004558000.0,Batman raises the stakes in his war on crime. ...,2008.0,7.0
3,Avatar,2009-12-15,7.573,2923706000.0,"In the 22nd century, a paraplegic Marine is di...",2009.0,12.0
4,The Avengers,2012-04-25,7.71,1518816000.0,When an unexpected enemy emerges and threatens...,2012.0,4.0


In [9]:
df_movies=pd.read_csv("../generated/" + "cleaned_data_with_theme.csv",index_col = 'Wikipedia movie ID')
nan_counts_before = df_movies.isna().sum()
print(nan_counts_before)

Movie name                      0
Movie release month         42365
Movie release year           6901
Movie release date          42365
Movie box office revenue    73337
Movie runtime               20450
Movie languages                 0
Movie countries                 0
Movie genres                    0
Summary                     39532
Sentiment                   39532
Compound Score              39532
Theme                       39643
dtype: int64


In [10]:
# even duplicates with different ID's

duplicates_relevant = df_relevant.duplicated(subset=['year', 'title']).sum()
print(f"Duplicates in df_relevant: {duplicates_relevant}")

duplicates_movies = df_movies.duplicated(subset=['Movie release year', 'Movie name']).sum()
print(f"Duplicates in df_movies: {duplicates_movies}")

#duplicates = df_movies[df_movies.duplicated(keep=False)]  # `keep=False` marks all duplicates
#print(duplicates)


df_movies = df_movies.drop_duplicates(subset=['Movie release year', 'Movie name'], keep="first")
df_relevant = df_relevant.drop_duplicates(subset=['year', 'title'], keep="first")

duplicates_relevant = df_relevant.duplicated(subset=['year', 'title']).sum()
print(f"Duplicates in df_relevant: {duplicates_relevant}")

duplicates_movies = df_movies.duplicated(subset=['Movie release year', 'Movie name']).sum()
print(f"Duplicates in df_movies: {duplicates_movies}")

Duplicates in df_relevant: 27344
Duplicates in df_movies: 187
Duplicates in df_relevant: 0
Duplicates in df_movies: 0


In [11]:
print(df_relevant.shape)
print(df_movies.shape)

(1105379, 7)
(81550, 13)


In [12]:
df_merged = pd.merge(
    df_movies, 
    df_relevant,
    left_on=['Movie release year', 'Movie name'],
    right_on=['year', 'title'],
    how='left')


df_merged.index = df_movies.index
print(df_merged.shape)

df_merged.head(5)


(81550, 20)


Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,title,release_date,vote_average,revenue,overview,year,month
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913,Survival,Ghosts of Mars,2001-08-24,5.127,14010832.0,"In 2176, a Martian police unit is sent to pick...",2001.0,8.0
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",,,,,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,,Dramatization of the story behind the murder o...,2000.0,2.0
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,,,,NaT,,,,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983,Nihilism,,NaT,,,,,
261236,A Woman in Flames,,1983.0,,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604,Empowerment,A Woman in Flames,1983-05-11,5.3,,"Eva, an upper-class housewife, frustratedly le...",1983.0,5.0


In [13]:
inflation_data = {
    1914: 1.3, 1915: 0.9, 1916: 7.7, 1917: 17.8, 1918: 17.3, 
    1919: 15.2, 1920: 15.6, 1921: -10.9, 1922: -6.2, 1923: 1.8, 
    1924: 0.4, 1925: 2.4, 1926: 0.9, 1927: -1.9, 1928: -1.2, 
    1929: 0.0, 1930: -2.7, 1931: -8.9, 1932: -10.3, 1933: -5.2, 
    1934: 3.5, 1935: 2.6, 1936: 1.0, 1937: 3.7, 1938: -2.0, 
    1939: -1.3, 1940: 0.7, 1941: 5.1, 1942: 10.9, 1943: 6.0, 
    1944: 1.6, 1945: 2.3, 1946: 8.5, 1947: 14.4, 1948: 7.7, 
    1949: -1.0, 1950: 1.1, 1951: 7.9, 1952: 2.3, 1953: 0.8, 
    1954: 0.3, 1955: -0.3, 1956: 1.5, 1957: 3.3, 1958: 2.7, 
    1959: 1.08, 1960: 1.5, 1961: 1.1, 1962: 1.2, 1963: 1.2, 
    1964: 1.3, 1965: 1.6, 1966: 3.0, 1967: 2.8, 1968: 4.3, 
    1969: 5.5, 1970: 5.8, 1971: 4.3, 1972: 3.3, 1973: 6.2, 
    1974: 11.1, 1975: 9.1, 1976: 5.7, 1977: 6.5, 1978: 7.6, 
    1979: 11.3, 1980: 13.5, 1981: 10.3, 1982: 6.1, 1983: 3.2, 
    1984: 4.3, 1985: 3.5, 1986: 1.9, 1987: 3.7, 1988: 4.1, 
    1989: 4.8, 1990: 5.4, 1991: 4.2, 1992: 3.0, 1993: 3.0, 
    1994: 2.6, 1995: 2.8, 1996: 2.9, 1997: 2.3, 1998: 1.6, 
    1999: 2.2, 2000: 3.4, 2001: 2.8, 2002: 1.6, 2003: 2.3, 
    2004: 2.7, 2005: 3.4, 2006: 3.2, 2007: 2.9, 2008: 3.8, 
    2009: -0.4, 2010: 1.6, 2011: 3.2, 2012: 2.1, 2013: 1.5, 
    2014: 1.6, 2015: 0.1, 2016: 1.3, 2017: 2.1, 2018: 2.4, 
    2019: 1.8, 2020: 1.2, 2021: 4.7, 2022: 8.0, 2023: 4.1, 2024: 3.2
}

#source : minesota website

def adjust_for_inflation(year, amount, inflation_data):
    """
    Adjusts the given amount from the provided year to 2024 based on annual inflation rates.

    Parameters:
    year (int): The starting year.
    amount (float): The amount to be adjusted.
    inflation_data (dict): A dictionary with years as keys and inflation rates as values.

    Returns:
    float: The inflation-adjusted amount for 2024.
    """
    adjusted_value = amount
    
    for y in range(year.astype(int), 2024):
        if y in inflation_data:
            inflation_rate = inflation_data[y]
            # Adjust for inflation for each year
            adjusted_value *= (1 + inflation_rate / 100)
    return adjusted_value


for i in range(len(df_merged)):
    if pd.notna(df_merged.iloc[i]['Movie release year']) and pd.notna(df_merged.iloc[i]['revenue']):
        release_year = df_merged['Movie release year'].iloc[i]
        
        adjusted_revenue = adjust_for_inflation(release_year, df_merged.iloc[i]['revenue'], inflation_data)
        
        df_merged.iloc[i, df_merged.columns.get_loc('revenue')] = adjusted_revenue
 
df_merged.head(5)

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,title,release_date,vote_average,revenue,overview,year,month
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913,Survival,Ghosts of Mars,2001-08-24,5.127,24771180.0,"In 2176, a Martian police unit is sent to pick...",2001.0,8.0
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",,,,,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,,Dramatization of the story behind the murder o...,2000.0,2.0
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,,,,NaT,,,,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983,Nihilism,,NaT,,,,,
261236,A Woman in Flames,,1983.0,,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604,Empowerment,A Woman in Flames,1983-05-11,5.3,,"Eva, an upper-class housewife, frustratedly le...",1983.0,5.0


In [14]:
df_merged['Summary'] = df_merged['Summary'].fillna(df_merged['overview'])

df_merged['Movie box office revenue'] = df_merged['Movie box office revenue'].fillna(df_merged['revenue'])




In [15]:
df_merged['Movie release date'] = pd.to_datetime(df_merged['Movie release date']).dt.strftime('%Y-%m-%d')
df_merged['Movie release month'] = df_merged['Movie release month'].fillna(df_merged['month'])
df_merged['Movie release year'] = df_merged['Movie release year'].fillna(df_merged['year'])
df_merged['Movie release date'] = df_merged['Movie release date'].fillna(df_merged['release_date'])
df_merged['Movie release date'] = pd.to_datetime(df_merged['Movie release date']).dt.strftime('%Y-%m-%d')

df_merged.drop(columns = ['release_date','year','month','overview','revenue','release_date','title'],inplace = True)

df_merged.head()

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,vote_average
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913,Survival,5.127
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",Dramatization of the story behind the murder o...,,,,
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983,Nihilism,
261236,A Woman in Flames,5.0,1983.0,1983-05-11,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604,Empowerment,5.3


In [16]:
nan_counts = df_merged.isna().sum()
print(nan_counts)

Movie name                      0
Movie release month         21171
Movie release year           6834
Movie release date          21171
Movie box office revenue    70535
Movie runtime               20379
Movie languages                 0
Movie countries                 0
Movie genres                    0
Summary                     19378
Sentiment                   39438
Compound Score              39438
Theme                       39549
vote_average                39682
dtype: int64


In [41]:
df_merged.to_csv("../generated/merged_movies_data.csv")

In [18]:
nan_counts_before-nan_counts

Compound Score                 94.0
Movie box office revenue     2802.0
Movie countries                 0.0
Movie genres                    0.0
Movie languages                 0.0
Movie name                      0.0
Movie release date          21194.0
Movie release month         21194.0
Movie release year             67.0
Movie runtime                  71.0
Sentiment                      94.0
Summary                     20154.0
Theme                          94.0
vote_average                    NaN
dtype: float64

In [19]:
df_merged.sample(5)

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,vote_average
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
22290948,Spain Again,2.0,1969.0,1969-02-03,,108.0,"['English', 'Spanish']",['Spain'],['Drama'],David (Mark Stevens) is a physician who return...,,,,5.8
21582030,Ishq Be Parwah,,2008.0,,,,['Urdu'],['Pakistan'],[''],,,,,
3546190,The New Maverick,9.0,1978.0,1978-09-03,,100.0,['English'],[''],['Western'],Gambling brothers Bret (James Garner) and Bart...,,,,8.0
33699244,All About My Wife,5.0,2012.0,2012-05-17,,121.0,[''],['South Korea'],"['Romance Film', 'Comedy film']","After seven years of marriage, mild-mannered D...",positive,0.9194,Deception,6.6
4569027,Chupke Chupke,4.0,1975.0,1975-04-11,,150.0,"['Hindi', 'Urdu']",['India'],"['Comedy', 'World cinema']",Professor Parimal Tripathi is a botany profes...,positive,0.9346,Deception,7.1
