In [2]:
import pandas as pd 
import numpy as np 
import re 
import matplotlib.pyplot as plt 
from bs4 import BeautifulSoup 
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.express as px

In [3]:
# Load the raw TMDB movie export; the path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/TMDB_movie_dataset_v11.csv",
    encoding="utf-8",
)

In [4]:
# Remove repeated TMDB entries: only the first row of every "id" survives.
# The uniqueness of "id" is printed before and after as a sanity check.
print(df["id"].is_unique)
df = df.drop_duplicates(subset=["id"])  # keep="first" is the pandas default
print(df["id"].is_unique)


False
True


In [5]:
# Work on an independent copy holding only the three TMDB columns used below,
# so later column assignments never touch `df`.
df_relevant = df.loc[:, ['title', 'release_date', 'budget']].copy()  # copy() is deep by default

In [6]:
# Parse the release date strings (YYYY-MM-DD) and report how many dates are missing.
df_relevant = df_relevant.assign(
    release_date=lambda frame: pd.to_datetime(frame['release_date'], format='%Y-%m-%d')
)
print(df_relevant['release_date'].isna().sum())

# Derive the calendar year and month from the parsed date; rows without a date
# get NaN in both columns, which is why the counts printed below match the one above.
df_relevant = df_relevant.assign(
    year=lambda frame: frame['release_date'].dt.year,
    month=lambda frame: frame['release_date'].dt.month,
)
for derived_column in ('year', 'month'):
    print(df_relevant[derived_column].isna().sum())

# A budget of exactly 0 is turned into NaN
# (presumably 0 is the dataset's placeholder for "unknown" - TODO confirm).
df_relevant = df_relevant.assign(
    budget=lambda frame: frame['budget'].replace(0, np.nan)
)


185133
185133
185133


In [7]:
# Preview the first five rows of the trimmed TMDB frame.
df_relevant.iloc[:5]

Unnamed: 0,title,release_date,budget,year,month
0,Inception,2010-07-15,160000000.0,2010.0,7.0
1,Interstellar,2014-11-05,165000000.0,2014.0,11.0
2,The Dark Knight,2008-07-16,185000000.0,2008.0,7.0
3,Avatar,2009-12-15,237000000.0,2009.0,12.0
4,The Avengers,2012-04-25,220000000.0,2012.0,4.0


In [8]:
# Load the previously generated movie table (with themes), indexed by Wikipedia movie ID.
df_movies = pd.read_csv(
    "../generated/" + "cleaned_data_with_theme.csv",
    index_col='Wikipedia movie ID',
)


In [9]:
# The same film can still appear more than once under different IDs, so rows are
# also de-duplicated on the (release year, title) pair that is used as the merge key.
relevant_key = ['year', 'title']
movies_key = ['Movie release year', 'Movie name']

# Report the number of repeated key pairs before cleaning ...
duplicates_relevant = df_relevant.duplicated(subset=relevant_key).sum()
print(f"Duplicates in df_relevant: {duplicates_relevant}")
duplicates_movies = df_movies.duplicated(subset=movies_key).sum()
print(f"Duplicates in df_movies: {duplicates_movies}")

# ... keep only the first occurrence of every key pair in both tables ...
df_movies = df_movies.drop_duplicates(subset=movies_key, keep="first")
df_relevant = df_relevant.drop_duplicates(subset=relevant_key, keep="first")

# ... and report again to confirm that nothing is left.
duplicates_relevant = df_relevant.duplicated(subset=relevant_key).sum()
print(f"Duplicates in df_relevant: {duplicates_relevant}")
duplicates_movies = df_movies.duplicated(subset=movies_key).sum()
print(f"Duplicates in df_movies: {duplicates_movies}")

Duplicates in df_relevant: 27344
Duplicates in df_movies: 187
Duplicates in df_relevant: 0
Duplicates in df_movies: 0


In [10]:
# Row/column counts of both tables after de-duplication (TMDB first, then movies).
for frame in (df_relevant, df_movies):
    print(frame.shape)

(1105379, 5)
(81550, 13)


In [11]:
# Left-join the TMDB columns onto the movie table, matching on (release year, title).
# Movies without a TMDB match keep their row and get NaN/NaT in the TMDB columns.
df_merged = df_movies.merge(
    df_relevant,
    how='left',
    left_on=['Movie release year', 'Movie name'],
    right_on=['year', 'title'],
)

# A column-on-column merge discards the left index, so the Wikipedia movie IDs are
# put back. This relies on the join producing exactly one row per df_movies row,
# in the original order (a length mismatch would raise here).
df_merged.index = df_movies.index
print(df_merged.shape)

df_merged.head(5)


(81550, 18)


Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,title,release_date,budget,year,month
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913,Survival,Ghosts of Mars,2001-08-24,28000000.0,2001.0,8.0
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",,,,,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,2000.0,2.0
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,,,,NaT,,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983,Nihilism,,NaT,,,
261236,A Woman in Flames,,1983.0,,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604,Empowerment,A Woman in Flames,1983-05-11,,1983.0,5.0


In [12]:
# Annual inflation rate in percent, keyed by calendar year (1914 through 2024).
# Negative values are years of deflation. The rates are consumed by
# adjust_for_inflation(), which divides them by 100.
# NOTE(review): presumably US consumer-price inflation - confirm against the source.
# NOTE(review): 1959 is the only entry written with two decimals (1.08); verify it
# against the source table in case it is a typo.
inflation_data = {
    1914: 1.3, 1915: 0.9, 1916: 7.7, 1917: 17.8, 1918: 17.3, 
    1919: 15.2, 1920: 15.6, 1921: -10.9, 1922: -6.2, 1923: 1.8, 
    1924: 0.4, 1925: 2.4, 1926: 0.9, 1927: -1.9, 1928: -1.2, 
    1929: 0.0, 1930: -2.7, 1931: -8.9, 1932: -10.3, 1933: -5.2, 
    1934: 3.5, 1935: 2.6, 1936: 1.0, 1937: 3.7, 1938: -2.0, 
    1939: -1.3, 1940: 0.7, 1941: 5.1, 1942: 10.9, 1943: 6.0, 
    1944: 1.6, 1945: 2.3, 1946: 8.5, 1947: 14.4, 1948: 7.7, 
    1949: -1.0, 1950: 1.1, 1951: 7.9, 1952: 2.3, 1953: 0.8, 
    1954: 0.3, 1955: -0.3, 1956: 1.5, 1957: 3.3, 1958: 2.7, 
    1959: 1.08, 1960: 1.5, 1961: 1.1, 1962: 1.2, 1963: 1.2, 
    1964: 1.3, 1965: 1.6, 1966: 3.0, 1967: 2.8, 1968: 4.3, 
    1969: 5.5, 1970: 5.8, 1971: 4.3, 1972: 3.3, 1973: 6.2, 
    1974: 11.1, 1975: 9.1, 1976: 5.7, 1977: 6.5, 1978: 7.6, 
    1979: 11.3, 1980: 13.5, 1981: 10.3, 1982: 6.1, 1983: 3.2, 
    1984: 4.3, 1985: 3.5, 1986: 1.9, 1987: 3.7, 1988: 4.1, 
    1989: 4.8, 1990: 5.4, 1991: 4.2, 1992: 3.0, 1993: 3.0, 
    1994: 2.6, 1995: 2.8, 1996: 2.9, 1997: 2.3, 1998: 1.6, 
    1999: 2.2, 2000: 3.4, 2001: 2.8, 2002: 1.6, 2003: 2.3, 
    2004: 2.7, 2005: 3.4, 2006: 3.2, 2007: 2.9, 2008: 3.8, 
    2009: -0.4, 2010: 1.6, 2011: 3.2, 2012: 2.1, 2013: 1.5, 
    2014: 1.6, 2015: 0.1, 2016: 1.3, 2017: 2.1, 2018: 2.4, 
    2019: 1.8, 2020: 1.2, 2021: 4.7, 2022: 8.0, 2023: 4.1, 2024: 3.2
}

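# Source: "Minnesota website" (NOTE(review): presumably the Federal Reserve Bank of Minneapolis CPI table - confirm and add the exact URL)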

def adjust_for_inflation(year, amount, inflation_data, target_year=2024):
    """
    Compound an amount forward through annual inflation rates up to a target year.

    The rate stored for every year ``y`` with ``year <= y < target_year`` is
    applied in chronological order. Years that are missing from
    ``inflation_data`` are skipped, i.e. treated as 0 % inflation.

    Parameters:
    year (int or float): The starting year. Any integral number is accepted:
        a Python ``int``/``float`` or a NumPy scalar such as the ``float64``
        values read from a DataFrame column. It is converted with ``int()``.
    amount (float): The amount to be adjusted.
    inflation_data (dict): A dictionary with years as keys and inflation rates
        in percent as values.
    target_year (int): Exclusive upper bound of the years whose rates are
        applied. Defaults to 2024.

    Returns:
    float: The inflation-adjusted amount. ``amount`` is returned unchanged when
        ``year`` is greater than or equal to ``target_year``.

    Raises:
    ValueError: If ``year`` is NaN (it cannot be converted to an integer).
    """
    # int() works for Python numbers and NumPy scalars alike, whereas the former
    # `year.astype(int)` only worked for NumPy values.
    start_year = int(year)

    adjusted_value = amount
    # NOTE(review): the rate of the starting year itself is applied while the
    # rate of `target_year` is not. Confirm this matches how the rate table
    # defines a year's inflation.
    for y in range(start_year, target_year):
        if y in inflation_data:
            inflation_rate = inflation_data[y]
            # Rates are percentages, hence the division by 100.
            adjusted_value *= (1 + inflation_rate / 100)
    return adjusted_value


# Inflate every known budget using the movie's release year.
# Only rows that have both a release year and a budget are touched; all other
# budgets stay NaN.
# NOTE: 'budget' is overwritten in place, so running this cell a second time
# would compound inflation on already adjusted values. Re-run the merge cell first.
has_year_and_budget = (
    df_merged['Movie release year'].notna() & df_merged['budget'].notna()
).to_numpy()

# Iterate over NumPy arrays rather than the Series themselves so every year
# stays a NumPy scalar.
release_years = df_merged.loc[has_year_and_budget, 'Movie release year'].to_numpy()
nominal_budgets = df_merged.loc[has_year_and_budget, 'budget'].to_numpy()

adjusted_budgets = np.asarray(
    [
        adjust_for_inflation(release_year, nominal_budget, inflation_data)
        for release_year, nominal_budget in zip(release_years, nominal_budgets)
    ],
    dtype=float,
)

# One positional, mask-based write instead of one `iloc` row lookup per movie.
df_merged.loc[has_year_and_budget, 'budget'] = adjusted_budgets

df_merged.head(5)

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,title,release_date,budget,year,month
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913,Survival,Ghosts of Mars,2001-08-24,49504060.0,2001.0,8.0
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",,,,,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,2000.0,2.0
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,,,,NaT,,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983,Nihilism,,NaT,,,
261236,A Woman in Flames,,1983.0,,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604,Empowerment,A Woman in Flames,1983-05-11,,1983.0,5.0


In [13]:

# Discard the TMDB-side columns that duplicate information already present in the
# movie table; of the merged-in columns only 'budget' is kept.
df_merged = df_merged.drop(columns=['release_date', 'year', 'month', 'title'])

df_merged.head()

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,budget
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913,Survival,49504060.0
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",,,,,
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983,Nihilism,
261236,A Woman in Flames,,1983.0,,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604,Empowerment,


In [14]:
# Persist the merged table; the Wikipedia movie ID index is written as the first column.
df_merged.to_csv(path_or_buf="../generated/movies_with_budget.csv")

In [35]:
# Number of merged movies that have no box office revenue.
df_merged['Movie box office revenue'].isnull().sum()

73160

In [45]:
# Number of TMDB rows without a budget (zeros were converted to NaN earlier).
df_relevant['budget'].isnull().sum()

1049404

In [47]:
# Total number of TMDB rows, shown as a one-element shape tuple.
(df_relevant['budget'].size,)

(1105379,)

In [49]:
# Scratch arithmetic on the two numbers displayed by the cells above:
# rows with a missing budget (1049404) minus all rows (1105379). The result is
# the negated count of rows that do have a budget.
1049404-1105379

-55975

In [19]:
# Spot-check: show the raw TMDB row(s) for one title.
# The result gets its own name so that `df` keeps holding the full raw table;
# rebinding `df` here would make any re-run of the earlier cells operate on
# just this filtered frame.
ghosts_of_mars_raw = df[df['title'] == 'Ghosts of Mars']
ghosts_of_mars_raw

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
4000,10016,Ghosts of Mars,5.127,977,Released,2001-08-24,14010832,98,False,/anSbunnEMI0TSmizqUSRACoe18l.jpg,...,Ghosts of Mars,"In 2176, a Martian police unit is sent to pick...",14.189,/i2zztssCIbahGES1fdfWFmDXian.jpg,Terror is the same on any planet.,"Action, Horror, Science Fiction","Animationwerks, Screen Gems, Storm King Produc...",United States of America,English,"future, planet mars, anti hero, possession, ho..."


In [23]:
# Same spot-check on the trimmed TMDB frame, to compare with the raw row.
df_ = df_relevant.loc[df_relevant['title'].eq('Ghosts of Mars')]
df_

Unnamed: 0,title,release_date,budget,year,month
4000,Ghosts of Mars,2001-08-24,28000000.0,2001.0,8.0


In [66]:
# Print the complete Theme frequency table without row truncation.
# option_context lifts the row limit only inside the block and restores the
# previous setting afterwards, even if printing raises.
with pd.option_context('display.max_rows', None):
    print(df_merged['Theme'].value_counts())

Theme
Redemption              4822
Love                    2512
Betrayal                2386
Revenge                 2312
Survival                1735
Deception               1304
Identity                 987
Friendship               981
Resilience               975
Adventure                790
Sacrifice                631
Obsession                604
Ambition                 563
Conflict                 548
Isolation                528
Justice                  511
Rebellion                468
ransformation            428
Chaos                    378
Corruption               355
Desire                   354
Family                   339
Desperation              330
Despair                  317
Courage                  316
Empowerment              311
Mystery                  288
Comedy                   269
Reunion                  264
Reconciliation           255
Greed                    243
Perseverance             241
Loss                     225
Resistance               203
Connecti