In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# path
DATA_FOLDER = 'Data/MovieSummaries/'
MOVIE_DATASET = DATA_FOLDER + 'movie.metadata.tsv'

# Dataset loading
movies = pd.read_csv(MOVIE_DATASET, sep='\t', header=None)

# define the name for each columns
movies.columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue',
                  'Movie runtime', 'Movie languages (Freebase ID:name tuples)', 'Movie countries (Freebase ID:name tuples)',
                  'Movie genres (Freebase ID:name tuples)']


 ## Prepare the movie dataset


### Removing movies with missing values

In [4]:
# Verify the missing values 
print(movies.isna().sum())

# Drop the movies without release dat, box office revenue or runtime 
movies = movies.dropna(subset=['Movie release date'])

# Drop the box office revenue columns 
movies = movies.drop(columns=['Movie box office revenue'])

if movies.isna().sum().sum() == 0:
    print("No missing values")
else:
    print(movies.isna().sum())


Wikipedia movie ID                               0
Freebase movie ID                                0
Movie name                                       0
Movie release date                            6902
Movie box office revenue                     73340
Movie runtime                                20450
Movie languages (Freebase ID:name tuples)        0
Movie countries (Freebase ID:name tuples)        0
Movie genres (Freebase ID:name tuples)           0
dtype: int64
Wikipedia movie ID                               0
Freebase movie ID                                0
Movie name                                       0
Movie release date                               0
Movie runtime                                16208
Movie languages (Freebase ID:name tuples)        0
Movie countries (Freebase ID:name tuples)        0
Movie genres (Freebase ID:name tuples)           0
dtype: int64


### Create a column 'Release year' which contains the release year of the movie

In [5]:
# Convert the release date column in datetime
#error = coerce convert invalid dates to NaT,
movies['Movie release date'] = pd.to_datetime(movies['Movie release date'], errors='coerce')

# Extract the year and convert it in int
movies['Release year'] = movies['Movie release date'].dt.year
movies = movies.dropna(subset=['Release year'])
movies['Release year'] = movies['Release year'].astype(int)


print(movies[['Movie name', 'Movie release date', 'Release year']].head())

                                           Movie name Movie release date  \
0                                      Ghosts of Mars         2001-08-24   
1   Getting Away with Murder: The JonBenét Ramsey ...         2000-02-16   
5                                       The Gangsters         1913-05-29   
7                            Alexander's Ragtime Band         1938-08-16   
12                                        Little city         1997-04-04   

    Release year  
0           2001  
1           2000  
5           1913  
7           1938  
12          1997  


 ### Extract and store the movie genres in a column named 'Movie genres'

In [6]:
import json

# Function to extract genres from a JSON-formatted string
def extract_genres(genre_data):
    if isinstance(genre_data, str):
        try:
            # Replace specific occurrences of 'Children's' with 'Children'
            genre_data = genre_data.replace("Children's", "Children")

            # Convert the genre data into a dictionary
            genres_dict = json.loads(genre_data)
            return ", ".join(genres_dict.values())  # Return the genres as a comma-separated string
        except json.JSONDecodeError:
            print(f"Error decoding JSON: {genre_data}")
            return "Unknown_1"  # Return "Unknown" in case of a parsing error
    else:
        return "Unknown_2"  # If the data is not a string, return "Unknown"

# Apply this function to the movie genres column
movies['Movie genres'] = movies['Movie genres (Freebase ID:name tuples)'].apply(extract_genres)

# Check the results
print(movies[['Movie name', 'Movie genres', 'Movie genres (Freebase ID:name tuples)']].head(5))

# Display the count of different genres
print(movies['Movie genres'].value_counts())


                                           Movie name  \
0                                      Ghosts of Mars   
1   Getting Away with Murder: The JonBenét Ramsey ...   
5                                       The Gangsters   
7                            Alexander's Ragtime Band   
12                                        Little city   

                                         Movie genres  \
0   Thriller, Science Fiction, Horror, Adventure, ...   
1      Mystery, Biographical film, Drama, Crime Drama   
5   Short Film, Silent film, Indie, Black-and-whit...   
7                    Musical, Comedy, Black-and-white   
12  Romantic comedy, Ensemble Film, Comedy-drama, ...   

               Movie genres (Freebase ID:name tuples)  
0   {"/m/01jfsb": "Thriller", "/m/06n90": "Science...  
1   {"/m/02n4kr": "Mystery", "/m/03bxz7": "Biograp...  
5   {"/m/02hmvc": "Short Film", "/m/06ppq": "Silen...  
7   {"/m/04t36": "Musical", "/m/01z4y": "Comedy", ...  
12  {"/m/06cvj": "Romantic comedy"

### Store the language in a column named 'Movie languages'

In [7]:
import json

# Function to extract the language from a JSON-formatted string
def extract_language(language_data):
    if isinstance(language_data, str):
        try:
            # Convert the language data into a dictionary (assuming it's structured like genres)
            languages_dict = json.loads(language_data)
            # Assuming language is one of the values, you can modify this part depending on the actual structure
            return ", ".join(languages_dict.values())  # Return the language(s) as a comma-separated string
        except json.JSONDecodeError:
            print(f"Error decoding JSON: {language_data}")
            return "Unknown_1"  # Return "Unknown" in case of a parsing error
    else:
        return "Unknown_2"  # If the data is not a string, return "Unknown"

# Apply this function to extract the language
movies['Movie languages'] = movies['Movie languages (Freebase ID:name tuples)'].apply(extract_language)

# Check the results
print(movies[['Movie name', 'Movie languages', 'Movie languages (Freebase ID:name tuples)']].head(5))

# Display the count of different languages
print(movies['Movie languages'].value_counts())


                                           Movie name  \
0                                      Ghosts of Mars   
1   Getting Away with Murder: The JonBenét Ramsey ...   
5                                       The Gangsters   
7                            Alexander's Ragtime Band   
12                                        Little city   

                  Movie languages  \
0                English Language   
1                English Language   
5   Silent film, English Language   
7                English Language   
12               English Language   

            Movie languages (Freebase ID:name tuples)  
0                  {"/m/02h40lc": "English Language"}  
1                  {"/m/02h40lc": "English Language"}  
5   {"/m/06ppq": "Silent film", "/m/02h40lc": "Eng...  
7                  {"/m/02h40lc": "English Language"}  
12                 {"/m/02h40lc": "English Language"}  
Movie languages
English Language                                                                  

### Store the movies countries in a column named 'Movie countries'

In [8]:
import json

# Function to extract the countries from a JSON-formatted string
def extract_countries(country_data):
    if isinstance(country_data, str):
        try:
            # Convert the country data into a dictionary (assuming it's structured like genres)
            countries_dict = json.loads(country_data)
            # Assuming countries are one of the values, you can modify this part depending on the actual structure
            return ", ".join(countries_dict.values())  # Return the countries as a comma-separated string
        except json.JSONDecodeError:
            print(f"Error decoding JSON: {country_data}")
            return "Unknown_1"  # Return "Unknown" in case of a parsing error
    else:
        return "Unknown_2"  # If the data is not a string, return "Unknown"

# Apply this function to extract the countries
movies['Movie countries'] = movies['Movie countries (Freebase ID:name tuples)'].apply(extract_countries)

# Check the results
movies[['Movie name', 'Movie countries', 'Movie countries (Freebase ID:name tuples)']].head(5)

Unnamed: 0,Movie name,Movie countries,Movie countries (Freebase ID:name tuples)
0,Ghosts of Mars,United States of America,"{""/m/09c7w0"": ""United States of America""}"
1,Getting Away with Murder: The JonBenét Ramsey ...,United States of America,"{""/m/09c7w0"": ""United States of America""}"
5,The Gangsters,United States of America,"{""/m/09c7w0"": ""United States of America""}"
7,Alexander's Ragtime Band,United States of America,"{""/m/09c7w0"": ""United States of America""}"
12,Little city,United States of America,"{""/m/09c7w0"": ""United States of America""}"


In [None]:
import requests
from tqdm import tqdm
import os
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed

API_KEY = '9923aaa2a3b2777bfdeba7f76c97d212'

def get_movie_data_from_tmdb(wikipedia_id, title):
    params = {
        'api_key': API_KEY,
        'query': title,
        'language': 'en-US'
    }
    BASE_URL = 'https://api.themoviedb.org/3/search/movie'
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        data = response.json()
        if len(data['results']) == 0:
            return None  # No results found
        overview = data['results'][0]['overview']
        tmdb_id = data['results'][0]['id']
        return wikipedia_id, {"overview": overview, "tmdb_id": tmdb_id}
    else:
        print(f"Error fetching TMDB data: {response.status_code} for ID {wikipedia_id}")
        return wikipedia_id, {}

# Load existing data
if os.path.exists('Data/MovieSummaries/movie_data_from_tmdb_only_id.pkl'):
    with open('Data/MovieSummaries/movie_data_from_tmdb_only_id.pkl', 'rb') as file:
        movie_data_from_tmdb = pickle.load(file)
else:
    movie_data_from_tmdb = {}

# Define the list of movies to process
movies_to_process = [
    (movie_wikipedia_id, movie_title)
    for movie_wikipedia_id, movie_title in zip(movies['Wikipedia movie ID'], movies['Movie name'])
    if movie_wikipedia_id not in movie_data_from_tmdb
]

# Set up multithreading
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(get_movie_data_from_tmdb, movie_id, title): movie_id for movie_id, title in movies_to_process}
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result:
            wikipedia_id, movie_info = result
            movie_data_from_tmdb[wikipedia_id] = movie_info

# Save the data
with open('Data/MovieSummaries/movie_data_from_tmdb_only_id.pkl', 'wb') as file:
    pickle.dump(movie_data_from_tmdb, file)


100%|██████████| 12364/12364 [01:03<00:00, 195.06it/s]


In [119]:
def download_from_url(url, save_path):
    # get the name of the file
    file_name = url.split("/")[-1].split("?")[0]
    file_path = os.path.join(save_path, file_name)
    url = url.replace("www.dropbox.com", "dl.dropboxusercontent.com")
    response = requests.get(url)
    os.makedirs(save_path, exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(response.content)
    return file_path

# this is the url to the file that we preporcessed
url = "https://www.dropbox.com/scl/fi/tghsy0x20bn6n6fg6osen/movie_data_from_tmdb.pkl?rlkey=yhs1qfagp61t91go4ejwgxkh4&st=qrbqz43b&dl=0"
save_path = "Data/MovieSummaries/"
file_path = download_from_url(url, save_path)
print(f"File downloaded to: {file_path}")

File downloaded to: Data/MovieSummaries/movie_data_from_tmdb.pkl


In [116]:
def get_movie_details_from_tmdb(wikipedia_id, tmdb_id):
    BASE_URL = f'https://api.themoviedb.org/3/movie/{tmdb_id}'
    params = {
        'api_key': API_KEY,
        'language': 'en-US'
    }
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        return wikipedia_id, "details", response.json()
    else:
        print(f"Error fetching TMDB data: {response.status_code} for ID {wikipedia_id}")
        return wikipedia_id, "details", {}

def get_movie_credits_from_tmdb(wikipedia_id, tmdb_id):
    BASE_URL = f'https://api.themoviedb.org/3/movie/{tmdb_id}/credits'
    params = {
        'api_key': API_KEY
    }
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        return wikipedia_id, "credits", response.json()
    else:
        print(f"Error fetching TMDB data: {response.status_code} for ID {wikipedia_id}")
        return wikipedia_id, "credits", {}

# Load existing data
if os.path.exists('Data/MovieSummaries/movie_data_from_tmdb.pkl'):
    with open('Data/MovieSummaries/movie_data_from_tmdb.pkl', 'rb') as file:
        movie_data_from_tmdb = pickle.load(file)
elif os.path.exists('Data/MovieSummaries/movie_data_from_tmdb_only_id.pkl'):
    with open('Data/MovieSummaries/movie_data_from_tmdb_only_id.pkl', 'rb') as file:
        movie_data_from_tmdb = pickle.load(file)
else:
    raise FileNotFoundError("Run the previous cells to fetch the TMDB data first")

# Define the list of movies to process
movies_to_process = [
    (movie_wikipedia_id, movie_data_from_tmdb[movie_wikipedia_id].get('tmdb_id'))
    for movie_wikipedia_id in movie_data_from_tmdb.keys()
    if movie_data_from_tmdb[movie_wikipedia_id].get('tmdb_id')
]

# Set up multithreading
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for movie_wikipedia_id, tmdb_id in movies_to_process:
        # Schedule both detail and credits requests for each movie
        futures.append(executor.submit(get_movie_details_from_tmdb, movie_wikipedia_id, tmdb_id))
        futures.append(executor.submit(get_movie_credits_from_tmdb, movie_wikipedia_id, tmdb_id))
    
    # Collect and process the results
    for future in tqdm(as_completed(futures), total=len(futures)):
        wikipedia_id, data_type, data = future.result()
        if wikipedia_id in movie_data_from_tmdb:
            movie_data_from_tmdb[wikipedia_id][data_type] = data

# Save the data
with open('Data/MovieSummaries/movie_data_from_tmdb.pkl', 'wb') as file:
    pickle.dump(movie_data_from_tmdb, file)


 67%|██████▋   | 49842/74148 [22:01<11:49, 34.28it/s]

Error fetching TMDB data: 404 for ID 27859176
Error fetching TMDB data: 404 for ID 27859176


 78%|███████▊  | 57985/74148 [25:25<08:16, 32.56it/s]

Error fetching TMDB data: 404 for ID 6965467


 78%|███████▊  | 58001/74148 [25:25<07:04, 38.01it/s]

Error fetching TMDB data: 404 for ID 6965467


100%|██████████| 74148/74148 [32:02<00:00, 38.56it/s]


### Fill the missing values of the column 'Movie runtime'

In [117]:

def get_run_time(wikipedia_id):
    if wikipedia_id not in movie_data_from_tmdb:
        return pd.NA
    data = movie_data_from_tmdb[wikipedia_id]
    if 'details' not in data:
        return pd.NA
    data = data['details']
    if 'runtime' not in data:
        return pd.NA
    return data['runtime']

# Apply the function to fill the missing values 
movies['Movie runtime'] = movies.apply(
    lambda row: row['Movie runtime'] if pd.notnull(row['Movie runtime']) else get_run_time(row['Wikipedia movie ID']),
    axis=1
)

### Add score to the movie dataset

In [None]:
!pip install requests textblob

In [None]:
pd.set_option('display.max_rows', None)

In [118]:
# Import necessary libraries
import pandas as pd
import requests
from textblob import TextBlob


def analyze_sentiment(summary):
    if summary:
        analysis = TextBlob(summary)
        if analysis.sentiment.polarity > 0.5:
            return 5  # Very happy ending
        elif 0.1 < analysis.sentiment.polarity <= 0.5:
            return 4  # Happy ending
        elif -0.1 <= analysis.sentiment.polarity <= 0.1:
            return 3  # Neutral ending
        elif -0.5 < analysis.sentiment.polarity < -0.1:
            return 2  # Sad ending
        else:
            return 1  # Very sad ending
    else:
        return None

results = []
for movie_wikipedia_id in tqdm(movie_data_from_tmdb.keys()):
    data = movie_data_from_tmdb[movie_wikipedia_id]
    if 'overview' in data:
        summary = data['overview']
        score = analyze_sentiment(summary)
        results.append({'Score': score})

results_df = pd.DataFrame(results)

movie_data_cleaned = movies.copy()
movie_data_cleaned['Score'] = pd.Series(results_df['Score'])

output_file_path = 'Data/MovieSummaries/movies_dataset_cleaned.tsv'
movie_data_cleaned.to_csv(output_file_path, sep='\t', index=False)

print(f"Cleaned dataset saved to {output_file_path}")

100%|██████████| 37074/37074 [00:05<00:00, 6484.26it/s]


Cleaned dataset saved to Data/MovieSummaries/movies_dataset_cleaned.tsv
