# Importing the dependencies

In [85]:
import pandas as pd
import sqlite3
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup 

from tmdbv3api import TMDb
import requests

import google.generativeai as genai
import vertexai
from vertexai.generative_models import GenerativeModel, Part

import os
import time
from dotenv import load_dotenv


load_dotenv()
GOOGLE_API_KEY_MOVIE_RECOMMENDER = os.getenv("GOOGLE_API_KEY_MOVIE_RECOMMENDER")
TMDB_API_KEY = os.getenv("TMDB_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY_MOVIE_RECOMMENDER)

# Loading the dataset

In [3]:
database = "../SQL_Database/Movies.db"

database_key_based = pd.read_sql_query("SELECT m.* FROM Movies_Key_Based AS m", sqlite3.connect(database))
database_query_based = pd.read_sql_query("SELECT m.* FROM Movies_Database AS m", sqlite3.connect(database))

In [4]:
database_key_based.head()

Unnamed: 0,id,title,keywords,review_summary,tags,embeddings
0,283995,Guardians of the Galaxy Vol. 2,"['demi god', 'alien creature', 'sarcasm', 'cra...",Guardians of the Galaxy Vol. 2 elicited a gene...,adventure action sci-fi chris pratt zoe saldañ...,"[-0.010018928121777062, -0.042597577593544884,..."
1,480530,Creed II,"['baby', 'training montage', 'sequel', 'boxing...",Creed II elicits a mixed response from audienc...,drama michael b. jordan sylvester stallone tes...,"[-0.012480250747134571, -0.02905849380429892, ..."
2,299536,Avengers: Infinity War,"['superhero', 'ensemble cast', 'marvel cinemat...",Avengers: Infinity War elicits a generally pos...,adventure action sci-fi robert downey jr. chri...,"[-0.01140899767743463, -0.027857139652446076, ..."
3,299534,Avengers: Endgame,"['time travel', 'superhero', 'super villain', ...",Avengers: Endgame elicited a largely positive ...,adventure sci-fi action robert downey jr. chri...,"[-0.00412223552630982, -0.031019326010432745, ..."
4,337167,Fifty Shades Freed,"['sex scene', 'wedding ceremony', 'bondage', '...",The audience reaction to Fifty Shades Freed is...,drama romance dakota johnson jamie dornan eric...,"[-0.0075680177194708525, -0.024964091224630152..."


In [5]:
database_query_based.head()

Unnamed: 0,id,IMDB_ID,title,release_year,genres,vote_average,cast,Director,keywords,reviews,review_sentiment,review_summary,poster_path,backdrop_path
0,283995,tt3896198,Guardians of the Galaxy Vol. 2,2017,"['Adventure', 'Action', 'Sci-Fi']",7.623,"['Chris Pratt', 'Zoe Saldaña', 'Dave Bautista'...",James Gunn,"['demi god', 'alien creature', 'sarcasm', 'cra...","[""Despite being a huge comic book nerd I was n...","['positive', 'positive', 'positive', 'positive...",Guardians of the Galaxy Vol. 2 elicited a gene...,/y4MBh0EjBlMuOzv9axM4qJlmhzz.jpg,/aJn9XeesqsrSLKcHfHP4u5985hn.jpg
1,480530,tt6343314,Creed II,2018,['Drama'],6.99,"['Michael B. Jordan', 'Sylvester Stallone', 'T...",Steven Caple Jr.,"['baby', 'training montage', 'sequel', 'boxing...","[""This movie is not as good as the first Creed...","['negative', 'positive', 'positive', 'positive...",Creed II elicits a mixed response from audienc...,/v3QyboWRoA4O9RbcsqH8tJMe8EB.jpg,/xTYGN1b3XkOtODryXTKgdXLtPMz.jpg
2,299536,tt4154756,Avengers: Infinity War,2018,"['Adventure', 'Action', 'Sci-Fi']",8.26,"['Robert Downey Jr.', 'Chris Hemsworth', 'Mark...",Anthony RussoJoe Russo,"['superhero', 'ensemble cast', 'marvel cinemat...","[""Avengers infinity war is an emotional roller...","['positive', 'positive', 'positive', 'positive...",Avengers: Infinity War elicits a generally pos...,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg
3,299534,tt4154796,Avengers: Endgame,2019,"['Adventure', 'Sci-Fi', 'Action']",8.268,"['Robert Downey Jr.', 'Chris Evans', 'Mark Ruf...",Anthony RussoJoe Russo,"['time travel', 'superhero', 'super villain', ...","[""But its a pretty good film. A bit of a mess ...","['positive', 'positive', 'positive', 'positive...",Avengers: Endgame elicited a largely positive ...,/or06FN3Dka5tukK1e9sl16pB3iy.jpg,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg
4,337167,tt4477536,Fifty Shades Freed,2018,"['Drama', 'Romance']",6.699,"['Dakota Johnson', 'Jamie Dornan', 'Eric Johns...",James Foley,"['sex scene', 'wedding ceremony', 'bondage', '...","[""The first of the three that is actually emot...","['positive', 'negative', 'negative', 'negative...",The audience reaction to Fifty Shades Freed is...,/9ZedQHPQVveaIYmDSTazhT3y273.jpg,/9ywA15OAiwjSTvg3cBs9B7kOCBF.jpg


In [62]:
database_query_based.tail()

Unnamed: 0,id,IMDB_ID,title,release_year,genres,vote_average,cast,Director,keywords,reviews,review_sentiment,review_summary,synopsis,poster_path,backdrop_path
909,484889,tt6076226,Rise of the Footsoldier 3,2017,"['Action', 'Crime', 'Drama', 'Thriller']",6.3,"['Craig Fairbrass', 'Terry Stone', 'Roland Man...",Zackary Adler,[],"[""There's a glut of these sort of films but th...","['positive', 'positive', 'negative', 'positive...",Audience reaction to Rise of the Footsoldier 3...,"Notorious gangster, Pat Tate, rises through th...",/1LgOIQNpTfWKY3wizRrQ6CvNAXK.jpg,/vC0x4816uI8yHSLaCzjfdTTSDK1.jpg
910,592230,tt7394674,Blood Quantum,2019,['Horror'],5.8,"['Michael Greyeyes', 'Elle', 'Máijá Tailfeathe...",Jeff Barnaby,[],"[""With the lack of a trailer or really any inf...","['positive', 'positive', 'negative', 'positive...","Audience reaction to Blood Quantum is mixed, w...",The dead are coming back to life outside the i...,/pQnfrys3nyOpUxktxK2CBnm7Rv8.jpg,/xg1adjc8iEsQ4znJNjUQSvuiPjr.jpg
911,574638,tt9577852,Rolling Thunder Revue: A Bob Dylan Story by Ma...,2019,"['Documentary', 'Music']",7.1,"['Bob Dylan', 'Allen Ginsberg', 'Patti Smith',...",Martin Scorsese,[],"[""My ex and I saw Bob Dylan perform in 1984, a...","['positive', 'positive', 'positive', 'positive...",Audience reaction to *Rolling Thunder Revue: A...,"In an alchemic mix of fact and fantasy, Martin...",/ixxELBgYj9OH8hz0XCrcZOJpIx9.jpg,/4MYtYsSGzQUUYQqHGWBNeEkT91s.jpg
912,299782,tt0069049,The Other Side of the Wind,2018,['Drama'],6.7,"['John Huston', 'Oja Kodar', 'Peter Bogdanovic...",Orson Welles,"['film business', 'nudity', 'female nudity', '...","[""Years ago I saw a documentary that included ...","['positive', 'positive', 'positive', 'positive...",The Other Side of the Wind elicits a mixed res...,At a media-swamped party to celebrate his 70th...,/kFky1paYEfHxfCYByEc9g7gn6Zk.jpg,/wXUcSJG6dqNKgIRgqYqX98UA1wz.jpg
913,431093,tt5537228,Everybody Loves Somebody,2017,"['Romance', 'Comedy']",6.6,"['Karla Souza', 'José María Yázpik', ""Ben O'To...",Catalina Aguilar Mastretta,[],['This is definitely a light comedy worth reco...,"['positive', 'positive', 'negative', 'positive...","""Everybody Loves Somebody"" elicits a mixed res...","A single, successful career woman asks her co-...",/bZKpuVXmYu6gaprjfkOdvlvY5Z9.jpg,/aAeUP99GEhJr2KIHzfA9chqqXUD.jpg


# Fetching Synopsis from the IMDB Website
Here, our aim is to fetch the synopsis from the IMDB website for each movie. This will help us in getting the text data for each movie which can be used for further analysis and movie discussion. Now, we have built a web scraper using BeautifulSoup and requests library to fetch the synopsis from the IMDB website, in such a way that we get all the synopsis and summaries as a list, out of which we select with the longest length. In this way, we can get the most detailed synopsis for each movie if it exists, else we can get the longest summary of the movie.

In [11]:
def load_synopsis(imdb_id):
    try:
        driver = webdriver.Safari()
        url = 'https://www.imdb.com/title/{}/plotsummary/?ref_=tt_stry_pl#synopsis'.format(imdb_id)
        driver.get(url)

        wait = WebDriverWait(driver, 10)

        for _ in range(2):
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body')))
            driver.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.PAGE_DOWN)
            time.sleep(2)
            

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_section_sections = soup.findAll('section', class_ = "ipc-page-section ipc-page-section--base")
        if page_section_sections is None:
            driver.quit()
            return ""
        
        texts = []
        for section in page_section_sections:
            for div in section:
                texts.append(div.text)
                
        if texts is None:
            driver.quit()
            return ""
                
        synopsis = ""
        for text in texts:
            if len(text) > len(synopsis):
                synopsis = text
        
        driver.quit()
        return synopsis

    except (TimeoutException, NoSuchElementException) as e:
        print(f"Error scraping synopsis: {e}")
        return ""

Now, we would also need a function to clean the text data, which will remove all the special characters, numbers, and extra spaces from the text data. This will help us in getting the clean text data, as most of the data is in html format and contains special characters and numbers which are not required for our analysis.

In [13]:
def remove_html_tag(text):
  # We will create a pattern of html tags (means any content which is encapsulated between HTML tags), and then replace/substitute it with an single space string across the text
  pattern = re.compile('<.*?>')
  return pattern.sub(" ",text)

In [14]:
database_query_based['synopsis'] = database_query_based['IMDB_ID'].apply(load_synopsis)

In [17]:
database_query_based["synopsis"] = database_query_based["synopsis"].apply(remove_html_tag)

Now, we can check if any synopsis is missing for any movie, and if it is missing, we can handle those rows.

In [36]:
blankRows = 0
for i in database_query_based["synopsis"]:
    if(i == ""):
        blankRows += 1
        
blankRows

# Now, let us find the average length of the synopsis
database_query_based['synopsis_length'] = database_query_based['synopsis'].apply(lambda x: len(x.split()))

In [37]:
print(f"Average length of synopsis: {database_query_based['synopsis_length'].mean()}")

Average length of synopsis: 714.3238512035011


As we can see, we have fetched the synopsis for each movie and cleaned the text data, and on an average, we have around 714 words which is a decent start to work with the text data. We can set the columns in the correct order.

In [39]:
database_query_based = database_query_based.drop(columns = ["synopsis_length"])

In [30]:
database_query_based = database_query_based.reindex(['id','IMDB_ID', 'title', 'release_year', 'genres', 'vote_average','cast','Director', 'keywords', 'reviews', 'review_sentiment', 'review_summary', 'synopsis','poster_path', 'backdrop_path'], axis=1)

# Finding the Youtube Trailer Link
Here, our aim is to fetch the youtube trailers from the TMDB website for each movie. This will help us in getting the video data for each movie which can be used for further analysis and movie discussion, and can be given to the Gemini model to understand the video trailers and aid in movie discussion. This is done simply done via the TMDB API.

In [67]:
tmdb = TMDb()
tmdb.api_key = TMDB_API_KEY

In [68]:
def get_youtube_link(movie_id):
  response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}&append_to_response=videos'.format(movie_id,tmdb.api_key))
  
  if response.status_code == 429:
      print("Rate limit exceeded. Waiting...")
      time.sleep(2)
      return get_youtube_link(movie_id)  # Recursively call the function after waiting
  elif response.status_code != 200:
      print(f"Error for ID {movie_id}. Status Code: {response.status_code}")
      return ""
      
  data_json = response.json()
  trailers_keys = []
  if data_json['videos']['results']:
    for video in data_json['videos']['results']:
        if video['type'] == "Trailer":
            trailers_keys.append(video['key'])
    
  time.sleep(0.015)
  
  trailers = []
  for key in trailers_keys:
    trailers.append(f"https://www.youtube.com/watch?v={key}")
  
  return trailers

In [69]:
database_query_based['youtube_trailers'] = database_query_based['id'].apply(get_youtube_link)

Let us now preprocess the dataframe to make it ready for the database.

In [72]:
database_query_based = database_query_based.reindex(['id','IMDB_ID', 'title', 'release_year', 'genres', 'vote_average','cast','Director', 'keywords', 'reviews', 'review_sentiment', 'review_summary', 'synopsis','youtube_trailers','poster_path', 'backdrop_path'], axis=1)

In [79]:
database_query_based['youtube_trailers'] = database_query_based['youtube_trailers'].apply(lambda x: str(x))

We can now go ahead and modify the database with the added columns.

In [80]:
database = "/Users/dhruv/Desktop/Machine_Learning/Projects/Chitra_Movie_Bot/SQL_Database/Movies.db"
database_query_based.to_sql("Movies_Database", sqlite3.connect(database), if_exists = "replace", index = False)

914

In [86]:
database_query_based["youtube_trailers"].iloc[0]

"['https://www.youtube.com/watch?v=wUn05hdkhjM']"

In [89]:
vertexai.init(project="movierecommender-427615", location="asia-south1")

model = genai.GenerativeModel('gemini-1.5-flash-latest')

prompt = """
Analyse this movie trailer and prepare a summary of it, and what the user can expect from the movie.
"""

video_file_uri = "../Data/Guardians of the Galaxy Vol. 2 - Trailer 3 (Official).mp4"
video_file = Part.from_uri(video_file_uri, mime_type="video/mp4")

contents = [video_file, prompt]

response = model.generate_content(contents)
print(response.text)

TypeError: Could not create `Blob`, expected `Blob`, `dict` or an `Image` type(`PIL.Image.Image` or `IPython.display.Image`).
Got a: <class 'vertexai.generative_models._generative_models.Part'>
Value: file_data {
  mime_type: "video/mp4"
  file_uri: "../Data/Guardians of the Galaxy Vol. 2 - Trailer 3 (Official).mp4"
}
