In [14]:
#---------------------------------------------------
# Movie title scraping (web scraping module) from Rotten Tomatoes
#---------------------------------------------------

import requests
from bs4 import BeautifulSoup        # importing BeautifulSoup Python library for parsing HTML and XML documents


def fetch_movies_from_rotten_tomatoes():
    """Scrape movie titles from the Rotten Tomatoes "in theaters" browse page.

    Titles are read from the ``mediatitle`` attribute of every
    ``<watchlist-button>`` tag found in the page.

    Returns:
        list[str]: At most 30 titles, in page order. An empty list is returned
        when the request fails or the server answers with a non-200 status, so
        callers can always treat the result as a list.
    """
    print("🍅 Fetching Latest Movie Titles from Rotten Tomatoes...\n")

    url = "https://www.rottentomatoes.com/browse/movies_in_theaters"
    headers = {"User-Agent": "Mozilla/5.0"}  # To mimic a real browser

    try:
        # A timeout keeps the notebook cell from hanging on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"❌ Error while scraping Rotten Tomatoes: {e}")
        return []

    if response.status_code != 200:
        # Previously this path fell through and returned None.
        print(f"❌ Error while scraping Rotten Tomatoes: HTTP status {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Skip tags without a usable 'mediatitle' instead of raising KeyError.
    titles = [
        tag.attrs["mediatitle"]
        for tag in soup.find_all("watchlist-button")
        if tag.attrs.get("mediatitle")
    ]

    print(f"✅ Fetched total {len(titles)} movies!")

    return titles[:30]  # Limit the result to the first 30 titles on the page

#Testing module 
# titles=fetch_movies_from_rotten_tomatoes()


🍅 Fetching Latest Movie Titles from Rotten Tomatoes...

✅ Fetched total 28 movies!


In [9]:
#---------------------------------------------------
# Movie metadata extraction module (OMDb API integration)
#---------------------------------------------------


def extract_movie_data(titles, api_key=None):
    """Look up each title in the OMDb API and collect the metadata found.

    Args:
        titles: Iterable of movie title strings (must support ``len``).
        api_key: Optional OMDb API key. When omitted, the ``OMDB_API_KEY``
            environment variable is used, then the notebook's legacy key.

    Returns:
        list[dict]: The decoded OMDb JSON payload for every title whose
        response has ``"Response" == "True"``. Titles that are not found, fail
        at the network level, or return invalid JSON are reported and skipped.
    """
    import os  # local import: the notebook's import cells do not bring in os

    # SECURITY NOTE(review): a hard-coded key is kept only as a last-resort
    # fallback so existing runs keep working; prefer OMDB_API_KEY or api_key.
    api_key = api_key or os.environ.get("OMDB_API_KEY") or '683d504a'

    # HTTPS keeps the key out of clear-text traffic.
    url = 'https://www.omdbapi.com/'

    movies_data = []
    for title in titles:
        try:
            # params= URL-encodes the title; '&', '#', '+' or ':' in a title
            # previously corrupted the hand-built query string.
            response = requests.get(url, params={"t": title, "apikey": api_key}, timeout=10)
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers invalid JSON bodies on older requests versions.
            print(f"❌ Request Error for {title}: {e}")
            continue

        if isinstance(data, dict) and data.get("Response") == "True":
            movies_data.append(data)
        else:
            print(f"❌ Metadata not found for Movie: '{title}' ")

    print(f"⚠️ Out of total {len(titles)} movies,  metadata not found for {len(titles)-len(movies_data)} movies!\n")

    return movies_data

# Testing module individually

#movies_data=extract_movie_data(titles)


❌ Metadata not found for Movie: 'Open Your Eyes Jeffrey' 
❌ Metadata not found for Movie: 'UFC 313: Pereira vs. Ankalaev' 
⚠️ Out of total 28 movies,  metadata not found for 2 movies!



In [10]:
#-----------------------------
# Movie data transformation module 
#-----------------------------

from datetime import datetime 
import re

def _is_missing(value):
    """Return True for None, blank strings and the 'N/A' placeholder."""
    return value is None or str(value).strip() in ('', 'N/A')


def _clean_title(raw_title):
    """Task 1: title-case the title, drop special characters, trim whitespace."""
    if _is_missing(raw_title):
        return 'N/A'
    # str.title() would turn "Don't" into "Don'T"; capitalise whole words
    # (optionally containing one apostrophe) instead.
    cased = re.sub(r"[^\W\d_]+(?:'[^\W\d_]+)?",
                   lambda m: m.group(0).capitalize(),
                   str(raw_title))
    # Keep letters (including accented ones), digits, whitespace, apostrophes.
    cleaned = re.sub(r"[^\w\s']|_", '', cased)
    # Collapse whitespace left behind by removed characters and trim the ends.
    return ' '.join(cleaned.split()) or 'N/A'


def _format_release_date(raw_date):
    """Task 2: convert 'DD Mon YYYY' into 'YYYY-MM-DD'; 'Unknown' if unusable."""
    if _is_missing(raw_date):
        return 'Unknown'
    hyphenated = '-'.join(str(raw_date).split())  # DD MMM YYYY -> DD-MMM-YYYY
    try:
        return datetime.strptime(hyphenated, "%d-%b-%Y").strftime("%Y-%m-%d")
    except ValueError:
        return 'Unknown'


def _split_genres(raw_genres):
    """Task 3: lower-case, split on commas and de-duplicate, keeping order."""
    if _is_missing(raw_genres):
        return ['unknown']
    genres = []
    for part in str(raw_genres).split(','):
        name = part.strip().lower()
        if not name:
            continue
        if name == 'n/a':
            name = 'unknown'
        if name not in genres:
            genres.append(name)
    return genres or ['unknown']


def _parse_imdb_rating(raw_rating):
    """Task 4: rating as a float rounded to one decimal, or 'Unknown'."""
    if _is_missing(raw_rating):
        return 'Unknown'
    try:
        return round(float(raw_rating), 1)
    except (TypeError, ValueError):
        return 'Unknown'


def _top_actors(raw_actors, limit=3):
    """Task 5: first `limit` listed actors, trimmed, then sorted for display."""
    if _is_missing(raw_actors):
        return 'Unknown'
    names = [name.strip() for name in str(raw_actors).split(',') if name.strip()]
    # Slice BEFORE sorting so the first-listed actors are the ones kept.
    return ", ".join(sorted(names[:limit])) or 'Unknown'


def _parse_box_office(raw_box_office):
    """Task 6: strip every non-digit character; 0 when nothing numeric remains."""
    digits = re.sub(r'[^\d]', '', str(raw_box_office))
    return int(digits) if digits else 0


def _total_awards(raw_awards):
    """Task 7: sum every number found in the awards text; 0 when missing.

    NOTE(review): this adds up all numbers in the string, so nomination counts
    are included if present - confirm whether only wins should be counted.
    """
    if _is_missing(raw_awards):
        return 0
    return sum(map(int, re.findall(r"\d+", str(raw_awards))))


def _normalize_metascore(raw_metascore):
    """Task 8: metascore on a 0-1 scale, or None when missing or invalid."""
    if _is_missing(raw_metascore):
        return None
    try:
        return int(raw_metascore) / 100
    except (TypeError, ValueError):
        return None


def _clean_language(raw_language):
    """Task 9: lower-cased language text, or 'Unknown' when missing."""
    if _is_missing(raw_language):
        return 'Unknown'
    return str(raw_language).strip().lower()


def _clean_production(raw_production):
    """Task 10: production name without special characters, or 'Independent'."""
    if _is_missing(raw_production):
        return 'Independent'
    return re.sub(r'[^a-z.A-Z0-9\s]', '', str(raw_production))


def transformation(movies_data):
    """Clean and standardise raw movie metadata records.

    Args:
        movies_data: Iterable of dicts with the keys read below ('Title',
            'Released', 'Genre', 'imdbRating', 'Actors', 'BoxOffice',
            'Awards', 'Metascore', 'Language', 'Production'). Missing keys
            are treated the same as 'N/A'.

    Returns:
        list[dict]: One dict per successfully processed record with the keys
        'Title', 'Released Date', 'Genres', 'IMDb Rating', 'Top 3 Actors',
        'Box Office', 'Awards', 'Metascore', 'Language' and 'Production'.
        Records that still raise are reported and skipped.
    """
    transformedMovies = []

    print(f"🔄 Data Transformation started...")

    for movie in movies_data:
        title = None  # reset so the error message never shows a stale title
        try:
            title = movie.get('Title')
            transformedMovies.append({"Title": _clean_title(title),
                                      "Released Date": _format_release_date(movie.get('Released')),
                                      "Genres": _split_genres(movie.get('Genre')),
                                      "IMDb Rating": _parse_imdb_rating(movie.get('imdbRating')),
                                      "Top 3 Actors": _top_actors(movie.get('Actors')),
                                      "Box Office": _parse_box_office(movie.get('BoxOffice')),
                                      "Awards": _total_awards(movie.get('Awards')),
                                      "Metascore": _normalize_metascore(movie.get('Metascore')),
                                      "Language": _clean_language(movie.get('Language')),
                                      "Production": _clean_production(movie.get('Production'))})
        except Exception as e:
            # Identify which record failed without aborting the whole batch.
            print(f'⚠️** Data exception for record:{title}, exception: {e}')
    print(f"✨ Transformation Complete for {len(transformedMovies)} movies!\n")
    return transformedMovies

# Testing module 
#transformed_data=transformation(movies_data)

🔄 Data Transformation started...
✨ Transformation Complete for 26 movies!



In [11]:
# ------------
# Loading to CSV file module
#------------
def load_to_csv(transformed_data):
    """Write the transformed movie records to a timestamped CSV file.

    The file is created in the current working directory and named
    ``movies_<YYYYmmdd_HHMMSS>.csv``. Column headers are the keys of the
    first record.

    Args:
        transformed_data: List of dicts sharing the same keys.

    Returns:
        str | None: The name of the file written, or None when
        ``transformed_data`` is empty (no file is created in that case).
    """
    import csv  # local import: the notebook only imports csv in a later cell

    print("💾 Saving Data to CSV... 📊")

    if not transformed_data:
        # Previously this raised IndexError after creating an empty file.
        print("⚠️ No transformed data to save; CSV file not created.")
        return None

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    file_name = f"movies_{timestamp}.csv"  # CSV file the data is written to

    fieldnames = list(transformed_data[0].keys())  # keys become the CSV headers

    with open(file_name, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(transformed_data)

    print(f"✅ Data successfully saved to {file_name}!\n")
    return file_name

    
# Testing module individually
#load_to_csv(transformed_data)


💾 Saving Data to CSV... 📊
✅ Data successfully saved to movies_20250309_140317.csv!



In [12]:

# -----------------------
# Executing the automated ETL pipeline
#------------------------

import requests
from bs4 import BeautifulSoup        # importing BeautifulSoup Python library for parsing HTML and XML documents
import csv
from fpdf import FPDF
from datetime import datetime 
import re

if __name__ == "__main__":
    # Run the pipeline end to end: scrape -> extract -> transform -> load.
    # Each stage runs only when the previous one produced data.

    # Defined up front so later cells that read `transformed_data` do not hit
    # a NameError when scraping or extraction yields nothing.
    transformed_data = []

    titles = fetch_movies_from_rotten_tomatoes()     # (Web scraping module) - movie titles scraped from Rotten Tomatoes

    if titles:
        movies = extract_movie_data(titles)          # (Data Extraction module) - OMDb metadata for every scraped title

        if movies:
            transformed_data = transformation(movies) # (Data Transformation module) - clean/standardise each movie record

            if transformed_data:
                load_to_csv(transformed_data)         # (Data Loading module) - cleansed movie data saved to CSV for analysis
            else:
                # Every record failed transformation; there is nothing to write.
                print("⚠️ No movie records survived transformation; CSV not written.")

        else:
            print("⚠️ No movie data fetched from API.")
    else:
        print("⚠️ No movie titles found during Rotten tomatoes web scraping.")
        

🍅 Fetching Latest Movie Titles from Rotten Tomatoes...

✅ Fetched total 28 movies!
❌ Metadata not found for Movie: 'Open Your Eyes Jeffrey' 
❌ Metadata not found for Movie: 'UFC 313: Pereira vs. Ankalaev' 
⚠️ Out of total 28 movies,  metadata not found for 2 movies!

🔄 Data Transformation started...
✨ Transformation Complete for 26 movies!

💾 Saving Data to CSV... 📊
✅ Data successfully saved to movies_20250309_140359.csv!



In [13]:
#----------------
# PDF report generation
#----------------
def _latin1_safe(text):
    """Return `text` as a string with non-Latin-1 characters replaced by '?'."""
    return str(text).encode("latin-1", "replace").decode("latin-1")


def load_to_pdf(transformed_data):
    """Render the transformed movie records into a timestamped PDF report.

    The report is written to the current working directory as
    ``movies_<YYYYmmdd_HHMMSS>.pdf``. Each movie gets a bold title line, one
    ``key: value`` line per remaining field and a horizontal separator.

    Args:
        transformed_data: List of dicts, one per movie.

    Returns:
        str: The name of the PDF file written.
    """
    print("📝 Generating PDF Report... 📄")
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"movies_{timestamp}.pdf"

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Report heading
    pdf.cell(200, 10, "Movie Insights Report", ln=True, align="C")
    pdf.ln(5)

    # The built-in core fonts used here are limited to Latin-1 in FPDF, so any
    # text taken from the data is sanitised first; otherwise a single
    # non-Latin-1 character (e.g. in an actor's name) aborts the whole report.
    for movie in transformed_data:
        pdf.set_font("Arial", style='B', size=12)  # Bold for the movie title
        pdf.cell(0, 8, _latin1_safe(f" {movie.get('Title', 'N/A')}"), ln=True)
        pdf.set_font("Arial", size=11)

        for key, value in movie.items():
            if key != "Title":  # Skip repeating the title
                pdf.multi_cell(0, 8, _latin1_safe(f"{key}: {value}"))

        # Horizontal separator between movies
        pdf.set_draw_color(0, 0, 0)  # Black
        pdf.set_line_width(0.5)
        pdf.line(10, pdf.get_y(), 200, pdf.get_y())
        pdf.ln(5)  # Space after the line

    pdf.output(filename)
    print(f"✅ PDF report saved as {filename}!\n")
    return filename
load_to_pdf(transformed_data)

📝 Generating PDF Report... 📄
✅ PDF report saved as movies_20250309_140407.pdf!

