In [7]:
# --------------------------------------------
# 03_web_scraping.ipynb
# Purpose: Enrich Netflix dataset with IMDb ratings from OMDb API
# --------------------------------------------
# üîπ Import required libraries
!pip install python-dotenv
import os
from dotenv import load_dotenv
import pandas as pd
import requests
from time import sleep
from tqdm import tqdm

# üîπ Load API key from .env file
# Make sure you created a .env file in your main project folder:
# OMDB_API_KEY=your_real_api_key_here
load_dotenv()
API_KEY = os.getenv("OMDB_API_KEY")

if not API_KEY:
    raise ValueError("‚ùå OMDB_API_KEY not found. Please add it to your .env file.")

print("‚úÖ API Key loaded successfully.")

# üîπ Read the cleaned dataset
df = pd.read_csv('../data/netflix_cleaned.csv')
print("Dataset loaded:", df.shape, "records")

# üîπ Define function to get IMDb rating using OMDb API
def get_imdb_rating(title):
    """Fetch IMDb rating for a given title using OMDb API"""
    try:
        url = f"http://www.omdbapi.com/?t={title}&apikey={API_KEY}"
        response = requests.get(url, timeout=10)
        data = response.json()
        if data.get("Response") == "True":
            return data.get("imdbRating", None)
        else:
            return None
    except Exception as e:
        print(f"Error for title '{title}': {e}")
        return None

# üîπ Apply to dataset (with progress bar)
tqdm.pandas(desc="Fetching IMDb Ratings")

# ‚ö†Ô∏è To avoid free-plan limits (1000/day), use .head(200) for a smaller test first.
# For full dataset, remove .head(200)
df_subset = df.head(200)  # <- change or remove this line for full data

df_subset['imdb_rating'] = df_subset['title'].progress_apply(get_imdb_rating)

# üîπ Save the enriched dataset
output_path = '../data/netflix_enriched.csv'
df_subset.to_csv(output_path, index=False)
print(f"‚úÖ Enriched dataset saved successfully at: {output_path}")
print("Preview of enriched data:")
df_subset.head()



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\dell\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


‚úÖ API Key loaded successfully.
Dataset loaded: (8807, 15) records


Fetching IMDb Ratings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [01:16<00:00,  2.62it/s]

‚úÖ Enriched dataset saved successfully at: ../data/netflix_enriched.csv
Preview of enriched data:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['imdb_rating'] = df_subset['title'].progress_apply(get_imdb_rating)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,duration_num,imdb_rating
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021.0,9.0,90.0,7.4
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021.0,9.0,2.0,7.6
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021.0,9.0,1.0,7.2
3,s4,TV Show,Jailbirds New Orleans,,,Unknown,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021.0,9.0,1.0,6.5
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021.0,9.0,2.0,9.0
