In [1]:
import os
import re # Added for extracting RAM and 5G info
import time
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# --- CONFIGURATION ---
# Scrape.do API token. It is read from the SCRAPE_DO_TOKEN environment variable
# so the real key never has to be saved inside the notebook; the "api_token"
# placeholder remains the fallback for anyone who prefers to paste a key here.
TOKEN = os.environ.get("SCRAPE_DO_TOKEN", "api_token")
# Number of search-result pages to request (pages 1..PAGES_TO_SCRAPE).
PAGES_TO_SCRAPE = 10

In [3]:
# --- STEP 1: SCRAPING ---
# Request every search-results page through the Scrape.do API and keep the raw
# HTML of each successful (HTTP 200) response for the parsing step.
SCRAPE_DO_ENDPOINT = "https://api.scrape.do/"  # HTTPS so the token is not sent in clear text
REQUEST_TIMEOUT_SECONDS = 60  # without a timeout, requests.get can block indefinitely
DELAY_BETWEEN_PAGES_SECONDS = 1  # brief pause between requests to be polite/avoid errors

print("--- Starting Scraping Process ---")
all_html_pages = []

for page_num in range(1, PAGES_TO_SCRAPE + 1):
    print(f"Fetching page {page_num}...")

    # Target URL (Flipkart search for smartphones)
    target_url = f"https://www.add_page_url=smartphones&page={page_num}"

    try:
        # Passing token and target through `params` lets requests percent-encode
        # them, so the target URL's own "&" and "=" stay inside the `url` value.
        response = requests.get(
            SCRAPE_DO_ENDPOINT,
            params={"token": TOKEN, "url": target_url},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as e:
        # Network-level failure (DNS, connection, timeout, ...): report and move on.
        print(f"Error fetching page {page_num}: {e}")
    else:
        if response.status_code == 200:
            all_html_pages.append(response.text)
        else:
            print(f"Failed to fetch page {page_num}. Status Code: {response.status_code}")

    # No need to wait after the final page.
    if page_num < PAGES_TO_SCRAPE:
        time.sleep(DELAY_BETWEEN_PAGES_SECONDS)

print(f"\nSuccessfully collected {len(all_html_pages)} HTML pages.")

--- Starting Scraping Process ---
Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...

Successfully collected 10 HTML pages.


In [4]:
# --- STEP 2: PARSING ---
def _extract_product(card):
    """Build the record for one product card, or return None if it has no name or price."""
    name_el = card.find('div', {'class': 'KzDlHZ'})
    price_el = card.find('div', {'class': 'Nx9bqj _4b5DiR'})
    if name_el is None or price_el is None:
        return None

    rating_el = card.find('div', {'class': 'XQDdHH'})
    return {
        "Product Name": name_el.text.strip(),
        "Original Price": price_el.text.strip(),
        # Cards without a rating element fall back to the string "0".
        "Rating": rating_el.text.strip() if rating_el is not None else "0",
    }


def parse_html_pages(html_list):
    """Extract product records from a collection of HTML page sources.

    Each page is parsed with BeautifulSoup's 'html.parser'; every
    <div class="cPHDOP"> element is treated as a product card.

    Args:
        html_list: iterable of HTML documents as strings.

    Returns:
        A list of dicts with the keys "Product Name", "Original Price" and
        "Rating". All values are the stripped text of the matching elements
        ("Rating" is "0" when the card has no rating element). Cards lacking
        a name or a price are left out.

    A card that raises while being processed is skipped, and the error is
    printed instead of being silently discarded.
    """
    all_products = []

    for html_content in html_list:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Flipkart product card class
        for card in soup.find_all('div', {'class': 'cPHDOP'}):
            try:
                product = _extract_product(card)
            except Exception as e:
                # Stay best-effort (one bad card must not abort the run), but
                # make the failure visible.
                print(f"Skipping unparsable product card: {e!r}")
                continue

            if product is not None:
                all_products.append(product)

    return all_products

print("--- Parsing Data ---")
raw_data = parse_html_pages(all_html_pages)
print(f"Extracted {len(raw_data)} products.")
if not raw_data:
    # An empty result usually means no pages were fetched or the hard-coded CSS
    # class names no longer match the site's markup; flag it so the later
    # cells do not quietly write an empty file.
    print("Warning: no products were extracted. Check the fetched pages and the CSS class names used by the parser.")


--- Parsing Data ---
Extracted 240 products.


In [5]:
# Sanity check: show the first three parsed records. Left as the cell's last
# expression so the notebook renders it instead of printing one long line.
raw_data[:3]

[{'Product Name': 'POCO C71 (Cool Blue, 128 GB)', 'Original Price': '₹6,799', 'Rating': '4.1'}, {'Product Name': 'Ai+ Pulse (Blue, 64 GB)', 'Original Price': '₹5,999', 'Rating': '4.3'}, {'Product Name': 'realme P3 Lite 5G Charger in the Box (Midnight Lily, 128 GB)', 'Original Price': '₹9,749', 'Rating': '4.4'}]


In [7]:
# Load the parsed records into a DataFrame (one row per product dict)
df = pd.DataFrame(raw_data)

# Write the table to disk as CSV.
# index=False leaves out the pandas row-index column (0, 1, 2...)
csv_path = 'my_products.csv'
df.to_csv(csv_path, index=False)

print(f"Saved to {csv_path} using Pandas!")

Saved to my_products.csv using Pandas!
