In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from urllib.parse import quote_plus

In [2]:
def get_product_name(soup):
    try:
        title = soup.find('span', class_='VU-ZEz')
        title_value = title.text.strip()
    except AttributeError:
        title_value = ''
    
    return title_value

In [3]:
def get_price(soup):
    try:
        price = soup.find('div', class_='Nx9bqj CxhGGd')
        price_value = price.text.strip()
    except AttributeError:
        price_value = ''
    
    return price_value

In [4]:
def get_prev_price(soup):
    try:
        prev_price = soup.find('div', class_='yRaY8j A6+E6v')
        prev_price_value = prev_price.text.strip()
    except AttributeError:
        prev_price_value = ''
    
    return prev_price_value

In [5]:
def get_discnt(soup):
    try:
        discount = soup.find('div', class_='UkUFwK WW8yVX')
        discount_value = discount.text.strip()
    except AttributeError:
        discount_value = ''
    
    return discount_value

In [6]:
def get_stars(soup):
    try:
        rating = soup.find('div', class_='XQDdHH')
        rating_value = rating.text.strip()
    except AttributeError:
        rating_value = ''
    
    return rating_value

In [7]:
def get_specs(soup):
    try:
        rating = soup.find('div', class_='xFVion')
        rating_value = rating.text.strip()
    except AttributeError:
        rating_value = ''
    
    return rating_value

In [8]:
def get_delivery(soup):
    try:
        delivery = soup.find('span', class_='hcf08j').text.strip()
    except AttributeError:
        delivery = ''
    
    return delivery

In [9]:
def get_ratings(soup):
    try:
        ratings = soup.find('span', class_='Wphh3N').text.strip()
    except AttributeError:
        ratings = ''
    
    return ratings

In [10]:
if __name__ == '__main__':
    headers = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0', 
        'Accept-Language': 'en-US, en;q=0.5'
    })

    keyword = input('Enter the product keyword(s) to search: ')
    encoded_keyword = quote_plus(keyword)
    base_url = f'https://www.flipkart.com/search?q={encoded_keyword}&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'
    num_pages = int(input("Enter the number of pages to scrape: "))
    
    data = {
        'product_name': [],
        'price': [],
        'prev_price': [],
        'discount': [],
        'stars': [],
        'description': [],
        'delivery': [],
        'ratings': []
    }
    
    for page in range(1, num_pages + 1):
        url = f'{base_url}&page=' + str(page)
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', class_=['CGtC98', 'VJA3rP', 'rPDeLR'])
        # links = soup.find_all('a', class_='rPDeLR') # horizontal / clothing and accessories
        # links = soup.find_all('a', class_='VJA3rP') # horizontal / food, grooming, small and cheap electronics, stationery, grocery
        # links = soup.find_all('a', class_='CGtC98') # vertical / big and costly electronics
        
        links_list = []
        for link in links:
            links_list.append(link.get('href'))
        
        for link in links_list:
            new_response = requests.get('https://www.flipkart.com' + link, headers=headers)
            prod_soup = BeautifulSoup(new_response.content, 'html.parser')
            
            data['product_name'].append(get_product_name(prod_soup))
            data['price'].append(get_price(prod_soup))
            data['prev_price'].append(get_prev_price(prod_soup))
            data['discount'].append(get_discnt(prod_soup))
            data['stars'].append(get_stars(prod_soup))
            data['description'].append(get_specs(prod_soup))
            data['delivery'].append(get_delivery(prod_soup))
            data['ratings'].append(get_ratings(prod_soup))
    
    flipkart_df = pd.DataFrame.from_dict(data)
    # flipkart_df.to_csv('flipkart_data.csv', header=True, index=False)
    print(flipkart_df)

                                      product_name      price prev_price  \
0         Apple iPhone 14 Plus (Starlight, 128 GB)    ₹55,999    ₹79,900   
1                   Apple iPhone 15 (Pink, 128 GB)    ₹70,999    ₹79,900   
2                  Apple iPhone 15 (Black, 128 GB)    ₹70,999    ₹79,900   
3          Apple iPhone 14 Plus (Midnight, 128 GB)    ₹55,999    ₹79,900   
4              Apple iPhone 14 Plus (Blue, 128 GB)    ₹55,999    ₹79,900   
..                                             ...        ...        ...   
67                  Apple iPhone 13 (Blue, 256 GB)    ₹62,999    ₹69,900   
68             Apple iPhone 14 Plus (Blue, 512 GB)    ₹85,999  ₹1,09,900   
69                 Apple iPhone 15 (Black, 512 GB)  ₹1,00,999  ₹1,09,900   
70              Apple iPhone 13 (Midnight, 512 GB)    ₹82,999    ₹89,900   
71  Apple iPhone 15 Pro (Natural Titanium, 128 GB)  ₹1,24,990  ₹1,34,900   

   discount stars                                        description delivery  \
0   29

In [11]:
print(url)

https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=3


In [12]:
flipkart_df

Unnamed: 0,product_name,price,prev_price,discount,stars,description,delivery,ratings
0,"Apple iPhone 14 Plus (Starlight, 128 GB)","₹55,999","₹79,900",29% off,4.6,128 GB ROM17.02 cm (6.7 inch) Super Retina XDR...,Free,"72,722 Ratings & 4,091 Reviews"
1,"Apple iPhone 15 (Pink, 128 GB)","₹70,999","₹79,900",11% off,4.6,128 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,Free,"40,748 Ratings & 2,226 Reviews"
2,"Apple iPhone 15 (Black, 128 GB)","₹70,999","₹79,900",11% off,4.6,128 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,Free,"40,748 Ratings & 2,226 Reviews"
3,"Apple iPhone 14 Plus (Midnight, 128 GB)","₹55,999","₹79,900",29% off,4.6,128 GB ROM17.02 cm (6.7 inch) Super Retina XDR...,Free,"72,722 Ratings & 4,091 Reviews"
4,"Apple iPhone 14 Plus (Blue, 128 GB)","₹55,999","₹79,900",29% off,4.6,128 GB ROM17.02 cm (6.7 inch) Super Retina XDR...,Free,"72,722 Ratings & 4,091 Reviews"
...,...,...,...,...,...,...,...,...
67,"Apple iPhone 13 (Blue, 256 GB)","₹62,999","₹69,900",9% off,4.6,256 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,Free,"2,82,681 Ratings & 13,675 Reviews"
68,"Apple iPhone 14 Plus (Blue, 512 GB)","₹85,999","₹1,09,900",21% off,4.6,512 GB ROM17.02 cm (6.7 inch) Super Retina XDR...,Free,"72,722 Ratings & 4,091 Reviews"
69,"Apple iPhone 15 (Black, 512 GB)","₹1,00,999","₹1,09,900",8% off,4.6,512 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,Free,"40,748 Ratings & 2,226 Reviews"
70,"Apple iPhone 13 (Midnight, 512 GB)","₹82,999","₹89,900",7% off,4.6,512 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,Free,"2,82,681 Ratings & 13,675 Reviews"


In [13]:
flipkart_df.dtypes

product_name    object
price           object
prev_price      object
discount        object
stars           object
description     object
delivery        object
ratings         object
dtype: object

In [14]:
flipkart_df['price'] = flipkart_df['price'].str.replace('₹', '').str.replace(',', '')
flipkart_df['prev_price'] = flipkart_df['prev_price'].str.replace('₹', '').str.replace(',', '')

In [15]:
flipkart_df['price'].replace('', np.nan, inplace=True)
flipkart_df['prev_price'].replace('', np.nan, inplace=True)
flipkart_df['stars'].replace('', np.nan, inplace=True)

flipkart_df['price'] = pd.to_numeric(flipkart_df['price'], errors='coerce')
flipkart_df['prev_price'] = pd.to_numeric(flipkart_df['prev_price'], errors='coerce')
flipkart_df['stars'] = pd.to_numeric(flipkart_df['stars'], errors='coerce')

# flipkart_df['price']= flipkart_df['price'].astype('float')
# flipkart_df['prev_price']= flipkart_df['prev_price'].astype('float')
# flipkart_df['rating']= flipkart_df['rating'].astype('float')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flipkart_df['price'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flipkart_df['prev_price'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which 

In [16]:
flipkart_df.dtypes

product_name     object
price             int64
prev_price      float64
discount         object
stars           float64
description      object
delivery         object
ratings          object
dtype: object

In [17]:
flipkart_df['brand_name'] = flipkart_df['product_name'].apply(lambda x: x.split()[0])

In [18]:
def clean_discount(discount):
    if isinstance(discount, str) and 'off' in discount:
        return int(discount.split()[0].replace('%', ''))
    return None

flipkart_df['discount_%'] = flipkart_df['discount'].apply(clean_discount)

flipkart_df['discount_%'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flipkart_df['discount_%'].fillna(0, inplace=True)


In [19]:
import re

def extract_ratings_reviews(row):
    ratings_reviews = row['ratings']
    ratings_match = re.search(r'(\d{1,3}(,\d{3})*) Ratings', ratings_reviews)
    reviews_match = re.search(r'(\d{1,3}(,\d{3})*) Reviews', ratings_reviews)
    
    if ratings_match:
        ratings = ratings_match.group(1).replace(',', '')
    else:
        ratings = None
    
    if reviews_match:
        reviews = reviews_match.group(1).replace(',', '')
    else:
        reviews = None
    
    return pd.Series([ratings, reviews])

flipkart_df[['ratings', 'reviews']] = flipkart_df.apply(extract_ratings_reviews, axis=1)

flipkart_df['ratings'] = pd.to_numeric(flipkart_df['ratings'], errors='coerce')
flipkart_df['reviews'] = pd.to_numeric(flipkart_df['reviews'], errors='coerce')

### Analysis

In [33]:
print(f'Showing results for keyword: {keyword}')

total_products = len(flipkart_df)
print(f'Total Products Scraped: {total_products}')

print(f'Total Pages Scraped: {num_pages}')

Showing results for keyword: iphone
Total Products Scraped: 72
Total Pages Scraped: 3


In [20]:
cheapest_product = flipkart_df.loc[flipkart_df['price'].idxmin()]
print(f'Cheapest product: \n{cheapest_product}')

Cheapest product: 
product_name                       Apple iPhone 12 (Black, 64 GB)
price                                                       39999
prev_price                                                49900.0
discount                                                  19% off
stars                                                         4.6
description     64 GB ROM15.49 cm (6.1 inch) Super Retina XDR ...
delivery                                                     Free
ratings                                                   10123.0
brand_name                                                  Apple
discount_%                                                   19.0
reviews                                                   13884.0
Name: 11, dtype: object


In [21]:
costliest_product = flipkart_df.loc[flipkart_df['price'].idxmax()]
print(f'Costliest product: \n{costliest_product}')

Costliest product: 
product_name           Apple iPhone 15 Pro (White Titanium, 1 TB)
price                                                      174990
prev_price                                               184900.0
discount                                                   5% off
stars                                                         4.7
description     1 TB ROM15.49 cm (6.1 inch) Super Retina XDR D...
delivery                                                     Free
ratings                                                     818.0
brand_name                                                  Apple
discount_%                                                    5.0
reviews                                                      64.0
Name: 38, dtype: object


In [22]:
highest_rated_product = flipkart_df.loc[flipkart_df['stars'].idxmax()]
print(f'Highest rated product: \n{highest_rated_product}')

Highest rated product: 
product_name          Apple iPhone 15 Pro (Blue Titanium, 256 GB)
price                                                      134990
prev_price                                               144900.0
discount                                                   6% off
stars                                                         4.7
description     256 GB ROM15.49 cm (6.1 inch) Super Retina XDR...
delivery                                                     Free
ratings                                                     818.0
brand_name                                                  Apple
discount_%                                                    6.0
reviews                                                      64.0
Name: 26, dtype: object


In [23]:
least_discounted_product = flipkart_df.loc[flipkart_df['discount_%'].idxmin()]
print(f'Least discounted product: \n{least_discounted_product}')

Least discounted product: 
product_name    Apple iPhone XR ((PRODUCT)RED, 64 GB) (Include...
price                                                       47900
prev_price                                                    NaN
discount                                                         
stars                                                         4.6
description     64 GB ROM15.49 cm (6.1 inch) Display12MP Rear ...
delivery                                                         
ratings                                                     933.0
brand_name                                                  Apple
discount_%                                                    0.0
reviews                                                    8502.0
Name: 56, dtype: object


In [24]:
highest_discounted_product = flipkart_df.loc[flipkart_df['discount_%'].idxmax()]
print(f'Highest discounted product: \n{highest_discounted_product}')

Highest discounted product: 
product_name             Apple iPhone 14 Plus (Starlight, 128 GB)
price                                                       55999
prev_price                                                79900.0
discount                                                  29% off
stars                                                         4.6
description     128 GB ROM17.02 cm (6.7 inch) Super Retina XDR...
delivery                                                     Free
ratings                                                   72722.0
brand_name                                                  Apple
discount_%                                                   29.0
reviews                                                    4091.0
Name: 0, dtype: object


In [25]:
brand_counts = flipkart_df['brand_name'].value_counts().reset_index()
brand_counts.columns = ['brand_name', 'count']

fig = px.bar(brand_counts, x='brand_name', y='count', title='Count of Products per Brand', labels={'brand_name': 'Brand', 'count': 'Count'}, text='count')

fig.show()

In [26]:
avg_stars_per_brand = flipkart_df.groupby('brand_name')['stars'].mean().reset_index().round(1)
avg_stars_per_brand.columns = ['brand_name', 'avg_stars']

fig = px.line(avg_stars_per_brand, x='brand_name', y='avg_stars', title='Average Stars by Brand', markers=True, labels={'brand_name': 'Brand', 'avg_stars': 'Average Stars'}, text='avg_stars')

fig.show()

In [27]:
avg_review_by_brand = flipkart_df.groupby('brand_name')['reviews'].sum().reset_index().round(0)
avg_review_by_brand.columns = ['brand_name', 'avg_reviews']

fig = px.line(avg_review_by_brand, x='brand_name', y='avg_reviews', title='Total Reviews by Brand', markers=True, labels={'brand_name': 'Brand', 'avg_reviews': 'Average Reviews'}, text='avg_reviews')
fig.update_traces(textposition='top center')

fig.show()

In [28]:
brand_ratings = flipkart_df.groupby('brand_name')['ratings'].sum().reset_index()
brand_ratings.columns = ['brand_name', 'total_ratings']

fig = px.line(brand_ratings, x='brand_name', y='total_ratings', title='Total Ratings by Brand', markers=True, labels={'brand_name': 'Brand', 'total_ratings': 'Total Ratings'}, text='total_ratings')
fig.update_traces(textposition='top center')

fig.show()

In [29]:
top_products = flipkart_df.sort_values(by='stars', ascending=False).head(10)

# Create a table figure
fig_table = go.Figure(data=[go.Table(
    header=dict(values=['Brand', 'Product Name', 'Price', 'Stars'],
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[top_products['brand_name'],
                       top_products['product_name'],
                       top_products['price'], 
                       top_products['stars']],
               fill_color='lavender',
               align='left'))
])

fig_table.update_layout(title='Top Products by Stars')
fig_table.show()

In [30]:
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels = [f'{bins[i]}-{bins[i+1]}%' for i in range(len(bins) - 1)]

flipkart_df['discount_bin'] = pd.cut(flipkart_df['discount_%'], bins=bins, labels=labels, include_lowest=True)

discount_counts = flipkart_df['discount_bin'].value_counts().reset_index()
discount_counts.columns = ['Discount Range', 'Count']

discount_counts = discount_counts.sort_values(by='Discount Range')

fig_bar = px.bar(discount_counts, x='Discount Range', y='Count', 
                 title='Product Count by Discount % Range',
                 labels={'Discount Range': 'Discount % Range', 'Count': 'Product Count'})

fig_bar.show()

In [31]:
flipkart_df.to_csv('flipkart_keyword.csv', header=True, index=False)