## Scraping Product Name and Price from the following product listing page on Flipkart

![image.png](attachment:image.png)

## Importing Libraries 

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import csv
import time

## Creating a Web Scraping Function

In [None]:
def check_price():
    """Scrape one Flipkart smartphone listing page and append it to a CSV.

    Downloads the popularity-sorted smartphone search page, collects the text
    of the product-name and price elements, stamps every scraped row with the
    date and time of this run, and appends the rows to
    ``Flipkart_Mobile_Prices_Single_Page.csv`` (relative to the current
    working directory). The header row is written automatically when the file
    is new or empty.

    If the page cannot be fetched, or no listings are found in it, a message
    is printed and nothing is written, so a caller that polls in a loop can
    simply try again on its next cycle.

    Returns:
        None
    """
    url = 'https://www.flipkart.com/search?q=smartphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&sort=popularity'
    csv_path = 'Flipkart_Mobile_Prices_Single_Page.csv'
    headers = ['Date', 'Time', 'Product', 'Price']

    # A timeout stops an unresponsive server from hanging the run forever;
    # raise_for_status turns HTTP error pages into a handled failure instead
    # of letting them be parsed as if they were listings.
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        print("Skipping this run, could not fetch the listing page:", err)
        return

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(page.text, 'html.parser')

    # Product names and prices live in separate elements and are paired by
    # their position on the page.
    names = [tag.text.strip() for tag in soup.find_all('div', class_="_4rR01T")]
    prices = [tag.text.strip() for tag in soup.find_all('div', class_="_30jeq3 _1_WHN1")]

    if not names and not prices:
        print("No listings found on the page; nothing was saved")
        return

    # Pad the shorter list so an element without a partner still gets a row
    # (with an empty cell) rather than being dropped.
    row_count = max(len(names), len(prices))
    names += [''] * (row_count - len(names))
    prices += [''] * (row_count - len(prices))

    # One clock reading for the whole run keeps Date and Time consistent, and
    # strftime avoids the leading space that slicing str(datetime) leaves in
    # front of the time.
    now = datetime.now()
    date = now.strftime('%Y-%m-%d')
    timestamp = now.strftime('%H:%M:%S.%f')

    data = [
        {'Date': date, 'Time': timestamp, 'Product': product, 'Price': price}
        for product, price in zip(names, prices)
    ]

    # 'a' appends to the existing file; a position of 0 right after opening
    # means the file is new or empty, so the header has to be written first.
    with open(csv_path, 'a', encoding='UTF8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(data)


## Automating the code such that the Flipkart website is scraped every 30 minutes

In [None]:
from datetime import datetime 
import time

# Scrape the single listing page, report when the run finished, then wait
# half an hour before repeating. The loop never exits on its own.
while True:
    check_price()
    finished_at = datetime.now()
    print(finished_at, "run completed")  # date, time and status of the scrape that just ran
    time.sleep(30 * 60)  # pauses the infinite loop for 30 minutes (1800 seconds)

## Checking the saved Dataframe - Flipkart_Mobile_Prices_Single_Page.csv

In [3]:
# Raise the row display limit so the saved history is shown in full instead of
# a truncated preview. The option is addressed by its full name: pandas
# resolves option keys by regex match, and an abbreviated or approximate key
# can become ambiguous when new options are added.
pd.set_option('display.max_rows', 5448)

# NOTE(review): absolute, machine-specific path. check_price() writes
# 'Flipkart_Mobile_Prices_Single_Page.csv' relative to the working directory,
# so confirm both refer to the same file before running this elsewhere.
price_data = pd.read_csv(r"C:\Users\chels\ALex beginner\Python\Flipkart_Mobile_Prices_Single_Page.csv")
price_data

Unnamed: 0,Date,Time,Product,Price
0,25-01-2024,18:26:35.979358,"Apple iPhone 14 (Midnight, 128 GB)","₹57,999"
1,25-01-2024,18:26:35.979358,"Apple iPhone 14 (Starlight, 128 GB)","₹57,999"
2,25-01-2024,18:26:35.979358,"vivo T2x 5G (Aurora Gold, 128 GB)","₹14,999"
3,25-01-2024,18:26:35.979358,"vivo T2x 5G (Glimmer Black, 128 GB)","₹12,999"
4,25-01-2024,18:26:35.979358,"vivo T2x 5G (Marine Blue, 128 GB)","₹12,999"
5,25-01-2024,18:26:35.979358,"vivo T2x 5G (Aurora Gold, 128 GB)","₹12,999"
6,25-01-2024,18:26:35.979358,"vivo T2x 5G (Marine Blue, 128 GB)","₹11,999"
7,25-01-2024,18:26:35.979358,"vivo T2x 5G (Aurora Gold, 128 GB)","₹11,999"
8,25-01-2024,18:26:35.979358,"vivo T2x 5G (Glimmer Black, 128 GB)","₹11,999"
9,25-01-2024,18:26:35.979358,"vivo T2x 5G (Marine Blue, 128 GB)","₹14,999"


##  

#### In the above function, we scraped only a single page of product listings. Next, we create a function that scrapes 4 pages of listings.

## Creating a Web Scraping Function

In [None]:
# Module-level frame kept so that any cell referring to `df` still finds the
# name. check_price_2 builds its rows locally and does not modify it, which
# keeps rows from one page or run from leaking into the next.
df = pd.DataFrame(columns = ['Date','Time','Product', 'Price'] )

# Webscraping function
def check_price_2():
    """Scrape the first pages of Flipkart smartphone listings into a CSV.

    Downloads the popularity-sorted smartphone search page, reads up to four
    page links from its pagination anchors, then fetches each linked page and
    collects the text of its product-name and price elements. The rows of
    each page are stamped with the date and time at which that page was
    scraped and appended to ``Flipkart_Mobile_Prices_Multiple_Page.csv``
    (relative to the current working directory). The header row is written
    automatically when the file is new or empty.

    A page that cannot be fetched, or that contains no listings, is reported
    with a printed message and skipped; the remaining pages are still
    processed.

    Returns:
        None
    """
    url = 'https://www.flipkart.com/search?q=smartphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&sort=popularity'
    site = 'https://www.flipkart.com'
    csv_path = 'Flipkart_Mobile_Prices_Multiple_Page.csv'
    headers = ['Date', 'Time', 'Product', 'Price']

    # A timeout stops an unresponsive server from hanging the run forever;
    # raise_for_status turns HTTP error pages into a handled failure.
    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        print("Skipping this run, could not fetch the first listing page:", err)
        return

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(page.text, 'html.parser')

    # The pagination anchors carry site-relative links to the listing pages.
    # Reading the href attribute (already HTML-unescaped by BeautifulSoup)
    # keeps working when the link length changes, unlike cutting the tag's
    # string form at fixed character positions.
    links = [
        site + anchor['href']
        for anchor in soup.find_all('a', class_="ge-49M")[0:4]
        if anchor.get('href')
    ]
    if not links:
        print("No pagination links found on the first page; nothing was saved")
        return

    for link in links:
        try:
            page = requests.get(link, timeout=30)
            page.raise_for_status()
        except requests.RequestException as err:
            print("Skipping page, could not fetch", link, ":", err)
            continue

        soup = BeautifulSoup(page.text, 'html.parser')

        # Product names and prices live in separate elements and are paired
        # by their position on the page.
        names = [tag.text.strip() for tag in soup.find_all('div', class_="_4rR01T")]
        prices = [tag.text.strip() for tag in soup.find_all('div', class_="_30jeq3 _1_WHN1")]

        if not names and not prices:
            print("No listings found on", link, "; nothing was saved for this page")
            continue

        # Pad the shorter list so an element without a partner still gets a
        # row (with an empty cell) rather than being dropped.
        row_count = max(len(names), len(prices))
        names += [''] * (row_count - len(names))
        prices += [''] * (row_count - len(prices))

        # One clock reading per page keeps Date and Time consistent, and
        # strftime avoids the leading space that slicing str(datetime) leaves
        # in front of the time.
        now = datetime.now()
        date = now.strftime('%Y-%m-%d')
        timestamp = now.strftime('%H:%M:%S.%f')

        data = [
            {'Date': date, 'Time': timestamp, 'Product': product, 'Price': price}
            for product, price in zip(names, prices)
        ]

        # 'a' appends to the existing file; a position of 0 right after
        # opening means the file is new or empty, so write the header first.
        with open(csv_path, 'a', encoding='UTF8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            if f.tell() == 0:
                writer.writeheader()
            writer.writerows(data)



## Automating the code such that the Flipkart website is scraped every 30 minutes

In [None]:
from datetime import datetime 
import time
  
# Scrape the multi-page listings, report when the run finished, then wait
# half an hour before repeating. The loop never exits on its own.
while True:
    check_price_2()
    finished_at = datetime.now()
    print(finished_at, "run completed")  # date, time and status of the scrape that just ran
    time.sleep(30 * 60)  # pauses the infinite loop for 30 minutes (1800 seconds)

## Checking the saved DataFrame - Flipkart_Mobile_Prices_Multiple_Page.csv

In [5]:
# Raise the row display limit so the saved history is shown in full instead of
# a truncated preview. The option is addressed by its full name: pandas
# resolves option keys by regex match, and an abbreviated or approximate key
# can become ambiguous when new options are added.
pd.set_option('display.max_rows', 12672)

# NOTE(review): absolute, machine-specific path. check_price_2() writes
# 'Flipkart_Mobile_Prices_Multiple_Page.csv' relative to the working
# directory, so confirm both refer to the same file before running this
# elsewhere.
price_data_2 = pd.read_csv(r"C:\Users\chels\ALex beginner\Python\Flipkart_Mobile_Prices_Multiple_Page.csv")
price_data_2

Unnamed: 0,Date,Time,Product,Price
0,26-01-2024,00:06:28.005257,"Apple iPhone 15 (Blue, 128 GB)","₹66,999"
1,26-01-2024,00:06:28.005257,"Motorola g54 5G (Pearl Blue, 128 GB)","₹13,999"
2,26-01-2024,00:06:28.005257,"Motorola G34 5G (Ocean Green, 128 GB)","₹10,999"
3,26-01-2024,00:06:28.005257,"Apple iPhone 14 (Midnight, 128 GB)","₹57,999"
4,26-01-2024,00:06:28.005257,"Apple iPhone 14 (Starlight, 128 GB)","₹57,999"
5,26-01-2024,00:06:28.005257,"Motorola g54 5G (Midnight Blue, 128 GB)","₹13,999"
6,26-01-2024,00:06:28.005257,"vivo T2x 5G (Aurora Gold, 128 GB)","₹14,999"
7,26-01-2024,00:06:28.005257,"Motorola Edge 40 Neo (Soothing Sea, 128 GB)","₹22,999"
8,26-01-2024,00:06:28.005257,"vivo T2x 5G (Marine Blue, 128 GB)","₹12,999"
9,26-01-2024,00:06:28.005257,"vivo T2x 5G (Aurora Gold, 128 GB)","₹12,999"
