# Step 1 : Scraping Data 

#### This is the first step of the Laptop Price Predictor project.<br> In this step, laptop listings are scraped from https://www.flipkart.com/search?q=laptop using BeautifulSoup.<br><br>
This is divided into 5 parts:
- 1: Importing Libraries 
- 2: Scraping first page
- 3: Scraping all the pages
- 4: Creating dataframe from scraped pages
- 5: Saving data in csv
<br><br>
#### Details of columns saved in csv:<br>
Brand: Title of Laptop <br>
Features: Dictionary of up to 15 features of the laptop ('NA' where a feature is absent) <br>
Price: Price of Laptop <br>

### 1.  Importing Libraries

In [1]:
import requests #to make website get request
from bs4 import BeautifulSoup as bs # to scrape through website contents
import re # for regex
import time
import datetime 
import pandas as pd
import numpy as np
# do not truncate long cell values (titles, feature dictionaries) when displaying frames
pd.set_option('display.max_colwidth', None)

### 2.  Scraping one page

In [2]:
# initialising the search URL: the query term is appended to the "q" parameter
search_endpoint = "https://www.flipkart.com/search?q="
search_term = "laptop"
flipkart_url = search_endpoint + search_term

In [3]:
# making get request for the first results page; requests applies no timeout
# by default, so one is set to keep the cell from hanging on a silent server
html_text = requests.get(flipkart_url, timeout=30).text

In [4]:
# parse the downloaded page with Python's built-in HTML parser
soup = bs(markup=html_text, features='html.parser')

In [5]:
# text of every product-title <div> found on the first page
title = [tag.get_text() for tag in soup.find_all('div', class_="_4rR01T")]
title

['ASUS VivoBook 15 (2022) Core i3 10th Gen - (8 GB/512 GB SSD/Windows 11 Home) X515JA-EJ362WS | X515JA-E...',
 'HP Celeron Dual Core - (8 GB/256 GB SSD/Windows 11 Home) 14s-dq3037tu Thin and Light Laptop',
 'ASUS Vivobook Ultra 14 (2022) Core i5 11th Gen - (16 GB/512 GB SSD/Windows 11 Home) K413EA-EB522WS Thi...',
 'HP 14s Intel Core i3 11th Gen - (8 GB/512 GB SSD/Windows 11 Home) 14s - dy2508TU Thin and Light Laptop',
 'Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/512 GB SSD/Windows 11 Home) 82H801L7IN | 82H802FJIN | 82H802...',
 'Lenovo IdeaPad 3 Core i5 12th Gen - (8 GB/512 GB SSD/Windows 11 Home) 15IAU7 Thin and Light Laptop',
 'Lenovo IdeaPad Core i3 11th Gen - (8 GB/512 GB SSD/Windows 11 Home) 81X800J3IN | 81X800LGIN Thin and L...',
 'Lenovo Celeron Dual Core - (4 GB/256 GB HDD/256 GB SSD/DOS) 82C3A00DIH Laptop',
 'DELL Inspiron Core i3 11th Gen - (8 GB/1 TB HDD/256 GB SSD/Windows 11 Home) D560841WIN9S Thin and Ligh...',
 'HP Ryzen 5 Hexa Core 5500U - (16 GB/512 GB SSD/Windows 11 Ho

In [6]:
# For every feature list (<ul>) on the page, store the text of its <li>
# children at positions 1-10 keyed by position; a position with no matching
# <li> is recorded as 'NA'.
features = []

for feature_list in soup.find_all('ul', class_="_1xgFaf"):
    entry = {}
    for position in range(1, 11):
        item = feature_list.select_one(f'[class="_1xgFaf"] li:nth-child({position})')
        entry[position] = item.get_text() if item else 'NA'
    features.append(entry)

In [7]:
# inspect the feature dictionaries collected from the first page
features

[{1: 'Intel Core i3 Processor (10th Gen)',
  2: '8 GB DDR4 RAM',
  3: '64 bit Windows 11 Operating System',
  4: '512 GB SSD',
  5: '39.62 cm (15.6 inch) Display',
  6: 'Office Home and Student 2021',
  7: '1 Year Onsite Warranty',
  8: 'NA',
  9: 'NA',
  10: 'NA'},
 {1: 'Intel Celeron Dual Core Processor',
  2: '8 GB DDR4 RAM',
  3: '64 bit Windows 11 Operating System',
  4: '256 GB SSD',
  5: '35.56 cm (14 inch) Display',
  6: '1 Year Onsite Warranty',
  7: 'NA',
  8: 'NA',
  9: 'NA',
  10: 'NA'},
 {1: 'Intel Core i5 Processor (11th Gen)',
  2: '16 GB DDR4 RAM',
  3: '64 bit Windows 11 Operating System',
  4: '512 GB SSD',
  5: '35.56 cm (14 inch) Display',
  6: '1 Year Onsite Warranty',
  7: 'NA',
  8: 'NA',
  9: 'NA',
  10: 'NA'},
 {1: 'Intel Core i3 Processor (11th Gen)',
  2: '8 GB DDR4 RAM',
  3: '64 bit Windows 11 Operating System',
  4: '512 GB SSD',
  5: '35.56 cm (14 inch) Display',
  6: 'Microsoft Office Home & Student 2021',
  7: '1 Year Onsite Warranty',
  8: 'NA',
  9: '

In [8]:
# displayed price text of every listing found on the first page
price = [tag.get_text() for tag in soup.find_all('div', class_="_30jeq3 _1_WHN1")]
price

['₹33,990',
 '₹25,990',
 '₹53,990',
 '₹41,990',
 '₹38,900',
 '₹59,790',
 '₹37,290',
 '₹19,490',
 '₹41,990',
 '₹50,499',
 '₹51,990',
 '₹57,990',
 '₹35,999',
 '₹46,490',
 '₹62,990',
 '₹38,490',
 '₹51,990',
 '₹27,990',
 '₹59,990',
 '₹50,990',
 '₹85,990',
 '₹1,13,990',
 '₹59,990',
 '₹45,990']

### 3.  Scraping all pages

In [9]:
# finding total number of pages: the last whitespace-separated token of the
# pagination <span> text is converted to an int
pagination = soup.find('div', class_='_2MImiq')
pages = int(pagination.span.text.split()[-1])
pages

57

In [10]:
def page_urls(page, base_url=None):
    """Build the list of result-page URLs to be scraped.

    Args:
        page (int): highest page number to include; pages are numbered from 1.
        base_url (str, optional): search URL that "&page=<n>" is appended to.
            When omitted, the notebook-level ``flipkart_url`` is used.

    Returns:
        list[str]: one URL per page from 1 to ``page`` inclusive; an empty
        list when ``page`` is less than 1.
    """
    if base_url is None:
        # fall back to the notebook global so existing one-argument calls keep working
        base_url = flipkart_url
    return [f"{base_url}&page={number}" for number in range(1, page + 1)]

In [11]:
page_urls(pages)# call to function page_urls with the detected page count (57 in the recorded run) as argument

['https://www.flipkart.com/search?q=laptop&page=1',
 'https://www.flipkart.com/search?q=laptop&page=2',
 'https://www.flipkart.com/search?q=laptop&page=3',
 'https://www.flipkart.com/search?q=laptop&page=4',
 'https://www.flipkart.com/search?q=laptop&page=5',
 'https://www.flipkart.com/search?q=laptop&page=6',
 'https://www.flipkart.com/search?q=laptop&page=7',
 'https://www.flipkart.com/search?q=laptop&page=8',
 'https://www.flipkart.com/search?q=laptop&page=9',
 'https://www.flipkart.com/search?q=laptop&page=10',
 'https://www.flipkart.com/search?q=laptop&page=11',
 'https://www.flipkart.com/search?q=laptop&page=12',
 'https://www.flipkart.com/search?q=laptop&page=13',
 'https://www.flipkart.com/search?q=laptop&page=14',
 'https://www.flipkart.com/search?q=laptop&page=15',
 'https://www.flipkart.com/search?q=laptop&page=16',
 'https://www.flipkart.com/search?q=laptop&page=17',
 'https://www.flipkart.com/search?q=laptop&page=18',
 'https://www.flipkart.com/search?q=laptop&page=19',
 '

In [12]:
# empty accumulators, one per scraped field, filled while looping over all pages
title = []
features = []
price = []
discount = []

In [13]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every listing to the lists.
title, features, price, discount = [], [], [], []

# NOTE: titles, features and prices are collected independently of each other,
# so row i of each list describes the same laptop only if every listing on a
# page exposes all three elements.
for page_url in page_urls(pages):
    # making get request for particular page url; the timeout stops a single
    # unresponsive request from stalling the whole scrape
    html_text_page = requests.get(page_url, timeout=30).text
    soup_page = bs(html_text_page, 'html.parser')  # scraping through page url contents

    for title_tag in soup_page.find_all('div', class_="_4rR01T"):
        title.append(title_tag.get_text())

    for feature_list in soup_page.find_all('ul', class_="_1xgFaf"):
        entry = {}
        for position in range(1, 16):
            # look the <li> up once and reuse the result instead of querying twice
            item = feature_list.select_one(f'[class="_1xgFaf"] li:nth-child({position})')
            entry[position] = item.get_text() if item else 'NA'
        features.append(entry)

    for price_tag in soup_page.find_all('div', class_="_30jeq3 _1_WHN1"):
        price.append(price_tag.get_text())

    for discount_tag in soup_page.find_all('div', class_="_3Ay6Sb"):
        discount.append(discount_tag.get_text())

In [14]:
# number of prices collected across all pages
len(price)

984

In [15]:
# number of titles collected; should equal the number of prices
len(title)

984

In [16]:
# number of feature dictionaries collected; should equal the number of titles and prices
len(features)

984

### 4.  Creating dataframe from scraped pages

In [17]:
# assemble one row per listing from the three parallel lists
df = pd.DataFrame(dict(Brand=title, Features=features, Price=price))

In [18]:
# preview the first rows of the assembled frame
df.head()

Unnamed: 0,Brand,Features,Price
0,HP Celeron Dual Core - (8 GB/256 GB SSD/Windows 11 Home) 14s-dq3037tu Thin and Light Laptop,"{1: 'Intel Celeron Dual Core Processor', 2: '8 GB DDR4 RAM', 3: '64 bit Windows 11 Operating System', 4: '256 GB SSD', 5: '35.56 cm (14 inch) Display', 6: '1 Year Onsite Warranty', 7: 'NA', 8: 'NA', 9: 'NA', 10: 'NA', 11: 'NA', 12: 'NA', 13: 'NA', 14: 'NA', 15: 'NA'}","₹25,990"
1,ASUS VivoBook 15 (2022) Core i3 10th Gen - (8 GB/512 GB SSD/Windows 11 Home) X515JA-EJ362WS | X515JA-E...,"{1: 'Intel Core i3 Processor (10th Gen)', 2: '8 GB DDR4 RAM', 3: '64 bit Windows 11 Operating System', 4: '512 GB SSD', 5: '39.62 cm (15.6 inch) Display', 6: 'Office Home and Student 2021', 7: '1 Year Onsite Warranty', 8: 'NA', 9: 'NA', 10: 'NA', 11: 'NA', 12: 'NA', 13: 'NA', 14: 'NA', 15: 'NA'}","₹33,990"
2,Lenovo IdeaPad 3 Core i3 11th Gen - (8 GB/512 GB SSD/Windows 11 Home) 82H801L7IN | 82H802FJIN | 82H802...,"{1: 'Intel Core i3 Processor (11th Gen)', 2: '8 GB DDR4 RAM', 3: '64 bit Windows 11 Operating System', 4: '512 GB SSD', 5: '39.62 cm (15.6 inch) Display', 6: 'Office Home and Student 2021', 7: '2 Year Onsite�Warranty', 8: 'NA', 9: 'NA', 10: 'NA', 11: 'NA', 12: 'NA', 13: 'NA', 14: 'NA', 15: 'NA'}","₹38,900"
3,HP 14s Intel Core i3 11th Gen - (8 GB/512 GB SSD/Windows 11 Home) 14s - dy2508TU Thin and Light Laptop,"{1: 'Intel Core i3 Processor (11th Gen)', 2: '8 GB DDR4 RAM', 3: '64 bit Windows 11 Operating System', 4: '512 GB SSD', 5: '35.56 cm (14 inch) Display', 6: 'Microsoft Office Home & Student 2021', 7: '1 Year Onsite Warranty', 8: 'NA', 9: 'NA', 10: 'NA', 11: 'NA', 12: 'NA', 13: 'NA', 14: 'NA', 15: 'NA'}","₹40,490"
4,Lenovo IdeaPad 3 Core i5 12th Gen - (8 GB/512 GB SSD/Windows 11 Home) 15IAU7 Thin and Light Laptop,"{1: 'Intel Core i5 Processor (12th Gen)', 2: '8 GB DDR4 RAM', 3: '64 bit Windows 11 Operating System', 4: '512 GB SSD', 5: '39.62 cm (15.6 Inch) Display', 6: '2 Year Onsite Warranty', 7: 'NA', 8: 'NA', 9: 'NA', 10: 'NA', 11: 'NA', 12: 'NA', 13: 'NA', 14: 'NA', 15: 'NA'}","₹59,790"


In [19]:
# show the working directory that the relative '../data' path below is resolved against;
# written as an explicit line magic so the cell does not depend on automagic being
# enabled or on no variable named `pwd` existing
%pwd

'D:\\projects\\laptop_price_predictor\\notebooks'

In [20]:
# record the local date and time at which this cell ran, as provenance for the scraped data
timestamp = datetime.datetime.now() 
print(f"Scrapping done on {timestamp}")

Scrapping done on 2023-01-06 12:10:03.460973


### 5.  Saving data in csv

In [21]:
# write the raw scrape to disk; index=False leaves the row index out of the file
df.to_csv('../data/Raw_scraped_data.csv',index=False)

In [24]:
# persist `features` with IPython's %store magic so another session can restore it via `%store -r features`
%store features

Stored 'features' (list)
