#### The code in this notebook scrapes the Nykaa hair-care listing pages to get the list of products. For each product, data such as name, original price, offer price, discount, available offers, rating, number of votes and image URL is collected. The data is then stored in a CSV file which will be used for analysis through visualization in Power BI.

#### 'requests' is the HTTP library used for accessing the web pages

In [2]:
import requests
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

#### 'Beautiful Soup' is the Python library used for extracting data out of html and xml files.

In [3]:
# Scrape the Nykaa hair-care listing pages. Every product frame contributes
# exactly one entry to each h_* list (None or "" when a field is missing), so
# the lists always have the same length and line up row by row when they are
# combined into a DataFrame.

# The listing page range is hard-coded; adjust LAST_PAGE when the catalogue size changes.
LAST_PAGE = 546
# Seconds to wait for a single HTTP response before giving up on it.
REQUEST_TIMEOUT = 30
pages = np.arange(1, LAST_PAGE + 1, 1)

root = "https://www.nykaa.com"
h_pdt_names = []
h_org_prices = []
h_offer_prices = []
h_discounts = []
h_avail_offers = []
h_ratings = []
h_votes = []
h_pdt_img = []


def _clean(tag, pattern=r'[^0-9]'):
    """Return the text of `tag` with every character matching `pattern` removed."""
    return re.sub(pattern, '', tag.get_text()).strip()


def _fetch_rating(product_href):
    """Fetch a product page and return the rating text before the '/'.

    Returns None when the link is missing, the request fails, the response is
    not HTTP 200, or the page has no rating element.
    """
    if not product_href:
        return None
    try:
        response = requests.get(root + product_href, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # A single unreachable product page must not abort the whole scrape.
        return None
    if response.status_code != 200:
        return None
    pdt_soup = BeautifulSoup(response.text, 'html.parser')
    rating = pdt_soup.find('div', class_='css-m6n3ou')
    if rating is None:
        return None
    return rating.get_text().split('/')[0]


for page in pages:

    listing_url = "https://www.nykaa.com/hair-care/c/24?page_no="+str(page)+"&sort=popularity&eq=desktop"
    try:
        page_haircare = requests.get(listing_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # Keep what has been collected so far instead of losing the whole run.
        print("Skipping page", page, "-", err)
        continue
    if page_haircare.status_code != 200:
        print("Skipping page", page, "- HTTP status", page_haircare.status_code)
        continue

    h_soup = BeautifulSoup(page_haircare.content, 'html.parser')
    h_frames = h_soup.find_all('div', class_='css-d5z3ro')

    # extracting data for each of the items in the page
    for frame in h_frames:

        # Product names
        name = frame.find('div', class_='css-xrzmfa')
        h_pdt_names.append(name.get_text() if name is not None else "")

        # Original price / offer price. Both lists receive exactly one value
        # per product, whichever combination of spans is present.
        price = frame.find('span', class_='css-17x46n5')
        off_price = frame.find('span', class_='css-111z9ua')
        org_value = None
        offer_value = None
        if price is not None:
            if price.get_text() != "MRP:":
                org_value = _clean(price)
                if off_price is not None:
                    offer_value = _clean(off_price)
            elif off_price is not None:
                # The first span holds only the "MRP:" label, so the amount in
                # the second span is recorded as the original price and the
                # offer price stays empty.
                org_value = _clean(off_price)
        h_org_prices.append(org_value)
        h_offer_prices.append(offer_value)

        # Discount %
        discount = frame.find('span', class_='css-cjd9an')
        h_discounts.append(_clean(discount, r'[^0-9%]') if discount is not None else None)

        # Available offers
        offer = frame.find('p', class_='css-1kzcg63')
        h_avail_offers.append(offer.get_text() if offer is not None else None)

        # Rating (read from the product's own page, reached through its link)
        link = frame.find('a', class_='css-qlopj4')
        h_ratings.append(_fetch_rating(link.get('href')) if link is not None else None)

        # No of votes
        vote = frame.find('span', class_='css-1qbvrhp')
        h_votes.append(_clean(vote) if vote is not None else None)

        # image links
        image = frame.find('img', class_='css-11gn9r6')
        img_link = image.get('src') if image is not None else None
        h_pdt_img.append(img_link if img_link is not None else "")

# Sanity check: one entry per product in every column list.
assert len({len(column) for column in (h_pdt_names, h_org_prices, h_offer_prices,
                                       h_discounts, h_avail_offers, h_ratings,
                                       h_votes, h_pdt_img)}) == 1, "column lists are misaligned"

In [4]:
# Assemble the scraped column lists into one table; the dict keys become the
# column headers, in this order.
haircare_columns = {
    'Haircare Product': h_pdt_names,
    'Original Price': h_org_prices,
    'Offer Price': h_offer_prices,
    'Discount': h_discounts,
    'Available Offers': h_avail_offers,
    'Ratings': h_ratings,
    'Votes': h_votes,
    'Image_url': h_pdt_img,
}
haircare_df = pd.DataFrame(haircare_columns)

In [5]:
# Show the assembled table as the cell output (long frames are rendered truncated).
haircare_df

Unnamed: 0,Haircare Product,Original Price,Offer Price,Discount,Available Offers,Ratings,Votes,Image_url
0,Minimalist Maleic Bond Repair Complex 5% Serum...,499,474,5%,,4.4,753,https://images-static.nykaa.com/media/catalog/...
1,TRESemme Pro Pure Damage Recovery Shampoo with...,500,450,10%,,4.2,570,https://images-static.nykaa.com/media/catalog/...
2,L'Oreal Professionnel X-Tenso Shampoo + Masque...,2100,1890,10%,,4.4,42429,https://images-static.nykaa.com/media/catalog/...
3,TIGI Bed Head Resurrection Super Repair Shampo...,1400,1050,25%,,,,https://images-static.nykaa.com/media/catalog/...
4,"Matrix Biolage Scalppure Shampoo,conditioner &...",1745,1571,10%,,4.4,320,https://images-static.nykaa.com/media/catalog/...
...,...,...,...,...,...,...,...,...
10915,Soho Boho Studio Pink Raindrop Zipper Scrunchie,700,420,40%,,,,
10916,Joker & Witch Delicate Golden Head Chain,599,240,60%,,4,1,
10917,Nature Sure Biotin Gummies - Pack Of 2,1598,879,45%,,5,1,
10918,Ferosh Ava Square Magic Golden Pearl Hairpin,999,499,50%,,,,


In [7]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so the dataset folder is reachable.
mount_point = '/content/drive'
drive.mount(mount_point)

# Folder on the mounted drive where datasets are kept.
path = '/content/drive/My Drive/Colab Notebooks/Datasets/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Write the scraped table to the datasets folder on the mounted drive.
# `path` is the folder defined in the drive-mount cell; reusing it keeps the
# location in one place instead of repeating the literal here.
haircare_df.to_csv(path + 'haircare.csv')