#### The code in this notebook scrapes the Nykaa skincare category listing (around 999 result pages). For every product it collects the name, original price, offer price, discount, available offers, number of votes and image URL from the listing page, and then visits the product's own page to read its rating. The data is then stored in a CSV file which will be used for analysis through visualization in Power BI.

#### 'requests' is the HTTP library used for accessing the web pages

In [1]:
import requests

#### 'Beautiful Soup' is the Python library used for extracting data out of HTML and XML files.

In [2]:
from bs4 import BeautifulSoup

In [3]:
import numpy as np
import pandas as pd
import re

In [4]:
# Scrape the Nykaa skincare listing. The listing is paginated (around 999 pages
# with around 21 products per page), so the page number is set dynamically.
pages = np.arange(1,1000,1)

root = "https://www.nykaa.com"
request_timeout = 30  # seconds; without a timeout one stalled connection blocks the whole crawl

# Exactly ONE entry is appended to EVERY list below for each product card.
# Missing values are recorded as None (or "" for name / image) instead of being
# skipped, so the lists stay index-aligned and can be combined into a DataFrame.
pdt_names = []
org_prices = []
offer_prices = []
discounts = []
avail_offers = []
ratings = []
votes = []
pdt_img = []


def _text_or(tag, default):
    """Return the text of a BeautifulSoup tag, or `default` when the tag is None."""
    return tag.get_text() if tag is not None else default


def _card_prices(frame):
    """Return (original_price, offer_price) for one product card.

    Both values are digit-only strings, or None when they cannot be read.
    """
    price = frame.find('span',class_='css-17x46n5')
    off_price = frame.find('span',class_='css-111z9ua')

    if price is None:
        return None, None

    # The offer span is optional; guard it once so neither branch below can
    # call get_text() on None.
    off_cleaned = None
    if off_price is not None:
        off_cleaned = re.sub(r'[^0-9]','',off_price.get_text())

    if price.get_text() != "MRP:":
        return re.sub(r'[^0-9]','',price.get_text()), off_cleaned

    # The first span holds only the "MRP:" label: the number read from the
    # other span is recorded as the original price and no offer price is kept.
    return off_cleaned, None


def _product_rating(frame):
    """Return the rating text from the product's own page, or None.

    None is returned when the card has no product link, the request fails or
    does not answer 200, or the page has no rating element.
    """
    link = frame.find('a',class_='css-qlopj4')
    pdt_link = link.get('href') if link is not None else None
    if not pdt_link:
        return None

    try:
        response = requests.get(root+pdt_link, timeout=request_timeout)
    except requests.exceptions.RequestException:
        # Best effort: a failed product page only costs this one rating.
        return None

    if response.status_code != 200:
        return None

    pdt_soup = BeautifulSoup(response.text,'html.parser')
    rating = pdt_soup.find('div',class_='css-m6n3ou')
    if rating is None:
        return None
    # Keep the part before the first '/' of the rating text.
    return rating.get_text().split('/')[0]


for page in pages:

    try:
        page_skincare = requests.get(
            "https://www.nykaa.com/skin/c/8377?page_no="+str(page)+"&sort=popularity&eq=desktop",
            timeout=request_timeout)
    except requests.exceptions.RequestException as err:
        # Report and move on so one bad page does not discard everything scraped so far.
        print("Skipping listing page", page, "-", err)
        continue

    soup = BeautifulSoup(page_skincare.content,'html.parser')

    frames = soup.find_all('div',class_='css-d5z3ro')

    # extracting data for each of the items in the page
    for frame in frames:

        # Product name
        pdt_names.append(_text_or(frame.find('div',class_='css-xrzmfa'), ""))

        # Original price / offer price
        org_price, offer_price = _card_prices(frame)
        org_prices.append(org_price)
        offer_prices.append(offer_price)

        # Discount % (digits and the percent sign only)
        discount = frame.find('span',class_='css-cjd9an')
        discounts.append(None if discount is None
                         else re.sub(r'[^0-9%]','',discount.get_text()))

        # Available offers
        avail_offers.append(_text_or(frame.find('p',class_='css-1kzcg63'), None))

        # Rating (requires one extra request per product)
        ratings.append(_product_rating(frame))

        # No of votes
        vote = frame.find('span',class_='css-1qbvrhp')
        votes.append(None if vote is None
                     else re.sub(r'[^0-9]','',vote.get_text()))

        # Image link
        image = frame.find('img',class_='css-11gn9r6')
        img_link = image.get('src') if image is not None else None
        pdt_img.append(img_link if img_link is not None else "")

# Sanity check: every column must have one entry per product card.
assert len({len(col) for col in (pdt_names, org_prices, offer_prices, discounts,
                                 avail_offers, ratings, votes, pdt_img)}) == 1

In [5]:
# Combine the scraped per-product lists into a single table, pairing each
# column label with its list (column order follows the label order).
skincare_df = pd.DataFrame(
    dict(
        zip(
            ['Skincare Product', 'Original Price', 'Offer Price', 'Discount',
             'Available Offers', 'Ratings', 'Votes', 'Image_url'],
            [pdt_names, org_prices, offer_prices, discounts,
             avail_offers, ratings, votes, pdt_img],
        )
    )
)

In [6]:
# Show the assembled table; as the last expression of the cell it is rendered as the cell output.
skincare_df

Unnamed: 0,Skincare Product,Original Price,Offer Price,Discount,Available Offers,Ratings,Votes,Image_url
0,Nykaa Skin Secrets Gold Sheet Mask + Nykaa Ski...,1099,879,20%,Enjoy Free Gift,4.4,25,https://images-static.nykaa.com/media/catalog/...
1,L'Oreal Paris Glycolic Bright Serum With Glyco...,749,561,25%,,4.4,5378,https://images-static.nykaa.com/media/catalog/...
2,Olay Total Effects 7 In One Anti-Ageing Day Cr...,798,638,20%,,4.4,6157,https://images-static.nykaa.com/media/catalog/...
3,Nykaa SKINRX Ultra Matte Dry Touch Sunscreen S...,780,702,10%,Enjoy Free Gift,,,https://images-static.nykaa.com/media/catalog/...
4,Cetaphil Optimal Hydration Daily Cream,1250,1000,20%,,,,https://images-static.nykaa.com/media/catalog/...
...,...,...,...,...,...,...,...,...
11995,Prolixr Salicylic & Tea Tree Acne Therapy Mask,799,543,32%,,4.4,25,
11996,L'Occitane Shea Hands & Body Lavender Liquid S...,2150,,,Enjoy Free Gift,1,1,
11997,Shahnaz Husain Platinum Ultimate Cellular Skin...,2655,,,,,,
11998,House Of Beauty Brown Spot Corrector,699,524,25%,,4.8,4,


In [9]:
# Mount Google Drive at /content/drive so the notebook can read and write files on it
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')
# Folder path under the mounted drive, with a trailing slash so a file name can be appended directly.
path = '/content/drive/My Drive/Colab Notebooks/Datasets/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Save the scraped table as CSV on the mounted Drive. Reuses `path` from the
# Drive-mount cell instead of repeating the folder literal, so the location is
# defined in one place. The DataFrame index is written as the first column
# (pandas default).
skincare_df.to_csv(path + 'skincare.csv')