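#### The code in this notebook scrapes the Wiki page for the list of Korean Dramas over the years. IMDB data is then scraped for the dramas to get the movie ID and other data such as genre, cast, etc. The data is then stored in a CSV file, which will be used for analysis through visualization in Power BI.

> **Note:** the code cells below scrape the Nykaa makeup product listing (names, prices, discounts, offers, ratings, votes and image links), not Wikipedia/IMDB, so the description above appears to be stale and should be confirmed.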

#### 'requests' is the HTTP library used for accessing the web pages

In [2]:
import requests
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

In [3]:
# Scrape the Nykaa makeup listing page by page.
# The listing is paginated through the `page_no` query parameter; np.arange(1, 608, 1)
# requests pages 1..607 inclusive.
pages = np.arange(1,608,1)

root = "https://www.nykaa.com"

# Seconds to wait for any single HTTP response. Without a timeout a stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30

# One list per output column. Every product frame must append exactly ONE value
# to EACH list, otherwise the columns drift out of alignment and the DataFrame
# built from them is either rejected by pandas or silently wrong.
m_pdt_names = []
m_org_prices = []
m_offer_prices = []
m_discounts = []
m_avail_offers = []
m_ratings = []
m_votes = []
m_pdt_img = []


def _find_text(node, tag, css_class):
    """Return the text of the first `tag` with class `css_class` under `node`, or None if absent."""
    match = node.find(tag, class_=css_class)
    return None if match is None else match.get_text()


def _digits_only(text):
    """Strip every character that is not a digit (currency symbols, labels, separators)."""
    return re.sub(r'[^0-9]','',text).strip()


def _extract_prices(frame):
    """Return (original_price, offer_price) for a product frame; either may be None.

    The 'css-17x46n5' span holds the original price when the product is discounted.
    When its text is just the label "MRP:", the only price shown lives in the
    'css-111z9ua' span and there is no separate offer price.
    """
    price_text = _find_text(frame,'span','css-17x46n5')
    offer_text = _find_text(frame,'span','css-111z9ua')

    if price_text is None:
        return None, None

    if price_text != "MRP:":
        offer_price = None if offer_text is None else _digits_only(offer_text)
        return _digits_only(price_text), offer_price

    # Label-only span: the single displayed price is treated as the original price.
    if offer_text is None:
        return None, None
    return _digits_only(offer_text), None


def _fetch_rating(frame):
    """Fetch the product page linked from `frame` and return its rating text, or None.

    None is returned when the frame has no product link, the link has no href,
    the request fails or does not answer 200, or the page has no rating element.
    """
    link = frame.find('a',class_='css-qlopj4')
    if link is None:
        return None

    pdt_link = link.get('href')
    if pdt_link is None:
        return None

    try:
        response = requests.get(root+pdt_link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # A single unreachable product page must not abort the whole scrape.
        return None

    if response.status_code != 200:
        return None

    pdt_soup = BeautifulSoup(response.text,'html.parser')
    rating_text = _find_text(pdt_soup,'div','css-m6n3ou')
    if rating_text is None:
        return None
    # The rating element reads like "4.3/5"; keep only the part before the slash.
    return rating_text.split('/')[0]


for page in pages:

    try:
        page_makeup = requests.get("https://www.nykaa.com/makeup/c/12?page_no="+str(page)+"&sort=popularity&eq=desktop", timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # Report and move on so one bad listing page does not lose all earlier work.
        print("Skipping listing page " + str(page) + ": " + str(err))
        continue

    m_soup = BeautifulSoup(page_makeup.content,'html.parser')

    m_frames = m_soup.find_all('div',class_='css-d5z3ro')

    # extracting data for each of the items in the page
    for frame in m_frames:

        # Product name ("" when missing)
        name_text = _find_text(frame,'div','css-xrzmfa')
        m_pdt_names.append("" if name_text is None else name_text)

        # Original and offer price (always one value each, None when missing)
        org_price, offer_price = _extract_prices(frame)
        m_org_prices.append(org_price)
        m_offer_prices.append(offer_price)

        # Discount % (digits and the percent sign are kept)
        discount_text = _find_text(frame,'span','css-cjd9an')
        if discount_text is None:
            m_discounts.append(None)
        else:
            m_discounts.append(re.sub(r'[^0-9%]','',discount_text).strip())

        # Available offers
        m_avail_offers.append(_find_text(frame,'p','css-1kzcg63'))

        # Rating (read from the individual product page)
        m_ratings.append(_fetch_rating(frame))

        # Number of votes
        vote_text = _find_text(frame,'span','css-1qbvrhp')
        m_votes.append(None if vote_text is None else _digits_only(vote_text))

        # Image link ("" when the image or its src attribute is missing)
        image = frame.find('img',class_='css-11gn9r6')
        img_link = None if image is None else image.get('src')
        m_pdt_img.append("" if img_link is None else img_link)

# Sanity check: all columns must describe the same number of products.
assert len({len(column) for column in (m_pdt_names, m_org_prices, m_offer_prices,
                                       m_discounts, m_avail_offers, m_ratings,
                                       m_votes, m_pdt_img)}) <= 1, "scraped columns are misaligned"

In [4]:
# Assemble the scraped column lists into one table, one row per product.
# Pairs are listed in the order the columns should appear in the frame.
makeup_df = pd.DataFrame(dict([
    ('Makeup Product', m_pdt_names),
    ('Original Price', m_org_prices),
    ('Offer Price', m_offer_prices),
    ('Discount', m_discounts),
    ('Available Offers', m_avail_offers),
    ('Ratings', m_ratings),
    ('Votes', m_votes),
    ('Image_url', m_pdt_img),
]))

In [5]:
# Display the assembled table as the cell output; the notebook shows only the
# first and last rows of a frame this large.
makeup_df

Unnamed: 0,Makeup Product,Original Price,Offer Price,Discount,Available Offers,Ratings,Votes,Image_url
0,Nykaa Cosmetics Matte To Last Pore Minimizing ...,849,,,Enjoy Free Gift,,,https://images-static.nykaa.com/media/catalog/...
1,Lakme 9 To 5 Primer + Matte Lipstick,550,330,40%,,4.3,27874,https://images-static.nykaa.com/media/catalog/...
2,Kay Beauty Matte Blush,899,719,20%,Enjoy Free Gift,4.5,6018,https://images-static.nykaa.com/media/catalog/...
3,Elle 18 Nude Liquid Lips - Pack of 4,540,486,10%,Enjoy Free Gift,4.3,28867,https://images-static.nykaa.com/media/catalog/...
4,Faces Canada Comfy Matte Lip Color,399,279,30%,Enjoy Free Gift,4.3,9823,https://images-static.nykaa.com/media/catalog/...
...,...,...,...,...,...,...,...,...
11995,Viseart Matte Eyeshadow Palette,5800,4350,25%,,5,1,
11996,Bronson Professional Eyelashes (M72),175,89,49%,,4.4,877,
11997,Nykaa Gloss it Up! High Shine Lip Gloss - 08 L...,499,250,50%,,4.2,7917,
11998,Nudestix Smokey Nude Glow,6050,,,,5,3,


In [7]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Folder on the mounted Drive; ends with a slash so a file name can be appended directly.
path = '/content/drive/My Drive/Colab Notebooks/Datasets/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Write the scraped table to makeup.csv in the Drive folder defined by `path`
# (the row index is written as the first column, as before).
makeup_df.to_csv(path + 'makeup.csv')