## Scraping Data Using Selenium (w/o any exe file downloads)

In [21]:
!pip3 install selenium pandas beautifulsoup4 numpy --quiet

In [22]:
#Importing libraries that will help us scrape the data off the internet
import time
import warnings
from urllib.parse import quote

# Silence warnings before the third-party imports so import-time warnings are hidden too
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## 1st Task : Scraping Data from [screener.in](https://www.screener.in)

This script automatically gathers important financial information for a group of companies from a website. Here's what it does in simple terms:

1. **List of Companies**:
   - We start with a list of company names, like **TCS**, **Reliance**, and **Titan**, for which we want to collect financial details.

2. **Opening the Website Automatically**:
   - The script uses an automation tool to automatically open a web browser (like Google Chrome) and go to the webpage of each company on [screener.in](https://www.screener.in)
   - It sets up the browser to make sure everything runs smoothly, even if there are issues like slow loading or website restrictions.

3. **Collecting Financial Information**:
   - Once the company's page is open, the script looks through the page to find important financial numbers, like profits, stock prices, or ratios that help investors understand how the company is doing.
   - It picks out the key pieces of information from the page and saves them.

4. **Organizing the Data**:
   - For each company, it creates a table with the financial details (like a mini-report for each company).
   - It then adds all these mini-reports together into one big report that includes all the companies in the list.

5. **Finishing Up**:
   - After collecting all the data, the browser is closed.
   - The script also gives updates while running, letting you know when it has finished gathering data for each company.
   - Once completed, the script also saves the combined data as a CSV file (`Screener_data.csv`) in the current working directory.

In [23]:
# Scrape the 'company-ratios' metrics for each symbol from screener.in,
# combine them into one table (`main`) and save it as Screener_data.csv.

# List of stock symbols to scrape
symbols = ['ULTRACEMCO', 'TITAN', 'TCS', 'SRF', 'SCHAEFFLER', 'RELIANCE', 'KOTAKBANK', 'HDFCBANK',
           'EICHERMOT', 'DRREDDY', 'DIVISLAB', 'TATAELXSI', 'M&M', 'INFY', 'HINDUNILVR', 'CDSL']

# Seconds to wait after navigation so the page can finish loading
PAGE_LOAD_WAIT_SECONDS = 5

# The Chrome options are the same for every symbol, so build them once
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration
chrome_options.add_argument("--no-sandbox")  # Disable sandboxing
chrome_options.add_argument("--disable-dev-shm-usage")  # Disable /dev/shm usage
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")  # Set user-agent

# One DataFrame per symbol; they are combined once after the loop instead of
# re-concatenating the growing table on every iteration
frames = []

# Loop through each symbol to scrape data
for symbol in symbols:
    # A fresh browser is started per symbol; the finally clause guarantees it is
    # closed again, even if loading the page fails (previously only the browser
    # of the last symbol was ever closed).
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Open the web page for the current symbol
        driver.get(f"https://www.screener.in/company/{symbol}/")

        # Wait for the page to fully load
        time.sleep(PAGE_LOAD_WAIT_SECONDS)

        # Get the page source after the page has loaded
        page_source = driver.page_source
    finally:
        driver.quit()

    # Parse the page source using BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Metric names and their values, collected in matching order
    values = []
    numbers = []

    # Walk every list item inside every 'company-ratios' div
    for result in soup.find_all('div', class_='company-ratios'):
        for item in result.find_all('li', class_='flex flex-space-between'):
            name_tag = item.find('span', class_='name')
            number_tag = item.find('span', class_='number')

            # Skip items that lack either span instead of failing on None.text
            if name_tag is None or number_tag is None:
                continue

            values.append(name_tag.text.strip())
            numbers.append(number_tag.text.strip())

    # Do not report success when the page yielded no metrics
    if not values:
        print(f"No data found for {symbol}, skipping...")
        continue

    # The scalar symbol is broadcast to every metric row
    frames.append(pd.DataFrame({'Symbol': symbol, 'Metric': values, 'Numbers': numbers}))

    # Print a message indicating successful data scraping for the current symbol
    print(f"Scraped data for {symbol}...")

# Combine all per-symbol tables; fall back to an empty table with the expected
# columns when nothing could be scraped
if frames:
    main = pd.concat(frames, axis=0, ignore_index=True)
else:
    main = pd.DataFrame(columns=['Symbol', 'Metric', 'Numbers'])

# Removing the thousands separators (commas) from the numbers
main['Numbers'] = main['Numbers'].str.replace(",", "", regex=False)

#Saving Screener data as a CSV File
main.to_csv('Screener_data.csv', index=False)

print('Data scraped and saved as a CSV file in the current working directory')

Scraped data for ULTRACEMCO...
Scraped data for TITAN...
Scraped data for TCS...
Scraped data for SRF...
Scraped data for SCHAEFFLER...
Scraped data for RELIANCE...
Scraped data for KOTAKBANK...
Scraped data for HDFCBANK...
Scraped data for EICHERMOT...
Scraped data for DRREDDY...
Scraped data for DIVISLAB...
Scraped data for TATAELXSI...
Scraped data for M&M...
Scraped data for INFY...
Scraped data for HINDUNILVR...
Scraped data for CDSL...
Data scraped and saved as a CSV file in the current working directory


In [24]:
# Preview the combined screener.in table (`main`), which holds the data for all symbols
main.head()

Unnamed: 0,Symbol,Metric,Numbers
0,ULTRACEMCO,Market Cap,340077.0
1,ULTRACEMCO,Current Price,11780.0
2,ULTRACEMCO,High / Low,12078.0
3,ULTRACEMCO,Stock P/E,49.2
4,ULTRACEMCO,Book Value,2047.0


## Scraping Data from [nseindia.com](https://www.nseindia.com)

This script automates the process of scraping stock data for various companies from the National Stock Exchange (NSE) website using Python and Selenium.

1. **Setup**: 
   - It first initializes an empty structure (DataFrame) to store the collected data.
   - A list of stock symbols is defined, representing the companies we want to gather data for (e.g., ULTRACEMCO, TITAN, TCS).

2. **Automated Web Browsing**: 
   - For each stock symbol, the script uses a browser automation tool (Selenium) to open the NSE website and retrieve the page that contains data for that specific stock.
   - Chrome browser settings are configured to avoid issues related to loading the webpage (e.g., disabling GPU, handling sandboxing issues).

3. **Data Scraping**: 
   - The script extracts tables of stock data from the webpage by parsing the HTML structure using BeautifulSoup.
   - It identifies key metrics (e.g., market cap, volatility) and their corresponding values from the tables on the webpage.

4. **Data Storage**:
   - The extracted metrics and values are stored in a DataFrame format, a table-like structure that organizes data for analysis.
   - Data for each stock symbol is appended to the main DataFrame.

5. **Completion**: 
   - After scraping data for each symbol, the browser is closed, and a message is printed to indicate successful data extraction.
   - Once completed, the script also saves all of the data as a CSV file (`nse.csv`) in the current working directory.

In [25]:
# Scrape the 'card-table' tables of each symbol's NSE quote page, reshape them
# into (Symbol, Metrics, Values) rows, and save the combined table as nse.csv.

# Symbols are written as they should appear in the output; they are
# percent-encoded only when the URL is built
symbols = ['ULTRACEMCO', 'TITAN', 'TCS', 'SRF', 'SCHAEFFLER', 'RELIANCEP1', 'KOTAKBANK', 'HDFCBANK', 'EICHERMOT',
           'DRREDDY', 'DIVISLAB', 'TATAELXSI', 'M&M', 'INFY', 'HINDUNILVR', 'CDSL']

# Seconds to wait after navigation so the page can finish loading
# NOTE(review): the saved output of this notebook shows '-' for every value,
# which suggests the quote data had not loaded when the page source was read -
# consider a longer or explicit wait; confirm against the live site.
PAGE_LOAD_WAIT_SECONDS = 5

# The Chrome options are the same for every symbol, so build them once
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# One DataFrame per symbol; they are combined once after the loop
frames = []

for symbol in symbols:
    # A fresh browser per symbol; the finally clause closes it even when
    # loading the page raises
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Percent-encode the symbol for the query string (e.g. 'M&M' -> 'M%26M')
        driver.get(f"https://www.nseindia.com/get-quotes/equity?symbol={quote(symbol, safe='')}")
        time.sleep(PAGE_LOAD_WAIT_SECONDS)  # Wait for the page to fully load

        # Get the page source
        page_source = driver.page_source
    finally:
        driver.quit()

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # One list of cleaned cell texts per <tbody> of every 'card-table' table
    scraped = []
    for table in soup.find_all('table', class_='card-table'):
        for body in table.find_all('tbody'):
            cells = [cell.text.strip().replace(",", "") for cell in body.find_all('td')]
            scraped.append(cells)

    # Do not report success when no tables were found
    if not scraped:
        print(f"No data found for {symbol}, skipping...")
        continue

    # Keep the second and third scraped <tbody> entries and transpose, so each
    # row holds the cells found at one position in those two entries
    cells_df = pd.DataFrame(scraped).iloc[1:3].T

    # Even-positioned rows are taken as metric names, odd-positioned rows as
    # their values; both are flattened row by row
    metrics = cells_df.iloc[0::2].to_numpy().ravel()
    values = cells_df.iloc[1::2].to_numpy().ravel()

    # Pad the shorter array with None so both have the same length
    # (padded rows are removed by dropna below)
    if len(metrics) > len(values):
        values = np.append(values, [None] * (len(metrics) - len(values)))
    elif len(values) > len(metrics):
        metrics = np.append(metrics, [None] * (len(values) - len(metrics)))

    # The scalar symbol is broadcast to every metric row
    frames.append(pd.DataFrame({'Symbol': symbol, 'Metrics': metrics, 'Values': values}))

    # Print a message indicating successful data scraping for the current symbol
    print(f"Scraped data for {symbol}...")

# Combine all per-symbol tables; fall back to an empty table with the expected
# columns when nothing could be scraped
if frames:
    nse = pd.concat(frames, axis=0, ignore_index=True)
else:
    nse = pd.DataFrame(columns=['Symbol', 'Metrics', 'Values'])

#Dropping any rows that contain null values
nse = nse.dropna().reset_index(drop=True)
#Saving the data as a CSV file
nse.to_csv('nse.csv', index=False)
print('NSE Data scraped and saved as a CSV file in the current working directory')


Scraped data for ULTRACEMCO...
Scraped data for TITAN...
Scraped data for TCS...
Scraped data for SRF...
Scraped data for SCHAEFFLER...
Scraped data for RELIANCEP1...
Scraped data for KOTAKBANK...
Scraped data for HDFCBANK...
Scraped data for EICHERMOT...
Scraped data for DRREDDY...
Scraped data for DIVISLAB...
Scraped data for TATAELXSI...
Scraped data for M%26M...
Scraped data for INFY...
Scraped data for HINDUNILVR...
Scraped data for CDSL...
NSE Data scraped and saved as a CSV file in the current working directory


In [26]:
# Displaying the top 5 rows of the combined NSE table (`nse`)
nse.head()

Unnamed: 0,Symbol,Metrics,Values
0,ULTRACEMCO,52 Week High,-
1,ULTRACEMCO,Security VaR,-
2,ULTRACEMCO,52 Week Low,-
3,ULTRACEMCO,Index VaR,-
4,ULTRACEMCO,Upper Band,-
