In [None]:
# Author: Damodar Rajbhandari (2019)

# Installing dependencies
#!pip install pandas bs4 requests
from IPython.display import clear_output, display

In [None]:
# Necessary Tools

import re
import csv
import requests

from bs4 import BeautifulSoup 
import pandas as pd

def get_soup(url: str) -> 'Beautiful soup':
  """
  Download ``url`` and parse the response body with BeautifulSoup.

  Params:
  -------
  url: address of the page to fetch

  Return
  -------
  BeautifulSoup object built from the raw response content using the
  'xml' parser
  """
  # Adjacent string literals are joined verbatim, so every fragment needs
  # its own trailing space to produce a well-formed User-Agent header.
  headers = {'User-Agent':
                 ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36')}
  # Without a timeout a stalled connection would block the notebook forever.
  r = requests.get(url, headers=headers, timeout=30)
  soup = BeautifulSoup(r.content, 'xml')
  # Replace the previous progress line with the URL reported by the
  # response, so the request that was just made can be checked by eye.
  clear_output(wait=True)
  print(r.url)
  return soup

def get_text(x: 'soup list or soup str') -> 'list or str':
  """
  Extract whitespace-stripped text from one soup node or a list of them.

  A list input produces a list with one string per element; any other
  input is treated as a single node and produces a single string.
  """
  if not isinstance(x, list):
    return x.get_text(strip=True)

  texts = []
  for node in x:
    # strip=True trims surrounding whitespace from the extracted text
    texts.append(node.get_text(strip=True))
  return texts
    
def chunks(l: list, n: int) -> 'generator':
  """
  Lazily split l into consecutive slices of length n.

  The last slice is shorter when len(l) is not a multiple of n.
  Because this is a generator, nothing is computed until it is iterated
  and the result can be consumed only once; wrap it in list() to keep
  the pieces around.
  """
  starts = range(0, len(l), n)
  yield from (l[start:start + n] for start in starts)

def get_date_range(start: str='2010-04-15',
                   end: str='2019-06-06') -> list:
  """
  List every calendar day from start to end.

  Each entry is a tuple (date as 'YYYY-MM-DD', day of week), where the
  day of week uses pandas' numbering: Monday=0 ... Sunday=6.
  For example '2010-04-15' maps to 3, i.e. Thursday.
  """
  days = []
  # freq='D' steps through the range one calendar day at a time
  for stamp in pd.date_range(start, end, freq='D'):
    # str(stamp) looks like '2010-04-15 00:00:00'; the first ten
    # characters are the date part
    days.append((str(stamp)[:10], stamp.dayofweek))
  return days

def write_csv(l: list, filename: str,
              mode: str='w') -> 'write file':
  """
  Write the rows of l to filename in CSV format.

  l is an iterable of rows, each row an iterable of cell values.
  mode is handed to open(): 'w' (the default) overwrites the file,
  'a' appends to it.
  """
  # newline='' lets the csv module control line endings itself
  with open(filename, mode, newline='') as handle:
    csv.writer(handle).writerows(l)
            
def clean_names(words: list) -> list:
  """
  Turn free-form labels into lowercase identifier-like names.

  Spaces become underscores, the text is lowercased, and every
  character other than a-z, A-Z or underscore is dropped.
  The list is modified in place and the same list object is returned.
  """
  for idx in range(len(words)):
    underscored = re.sub(' ', '_', words[idx])
    words[idx] = re.sub('[^a-zA-Z_]', '', underscored.lower())
  return words

In [None]:
# Getting Data from NEPSE

import re
import time
from collections import defaultdict
from functools import lru_cache
import pandas as pd


def company_listed(url='http://www.nepalstock.com/company?_limit=500',
                   table='my-table table') -> list:
  """
      Get all listed companies.

      Params:
      -------
      url: url of page
      table: class id of table

      Return
      -------
      list of information in table as list of list; the first inner list
      holds the cleaned column names (ending with 'link'), every other
      inner list is one table row followed by the href of the first
      anchor in that row (None when the row has no anchor)

      Raises
      -------
      ValueError: when the table is not on the page or has no usable rows
  """

  nepse = get_soup(url)
  listing = nepse.find('table', {'class': table})
  if listing is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError(
        'No table with class {!r} found at {}'.format(table, url))

  # get all rows, dropping the first and the last one
  rows = listing.find_all('tr')[1:-1]
  body_list = []
  for row in rows:
    cells = row.find_all('td')
    # Not every row is guaranteed to contain an anchor; keep such rows
    # with a None link rather than crashing on None.get(...).
    anchor = row.find('a')
    link = anchor.get('href') if anchor is not None else None
    body_list.append(get_text(cells) + [link])

  if not body_list:
    raise ValueError(
        'Table with class {!r} at {} has no rows'.format(table, url))

  # The first collected row carries the column names. Only its text cells
  # are cleaned; the trailing link slot is simply labelled 'link'.
  body_list[0] = clean_names(body_list[0][:-1]) + ['link']
  return body_list


def company_detail(company_id = '397', table='my-table table'):
  """
  Get the additional detail table of one company.

  Params:
  -------
  company_id: company display id
  table: class id of table

  Return
  -------
  list of information in table as list of list, with a final
  ['url_id', company_id] entry appended
  """
  page = get_soup(
      'http://www.nepalstock.com/company/display/{}'.format(company_id))
  detail_table = page.find('table', {'class':table})

  # the first two rows of the table are not wanted
  wanted_rows = detail_table.find_all('tr')[2:]
  body_list = [get_text(tr.find_all('td')) for tr in wanted_rows]

  # drop the leading cell of the first kept row, then the whole second row
  del body_list[0][0]
  del body_list[1]

  body_list.append(['url_id', company_id])
  return body_list


# Get the details of all companies
def company_detail_all(company_id_list, table='my-table table'):
  """
  Get the details of several companies, grouped by field name.

  Params:
  -------
  company_id_list: iterable of company display ids
  table: class id of table, forwarded to company_detail

  Return
  -------
  dict mapping each field name to a list of values, one entry per
  company in input order; None marks a field a company did not provide
  """

  detail = defaultdict(list)
  for count, company_id in enumerate(company_id_list, start=1):
    # Forward `table` so the caller's choice is actually honoured.
    for key, value in company_detail(company_id, table=table):
      column = detail[key]
      # A field first seen on a later company has no entries for the
      # earlier ones yet; pad so positions stay aligned with companies.
      column.extend([None] * (count - 1 - len(column)))
      column.append(value)
    # Fields this company did not provide get an explicit placeholder,
    # keeping every column the same length.
    for column in detail.values():
      column.extend([None] * (count - len(column)))
  return detail


@lru_cache(maxsize=512)
def stock_today(date,
                table='table table-condensed table-hover',
                headers = True):
  """
      Get the price table published for one trading day.

      Results are memoised by lru_cache, so repeated calls with the same
      arguments hand back the very same list object; callers should copy
      it before modifying it.

      Params:
      -------
      date: 'str' -- 'YYYY-MM-DD'
      table: class id of table
      headers: if you want to return header of table

      Return
      -------
      list of information in table as list of list (every row prefixed
      with the date), or None when the page shows no data for that date
  """

  query = 'startDate={}&stock-symbol=&_limit=500'.format(date)
  page = get_soup('http://www.nepalstock.com/todaysprice?' + query)

  # 'Page' text is only present if the page has data
  if not page.find_all(text=re.compile('Page.*')):
    print("No data on stock exchange, was that day public holiday!!")
    return None

  price_table = page.find('table', {'class':table})
  table_rows = price_table.find_all('tr')

  # the second tr holds the column titles; 'date' is added in front
  title = ['date'] + clean_names(get_text(table_rows[1].find_all('td')))

  # tr elements are nested, so all data cells are read from the third tr
  # and regrouped into rows of 10 cells, each prefixed with the date
  cells = get_text(table_rows[2].find_all('td'))
  body = [[date] + chunk for chunk in chunks(cells, 10)]

  body_list = [title] + body
  # the last entry is the total, so it is always ignored;
  # the title is skipped as well when headers is falsy
  first = 0 if headers else 1
  return body_list[first:-1]


def stockprice_range(start, end):
  """
      Get the stock price of each company for each day in a given range.

      Params
      --------
      start: str -- 'YYYY-MM-DD'
      end: str -- 'YYYY-MM-DD'

      Return
      -------
      list of information as list of list, first list as header;
      an empty list when no day in the range had data
  """

  closed_days = (4, 5)  # Friday and Saturday (Monday=0): market is closed

  # Always accumulate into a list owned by this call. stock_today is
  # wrapped in lru_cache, so binding its result directly and then using
  # += would grow the cached list and corrupt later calls.
  stock = []
  for date, day in get_date_range(start, end):
    if day in closed_days:
      continue
    # Throttle only the days that are actually looked up.
    time.sleep(1)
    if stock:
      rows = stock_today(date, headers=False)
    else:
      # Nothing collected yet: ask for the header row as well.
      rows = stock_today(date)
    if rows:
      stock.extend(rows)
  return stock


In [None]:
company = company_listed()

# Converting to DataFrame for analysis and wrangling
company = pd.DataFrame(company[1:], columns=company[0])
# The last path segment of each company link is its display id
company['url_id'] = company['link'].str.split('/').str[-1]
clear_output(wait=True)
print(company.shape)

# NOTE(review): only the companies at positions 1 and 2 are looked up here,
# which looks like a trial slice -- widen it to fetch details for all of them.
detail = company_detail_all(company['url_id'][1:3], table='my-table table')
detail = pd.DataFrame(detail)
clear_output(wait=True)
print(detail)

today = time.strftime('%Y-%m-%d', time.localtime())

## join company with detail
company = pd.merge(company, detail, how='left', on=['url_id'])
company.to_csv('company_full_data.csv')

# years = [ '2010-04-15', '2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01', '2019-01-01', '2019-06-06']
years = ['2019-06-06', today]
# Walk over consecutive (start, end) pairs of the boundary dates.
for range_start, range_end in zip(years, years[1:]):
    stock_all = stockprice_range(range_start, range_end)
    if not stock_all:
        # An empty result has no header row to build the DataFrame from.
        print("No stock data found from {} to {}".format(range_start, range_end))
        continue
    stock_all = pd.DataFrame(stock_all[1:], columns=stock_all[0])
    # Rows are appended without a header, so re-running adds them again.
    stock_all.to_csv('stock_range.csv', mode='a', header=False)

print("Finally, all the NEPSE data downloaded from {} to {}".format(years[0], today))

http://www.nepalstock.com/todaysprice?startDate=2019-06-06&stock-symbol=&_limit=500
Finally, all the NEPSE data downloaded from 15th of April, 2019 to 6th of June, 2019
