In [1]:
import requests
import re
from bs4 import BeautifulSoup

In [2]:
import pandas as pd
import copy

In [3]:
import datetime

In [4]:
import time

In [28]:
def get_news_for_stock(stock_code, duration_type, duration, pages_end, timeout=30):
    """Scrape the Moneycontrol news listing pages for one stock.

    Listing pages ``1 .. pages_end - 1`` are requested (``pages_end`` is
    exclusive), with a one-second pause before every request. Each page is
    parsed with the ``html5lib`` parser and every news entry that exposes a
    headline link, a source span and a date paragraph is collected.

    Args:
        stock_code: Value substituted for ``sc_id`` in the listing URL.
        duration_type: Value substituted for ``durationType`` in the URL.
        duration: Value substituted for ``Year`` in the URL.
        pages_end: Exclusive upper bound of the page numbers to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Pass ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        A list of dicts with the keys ``"headline"``, ``"date"``,
        ``"source"`` and ``"url"``. The text values are taken from the page
        as-is; no cleaning is applied here.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    all_data = []
    url = "https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={}&durationType={}&Year={}&pageno={}"
    # The class patterns never change, so compile them once instead of once
    # per page / per entry.
    entry_class = re.compile("MT15 PT10 PB10")
    source_class = re.compile("a_2_10bl")
    date_class = re.compile("PT3 a_10dgry")
    for page_number in range(1, pages_end):
        time.sleep(1)
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(
            url.format(stock_code, duration_type, duration, page_number),
            timeout=timeout,
        )
        soup = BeautifulSoup(response.text, "html5lib")
        for entry in soup.find_all('div', attrs={'class': entry_class}):
            link = entry.find('a', attrs={'class': "g_14bl"})
            source_span = entry.find('span', attrs={'class': source_class})
            date_p = entry.find('p', attrs={'class': date_class})
            # Skip incomplete entries explicitly rather than swallowing every
            # AttributeError, which would also hide unrelated bugs.
            if link is None or source_span is None or date_p is None:
                continue
            href = link.get('href')
            if href is None:
                # An anchor without href cannot yield a usable URL.
                continue
            all_data.append({
                "headline": link.text,
                "date": date_p.text,
                "source": source_span.text,
                "url": "https://www.moneycontrol.com" + href,
            })
    return all_data

In [29]:
def clean_data(all_data, year, ticker):
    """Return a cleaned deep copy of scraped news items.

    For every item the newlines and tabs are removed from ``"headline"``,
    ``"date"`` is truncated right after the first occurrence of ``year`` and
    parsed into a ``datetime.date``, and a ``"ticker"`` key is added. The
    input list and its dicts are left untouched.

    Args:
        all_data: List of dicts holding at least ``"headline"`` and
            ``"date"`` string values.
        year: Year expected inside each date string; converted with ``str``
            so both ``"2014"`` and ``2014`` are accepted.
        ticker: Value stored under ``"ticker"`` in every returned item.

    Returns:
        A new list of dicts with cleaned ``"headline"``, a ``datetime.date``
        under ``"date"`` and the added ``"ticker"``.

    Raises:
        ValueError: If ``year`` does not occur in a date string, or the
            truncated text does not match ``"%H.%M %p | %d %b %Y"``.
    """
    year_text = str(year)
    cleaned_data = copy.deepcopy(all_data)
    for data_item in cleaned_data:
        # Cleaning the headline
        headline = data_item["headline"]
        data_item["headline"] = headline.replace("\n", "").replace("\t", "")
        # Cleaning the date
        raw_date = data_item["date"]
        year_start = raw_date.find(year_text)
        if year_start == -1:
            # find() returning -1 used to silently turn the slice below into
            # raw_date[:3]; fail with an explicit message instead.
            raise ValueError(
                "year {!r} not found in date string {!r}".format(year_text, raw_date)
            )
        # Keep everything up to and including the year; whatever follows it
        # is not part of the timestamp format.
        stamp = raw_date[:year_start + len(year_text)].strip()
        # %p is only consumed here: with %H it does not alter the hour, and
        # only the calendar date is kept anyway.
        data_item["date"] = datetime.datetime.strptime(
            stamp, "%H.%M %p | %d %b %Y"
        ).date()
        data_item["ticker"] = ticker
    return cleaned_data

In [None]:
# Scrape configuration: one entry per (stock, year) combination.
# Each row below is (stock_code, ticker, year, pages_end); the rows are
# expanded into dicts with exactly those four keys, in that order.
news_items_to_be_scraped = [
    {
        "stock_code": stock_code,
        "ticker": ticker,
        "year": year,
        "pages_end": pages_end,
    }
    for stock_code, ticker, year, pages_end in [
        ("HDF01", "HDFCBANK", "2018", 8),
        ("HDF01", "HDFCBANK", "2019", 4),
        ("ICI02", "ICICIBANK", "2018", 11),
        ("ICI02", "ICICIBANK", "2019", 5),
        ("UTI10", "AXISBANK", "2018", 7),
        ("UTI10", "AXISBANK", "2019", 4),
        ("HDF01", "HDFCBANK", "2017", 11),
        ("ICI02", "ICICIBANK", "2017", 11),
        ("UTI10", "AXISBANK", "2017", 11),
        ("HDF01", "HDFCBANK", "2016", 11),
        ("ICI02", "ICICIBANK", "2016", 11),
        ("UTI10", "AXISBANK", "2016", 11),
        ("HDF01", "HDFCBANK", "2015", 11),
        ("ICI02", "ICICIBANK", "2015", 11),
        ("UTI10", "AXISBANK", "2015", 11),
        ("HDF01", "HDFCBANK", "2014", 11),
        ("ICI02", "ICICIBANK", "2014", 11),
        ("UTI10", "AXISBANK", "2014", 11),
    ]
]

In [41]:
def run_pipeline(config):
    """Scrape and clean the news for every entry of ``config``.

    Each entry must provide ``"stock_code"``, ``"ticker"``, ``"year"`` and
    ``"pages_end"``. A progress line is printed per entry. If scraping or
    cleaning an entry raises, the exception is printed and that entry
    contributes nothing to the result; processing continues with the next.

    Returns:
        One flat list holding the cleaned items of all successful entries.
    """
    collected = []
    for entry in config:
        ticker = entry["ticker"]
        year = entry["year"]
        print("Working on {} for year {}".format(ticker, year))
        try:
            # "Y" is passed as the duration type for every entry.
            raw_items = get_news_for_stock(
                entry["stock_code"], "Y", year, entry["pages_end"]
            )
            collected.extend(clean_data(raw_items, year, ticker))
        except Exception as error:
            print(error)
    return collected

In [42]:
# Run scraping + cleaning for every configured (stock, year) entry.
all_cleaned_news = run_pipeline(config=news_items_to_be_scraped)

Working on HDFCBANK for year 2014
Working on ICICIBANK for year 2014
Working on AXISBANK for year 2014


In [43]:
# One row per cleaned news item; the dict keys become the columns.
df = pd.DataFrame(data=all_cleaned_news)

In [44]:
# Persist the complete frame as a '~'-separated file with neither a header
# row nor the index column.
# NOTE(review): the file name says 2014 while the config lists several
# years, and a header-less file relies on the frame's column order — confirm.
df.to_csv(
    "MoneyControlScrapingFull2014.csv",
    sep='~',
    header=False,
    index=False,
)

In [45]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,date,headline,source,ticker,url
0,2014-12-26,"Prefer HPCL, BEML, Oil India: Rajesh Agarwal",CNBC-TV18,HDFCBANK,https://www.moneycontrol.com/news/stocks-views...
1,2014-12-24,"Short ICICI Bank, Axis Bank: Siddharth Bhamre",CNBC-TV18,HDFCBANK,https://www.moneycontrol.com/news/stocks-views...
2,2014-12-24,"Private sector banks may outperform, says Ajay...",CNBC-TV18,HDFCBANK,https://www.moneycontrol.com/news/stocks-views...
3,2014-12-24,HDFC Bank top pick: Jignesh Shial,CNBC-TV18,HDFCBANK,https://www.moneycontrol.com/news/stocks-views...
4,2014-12-23,"Buy HDFC Bank, says Manas Jaiswal",CNBC-TV18,HDFCBANK,https://www.moneycontrol.com/news/stocks-views...


In [46]:
# The source and url columns are not part of the final data set.
df = df.drop(columns=["source", "url"])

In [47]:
# Fix the column order used by the final export.
df = df.loc[:, ["headline", "date", "ticker"]]

In [48]:
# Write the final frame as a '~'-separated file with neither a header row
# nor the index column.
# NOTE(review): presumably FINAL_DATA/ must already exist; the file name
# says 2014 while the config lists several years — confirm.
df.to_csv(
    "FINAL_DATA/bank_news_money_control_2014_~.csv",
    sep='~',
    header=False,
    index=False,
)