In [2]:
import scipy
import pandas as pd
import numpy as np
import math
import pymongo
import random
import csv
import time
import re
import matplotlib.pyplot as plt; import matplotlib.pylab as pylab
#%matplotlib inline
# The old `pd.options.display.mpl_style` option has been dropped from pandas,
# so setting it raises on current versions. Use a matplotlib style sheet for
# a similar ggplot-like look instead.
plt.style.use('ggplot')
# Set after the style sheet so the figure size is not overridden by it.
pylab.rcParams['figure.figsize'] = 12, 6
import urllib
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

import Quandl
from bs4 import BeautifulSoup
from dateutil import parser
from pymongo import MongoClient

## Get A BeautifulSoup Object

In [3]:
class BloombergSearch:
    """Scrape Bloomberg search results and the articles they link to.

    The search term is concatenated verbatim into the query URL, so it must
    already be URL-safe.
    """

    # Base used to resolve relative links found on the search pages.
    BASE_URL = 'http://www.bloomberg.com/'

    def __init__(self, search_term):
        """Store ``search_term`` and build the URL of the first results page."""
        self.search_term = search_term
        self.url_page1 = ('http://www.bloomberg.com/search?query=' + str(self.search_term))

    def get_search_soup(self):
        """Return the parsed first search-results page."""
        return self.get_soup(self.url_page1)

    def get_soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        Any error raised by ``urllib.request.urlopen`` propagates to the caller.
        """
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        # Naming the parser keeps results identical across machines and
        # avoids BeautifulSoup's "no parser was explicitly specified" warning.
        return BeautifulSoup(page, 'html.parser')

    def get_search_page_links(self, num_pages):
        """Return article URLs found in ``<h1>`` links on result pages 1..num_pages.

        Headings without a link, links without an ``href`` and links whose
        ``href`` contains ``'video'`` are skipped. Relative links are resolved
        against ``BASE_URL``; absolute links are returned unchanged.
        """
        article_list = []
        for page_number in range(1, num_pages + 1):
            page_soup = self.get_soup(self.url_page1 + '&page=' + str(page_number))
            for heading in page_soup.find_all('h1'):
                anchor = heading.a
                href = anchor.get('href') if anchor is not None else None
                if not href or 'video' in href:
                    continue
                # urljoin handles both '/path' and 'path' without producing
                # a double slash, and leaves absolute URLs untouched.
                article_list.append(urllib.parse.urljoin(self.BASE_URL, href))
        return article_list

    @staticmethod
    def _body_from_soup(article_soup):
        """Join the text of every ``<p>`` inside the article-body containers.

        Returns ``0`` when no text is found (kept for backward compatibility).
        """
        paragraphs = [
            paragraph.text
            for container in article_soup.find_all('div', class_="article-body__content")
            for paragraph in container.find_all('p')
        ]
        if not paragraphs:
            return 0
        # Plain str concatenation: encoding to bytes and calling str() on the
        # result would embed "b'...'" literals in the text on Python 3.
        final_text = ''.join('\n\n' + paragraph for paragraph in paragraphs)
        if final_text == "":
            return 0
        return final_text

    @staticmethod
    def _date_from_soup(article_soup):
        """Return the ``datetime`` attribute of the published-at ``<time>`` tag, or None."""
        result = article_soup.find('time', class_="published-at")
        if result is None:
            return None
        return result.get('datetime')

    @staticmethod
    def _author_from_soup(article_soup):
        """Return the stripped text of the author link, or None if absent."""
        result = article_soup.find('a', class_="author-link")
        if result is None:
            return None
        return result.text.strip()

    @staticmethod
    def _title_from_soup(article_soup):
        """Return the stripped text of the page ``<title>``, or None if absent."""
        result = article_soup.find('title')
        if result is None:
            return None
        return result.text.strip()

    def get_post_body(self, article_url):
        """Fetch ``article_url`` and return its body text, or ``0`` if none is found."""
        return self._body_from_soup(self.get_soup(article_url))

    def get_post_date(self, article_url):
        """Fetch ``article_url`` and return its publication datetime string, or None."""
        return self._date_from_soup(self.get_soup(article_url))

    def get_post_author(self, article_url):
        """Fetch ``article_url`` and return the author name, or None."""
        return self._author_from_soup(self.get_soup(article_url))

    def get_post_title(self, article_url):
        """Fetch ``article_url`` and return the page title, or None."""
        return self._title_from_soup(self.get_soup(article_url))

    def make_info(self, pages = 1):
        """Build a DataFrame with one row per article linked from the result pages.

        Parameters
        ----------
        pages : int
            Number of search-result pages to walk (starting at page 1).

        Returns
        -------
        pandas.DataFrame
            Columns ``title``, ``author``, ``date``, ``text``. The frame is
            empty (but still has these columns) when no links are found.
        """
        records = []
        for url in self.get_search_page_links(pages):
            # Download each article once and read every field from that parse.
            article_soup = self.get_soup(url)
            records.append({
                'title': self._title_from_soup(article_soup),
                'author': self._author_from_soup(article_soup),
                'date': self._date_from_soup(article_soup),
                'text': self._body_from_soup(article_soup),
            })
        # Build the frame in one go; DataFrame.append no longer exists in
        # pandas 2.x and was quadratic when used inside a loop.
        return pd.DataFrame(records, columns=['title', 'author', 'date', 'text'])

In [4]:
# Create a Bloomberg scraper for the search term 'ibm' (this only builds the search URL).
bloom_obj = BloombergSearch('ibm')

In [6]:
# Walk the first 3 search-result pages and collect title, author, date and text per linked article.
temp = bloom_obj.make_info(3)

In [14]:
class ForbesSearch:
    """Scrape Forbes search results and pull article fields out of page scripts.

    Article fields are located by searching the text of each ``<script>`` tag
    for JSON-like key markers (``"body"``, ``"title"``, ``"timestamp"``,
    ``"author"``). The search term is concatenated verbatim into the query
    URL, so it must already be URL-safe.
    """

    # Base used to resolve relative links found on the search pages.
    BASE_URL = 'http://www.forbes.com/'

    def __init__(self, search_term):
        """Store ``search_term`` and build the URL of the first results page."""
        self.search_term = search_term
        self.url_page1 = ('http://www.forbes.com/search/?q=' + str(self.search_term))

    def get_search_soup(self):
        """Return the parsed first search-results page."""
        return self.get_soup(self.url_page1)

    def get_soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        Any error raised by ``urllib.request.urlopen`` propagates to the caller.
        """
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            page = response.read()
        # Naming the parser keeps results identical across machines and
        # avoids BeautifulSoup's "no parser was explicitly specified" warning.
        return BeautifulSoup(page, 'html.parser')

    def get_search_page_links(self, num_pages):
        """Return article URLs found in ``<h2>`` links on result pages 1..num_pages.

        Headings without a link, links without an ``href`` and links whose
        ``href`` contains ``'video'`` are skipped. Relative links are resolved
        against ``BASE_URL``; absolute links are returned unchanged.
        """
        article_list = []
        for page_number in range(1, num_pages + 1):
            page_soup = self.get_soup(self.url_page1 + '&page=' + str(page_number))
            for heading in page_soup.find_all('h2'):
                anchor = heading.a
                href = anchor.get('href') if anchor is not None else None
                if not href or 'video' in href:
                    continue
                article_list.append(urllib.parse.urljoin(self.BASE_URL, href))
        return article_list

    @staticmethod
    def _slice_between(text, start_marker, end_marker):
        """Return ``text`` from ``start_marker`` up to the next ``end_marker``.

        The start marker is included in the result, the end marker is not.
        Returns None when either marker is missing, so a failed lookup can
        never turn into a ``-1`` slice index or a ``ValueError``.
        """
        start = text.find(start_marker)
        if start == -1:
            return None
        end = text.find(end_marker, start)
        if end == -1:
            return None
        return text[start:end]

    @staticmethod
    def _clean_body(raw_body):
        """Strip the key prefix, selected entities, backslashes, tags and [..] spans."""
        cleaned = raw_body.replace(r'\r\n\r\n', '')  # literal backslash sequences
        cleaned = cleaned.replace('&nbsp;', ' ')
        for entity in ('&rsquo;', '&rdquo;', '&ldquo;'):
            cleaned = cleaned.replace(entity, '')
        cleaned = cleaned.replace('"body":"', '')
        cleaned = cleaned.replace('\r\n\r\n3', '')  # real CR/LF characters followed by '3'
        cleaned = cleaned.replace('\\', '')
        cleaned = re.sub(r'<[^>]+>', '', cleaned)     # HTML tags
        cleaned = re.sub(r'\[[^]]*\]', '', cleaned)   # bracketed spans
        return cleaned

    @classmethod
    def _extract_body(cls, script_text):
        """Return the cleaned article body found in ``script_text``, or None."""
        if '"body":"<p>' in script_text:
            raw_body = cls._slice_between(script_text, '"body"', '</p>","description"')
        elif '"body":"<em>' in script_text:
            raw_body = cls._slice_between(script_text, '"body":"<em>', '.</em>')
        else:
            return None
        if not raw_body:
            return None
        return cls._clean_body(raw_body) or None

    def get_post_body(self, article_url):
        """Fetch ``article_url`` and extract fields from each ``<script>`` tag.

        Returns
        -------
        list
            ``[bodies, titles, dates, authors]`` - four lists of strings, one
            entry per script in which the field was found. Bodies are cleaned
            of markup; titles, dates and authors are raw slices that still
            start with their key marker (``"title":"``, ``"timestamp":``,
            ``"author":"``).
        """
        bodies, titles, dates, authors = [], [], [], []
        article_soup = self.get_soup(article_url)
        for script in article_soup.find_all('script'):
            script_text = script.text

            body = self._extract_body(script_text)
            if body:
                bodies.append(body)

            title = self._slice_between(script_text, '"title":"', '","date"')
            if title:
                titles.append(title)

            # The date slice is only attempted when a date key is present.
            if 'date":' in script_text:
                date = self._slice_between(script_text, '"timestamp":', '","body"')
                if date:
                    dates.append(date)

            author = self._slice_between(script_text, '"author":"', '","title"')
            if author:
                authors.append(author)
        return [bodies, titles, dates, authors]

    def make_info(self, pages = 1):
        """Build a DataFrame with one row per distinct article URL.

        Each row holds the first body, title, author and date extracted from
        that article, with None for any field that was not found. Articles
        from which nothing could be extracted are skipped. Keeping the fields
        together per article guarantees the columns stay aligned and have
        equal length.

        Parameters
        ----------
        pages : int
            Number of search-result pages to walk (starting at page 1).

        Returns
        -------
        pandas.DataFrame
            Columns ``Articles``, ``Titles``, ``Authors``, ``Dates``.
        """
        rows = []
        seen_urls = set()
        for url in self.get_search_page_links(pages):
            # The same article can appear on several result pages.
            if url in seen_urls:
                continue
            seen_urls.add(url)
            bodies, titles, dates, authors = self.get_post_body(url)
            if not (bodies or titles or dates or authors):
                continue
            rows.append({
                'Articles': bodies[0] if bodies else None,
                'Titles': titles[0] if titles else None,
                'Authors': authors[0] if authors else None,
                'Dates': dates[0] if dates else None,
            })
        return pd.DataFrame(rows, columns=['Articles', 'Titles', 'Authors', 'Dates'])

In [None]:
# Create a Forbes scraper for the search term 'ibm' (this only builds the search URL).
forb_obj = ForbesSearch('ibm')
# Walk the first 3 search-result pages; NOTE: this rebinds `temp`, replacing
# the Bloomberg result assigned in an earlier cell.
temp = forb_obj.make_info(3)
# Bare last expression so the notebook displays the resulting DataFrame.
temp