In [1]:
import os
# NOTE(review): machine-specific absolute path. The relative file names used later in
# this notebook ('companylist.csv', the generated '*NewsForNLP_*.xlsx' files) resolve
# against this directory, so the notebook only runs as-is where this folder exists.
os.chdir(r'C:\Users\Derrick\Documents\Scripts\Python\StockRelated')

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
import datetime as dt

def scrape_news_text(news_url, timeout=30):
    """Download one article page and extract its title, publish time and body text.

    Parameters
    ----------
    news_url : str
        URL of the article page to fetch.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` (default 30), so a
        stalled connection raises instead of blocking the whole scrape.

    Returns
    -------
    tuple or None
        ``(title, published, text)`` where ``title`` is the text of the first
        ``<title>`` tag, ``published`` is the first non-missing ``content``
        attribute found on a ``<span>`` tag, and ``text`` is the text of every
        ``<p>`` tag joined with newlines.
        ``None`` (after printing a short notice) when the page has no ``<title>``
        tag or no ``<span>`` carrying a ``content`` attribute.

    Raises
    ------
    Whatever ``requests.get`` raises on network failure or timeout; these are no
    longer hidden by a blanket ``except``.
    """
    news_html = requests.get(news_url, timeout=timeout).content
    news_soup = BeautifulSoup(news_html, 'lxml')

    titles = [tag.text for tag in news_soup.find_all('title')]
    published = [
        span.get('content')
        for span in news_soup.find_all('span')
        if span.get('content') is not None
    ]

    # Explicit guard instead of try/except around the return: only the
    # "nothing found" case is expected, every other error should surface.
    if not titles or not published:
        print('try on return empty')
        return None

    news_text = '\n'.join(par.text for par in news_soup.find_all('p'))
    return titles[0], published[0], news_text

def get_news_urls(links_site, timeout=30):
    """Collect the article links found on one listing page.

    Parameters
    ----------
    links_site : str
        URL of the page whose ``<a>`` tags are scanned.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` (default 30), so a
        stalled connection raises instead of hanging the scrape.

    Returns
    -------
    list of str or None
        Every ``href`` on the page that contains ``'/article/'``, in page order
        (duplicates kept). ``None`` when the response status is not OK; callers
        must handle that case.
    """
    resp = requests.get(links_site, timeout=timeout)
    if not resp.ok:
        return None

    soup = BeautifulSoup(resp.content, 'lxml')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    # Anchors without an href yield None and are skipped.
    return [href for href in hrefs if href is not None and '/article/' in href]


def scrape_all_articles(ticker, upper_page_limit=1):
    """Scrape every news article listed for ``ticker`` on nasdaq.com.

    The headline listing is walked page by page (``?page=2``, ``?page=3``, ...)
    up to ``upper_page_limit``; each distinct article URL is then downloaded with
    ``scrape_news_text``.

    Parameters
    ----------
    ticker : str
        Symbol inserted verbatim into the listing URL.
    upper_page_limit : int, optional
        Highest listing page number to request (default 1: landing page only).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Title'``, ``'PublishedDT'``, ``'Content'``; one row per
        article that could be extracted, in first-seen URL order. Empty (but
        with those columns) when the landing page is unavailable or lists no
        articles.
    """
    columns = ['Title', 'PublishedDT', 'Content']
    landing_site = 'http://www.nasdaq.com/symbol/' + ticker + '/news-headlines'

    first_page = get_news_urls(landing_site)
    if not first_page:
        # None (request failed) or [] (no article links): nothing to scrape.
        return pd.DataFrame([], columns=columns)

    all_news_urls = list(first_page)
    previous_page = first_page
    page = 2
    while page <= upper_page_limit:
        current_page = get_news_urls(landing_site + '?page=' + str(page))
        if not current_page:
            # A failed or empty page ends pagination instead of crashing on [-1].
            break
        if current_page[-1] == previous_page[-1]:
            # Same last link as the previous page: treated as "ran past the end".
            break
        all_news_urls.extend(current_page)
        previous_page = current_page
        page += 1

    # De-duplicate while keeping first-seen order so runs are reproducible.
    all_news_urls = list(dict.fromkeys(all_news_urls))

    # Download each article; failures are reported and skipped (best effort).
    all_articles = []
    for news_url in all_news_urls:
        try:
            article = scrape_news_text(news_url)
        except Exception as exc:
            print("This url can't be extracted: \n", news_url, '\n', repr(exc))
            continue
        # scrape_news_text returns None when title/date are missing.
        if article is not None:
            all_articles.append(article)

    return pd.DataFrame(all_articles, columns=columns)

def SelectParagraphs(ContentText, ticker):
    """Return the lines of ``ContentText`` that mention ``ticker``.

    The text is split on newline characters; every line containing ``ticker``
    as a (case-sensitive) substring is kept, and the kept lines are joined with
    a single space. An empty string is returned when no line matches.
    """
    matching_lines = []
    for line in ContentText.split('\n'):
        if ticker in line:
            matching_lines.append(line)
    return " ".join(matching_lines)


def nsdaqNewsScripting(Ticker):
    """Scrape nasdaq.com news for ``Ticker``, tag it, and export it to Excel.

    Steps: scrape up to 200 listing pages via ``scrape_all_articles`` (using the
    lower-cased ticker), parse and sort by publish time, strip site boilerplate
    from title and content, flag articles mentioning the ticker, extract the
    paragraphs that mention it, and write the result to
    ``<Ticker>NewsForNLP_<YYYYMMDD>.xlsx`` in the current working directory.

    Parameters
    ----------
    Ticker : str
        Symbol to scrape. It is matched literally and case-sensitively against
        titles and content.

    Returns
    -------
    pandas.DataFrame
        The scraped articles sorted by ``'PublishedDT'`` with the added columns
        ``'TickerInTitle'`` and ``'TickerInContent'`` (0/1) and
        ``'KeyParagraphs'``.
    """
    print('nsdaqNewsScripting')
    all_articles = scrape_all_articles(Ticker.lower(), upper_page_limit=200)

    all_articles['PublishedDT'] = pd.to_datetime(all_articles['PublishedDT'])
    all_articles = all_articles.sort_values(by='PublishedDT')

    # regex=False: these are literal snippets, not patterns ('.' must not act
    # as a wildcard, and the result must not depend on the pandas default).
    all_articles['Content'] = all_articles['Content'].str.replace(
        'Join the Nasdaq Community today and get free, instant access to portfolios, stock ratings, real-time alerts, and more!',
        '',
        regex=False,
    )
    all_articles['Title'] = all_articles['Title'].str.replace('- Nasdaq.com', '', regex=False)

    # Build the 0/1 flags directly from the boolean mask. The former
    # df[col][mask] = 1 chained assignment is not guaranteed to write through.
    all_articles['TickerInTitle'] = (
        all_articles['Title'].str.contains(Ticker, regex=False, na=False).astype('int64')
    )
    all_articles['TickerInContent'] = (
        all_articles['Content'].str.contains(Ticker, regex=False, na=False).astype('int64')
    )

    # Series.apply (rather than DataFrame.apply(axis=1)) also works when no
    # articles were scraped.
    all_articles['KeyParagraphs'] = all_articles['Content'].apply(
        lambda content: SelectParagraphs(content, Ticker)
    )

    fileName = Ticker + 'NewsForNLP_' + dt.datetime.now().strftime("%Y%m%d") + '.xlsx'

    export_columns = ['PublishedDT', 'TickerInTitle', 'TickerInContent',
                      'Title', 'Content', 'KeyParagraphs']
    all_articles[export_columns].to_excel(fileName, index=False)

    return all_articles

In [4]:
# Read the company listing and keep only its 'Symbol' column, so that
# TickerList is a pandas Series of ticker symbols.
TickerList = pd.read_csv('companylist.csv')['Symbol']

In [5]:
# Silence pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

# Scrape a small slice of the ticker list. One failing ticker must not stop the
# batch, but the reason is reported so failures can be diagnosed.
for ticker in TickerList[396:400]:
    print('Now processing ', ticker)
    try:
        all_articles = nsdaqNewsScripting(ticker)
    except Exception as exc:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still
        # aborts the loop.
        print('\t', ticker, ' cannot be extracted:', repr(exc))

Now processing  NXPI
nsdaqNewsScripting
	 NXPI  cannot be extracted
Now processing  OIIM
nsdaqNewsScripting
	 OIIM  cannot be extracted
Now processing  OCLR
nsdaqNewsScripting
	 OCLR  cannot be extracted
Now processing  OKTA
nsdaqNewsScripting
	 OKTA  cannot be extracted


In [76]:
# Preview the frame returned by the last successful nsdaqNewsScripting call.
# NOTE(review): `all_articles` is only bound when a ticker in the loop cell
# succeeds; on a fresh kernel where every ticker fails this raises NameError.
all_articles.head()

Unnamed: 0,Title,PublishedDT,Content,TickerInTitle,TickerInContent,KeyParagraphs
10,How Data Is Making The U.S. Stock Market Open ...,2018-07-27 07:31:12,"\n\nFor most Americans, financial security, or...",0,0,
7,Friday's ETF with Unusual Volume: URTH,2018-12-07 07:01:31,\nThe iShares MSCI World ETF ( URTH ) is seei...,0,0,
2,AMD Stock Up Almost 100% and Still Has Fight I...,2018-12-07 08:15:33,"\n\nInvestorPlace - Stock Market News, Stock A...",1,1,This has been an epic year for the stock marke...
4,Why Advanced Micro Devices Stock Surged 17% in...,2018-12-10 06:14:00,\n \nShares of Advanced Micro Devices (NA...,0,1,Shares of Advanced Micro Devices (NASDAQ:...
8,Support Will Hold for Micron Stock After Earni...,2018-12-10 08:27:31,"\n\nInvestorPlace - Stock Market News, Stock A...",0,1,"That said, Micron stock does have a potential ..."


In [8]:
TickerList[228]

'JOB'

In [5]:
# NOTE(review): this rebinds TickerList to the full DataFrame read from
# companylist.csv, whereas an earlier cell binds the same name to the 'Symbol'
# Series. Cells that index or iterate TickerList behave differently depending
# on which assignment ran last.
TickerList = pd.read_csv('companylist.csv')

In [6]:
TickerList[:50]

Unnamed: 0,Symbol,Name,LastSale,MarketCap,ADR TSO,IPOyear,Sector,Industry,Summary Quote,Unnamed: 9
0,VNET,"21Vianet Group, Inc.",9.61,596364900.0,62056704.0,2011.0,Technology,"Computer Software: Programming, Data Processing",https://www.nasdaq.com/symbol/vnet,
1,TWOU,"2U, Inc.",57.84,3350244000.0,,2014.0,Technology,Computer Software: Prepackaged Software,https://www.nasdaq.com/symbol/twou,
2,DDD,3D Systems Corporation,12.48,1424973000.0,,,Technology,Computer Software: Prepackaged Software,https://www.nasdaq.com/symbol/ddd,
3,JOBS,"51job, Inc.",64.0,2331915000.0,36436171.0,2004.0,Technology,Diversified Commercial Services,https://www.nasdaq.com/symbol/jobs,
4,WUBA,58.com Inc.,57.24,8479463000.0,,2013.0,Technology,"Computer Software: Programming, Data Processing",https://www.nasdaq.com/symbol/wuba,
5,EGHT,8x8 Inc,18.97,1809191000.0,,,Technology,EDP Services,https://www.nasdaq.com/symbol/eght,
6,ATEN,"A10 Networks, Inc.",6.29,464192000.0,,2014.0,Technology,Computer Communications Equipment,https://www.nasdaq.com/symbol/aten,
7,AAN,"Aaron&#39;s, Inc.",47.53,3449647000.0,,,Technology,Diversified Commercial Services,https://www.nasdaq.com/symbol/aan,
8,ACIA,"Acacia Communications, Inc.",41.39,1673946000.0,,2016.0,Technology,Semiconductors,https://www.nasdaq.com/symbol/acia,
9,ACIW,"ACI Worldwide, Inc.",28.99,3362250000.0,,,Technology,Computer Software: Prepackaged Software,https://www.nasdaq.com/symbol/aciw,


In [9]:
TickerList['Symbol'][380:420]

380    NATI
381    NPTN
382    NETE
383    NTAP
384    NTCT
385    NTWK
386    NEWR
387    NXGN
388    NICE
389    LASR
390     NOK
391    NUAN
392    NTNX
393    NVEC
394     NVT
395    NVDA
396    NXPI
397    OIIM
398    OCLR
399    OKTA
400    OMCL
401     OMC
402      ON
403    OTIV
404     OSS
405    OSPN
406    OOMA
407    OTEX
408    ORCL
409    OSIS
410    PFIN
411    PAGS
412    PANW
413    TEUM
414    PCYG
415     PKE
416    PAYC
417    PCTY
418    PCTI
419    PDFS
Name: Symbol, dtype: object