In [1]:
####-----------------------------------------------------------------------------####
#   title: "IEOR 135 Group Project -- Data Gathering (Built for RiskEx)"            #
#   author: Elias Castro Hernandez                                                  #
#   date: "March 2018"                                                              # 
#   purpose: scrape over website, read, and write data for ML modeling              #
####-----------------------------------------------------------------------------####

In [20]:
# Import Packages and Libraries
import requests # HTTP parser
import html5lib

# Web parcing, scraping, etc.
import bs4 as bs # BeautifulSoup4 
import urllib3
import re

# data frames and math
import pandas as pd
import numpy as np

# Import output related packages 
import pprint

In [21]:
# stretch Jupyter coding blocks to fit screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>")) 

# make it run on py2 and py3
from __future__ import division, print_function

## Read over the following to understand how to use NewsAPI
__Get Started__
General overview of how to readin using version two of newsapi.

https://newsapi.org/docs/get-started

__Working with python__
Granular details on paraments of newsapi funtions/methods.

https://newsapi.org/docs/client-libraries/python

__Working with JSON Objects__
Overview of JSON objects as they pertain to javascript.

https://www.w3schools.com/js/js_json_intro.asp

In [22]:
### NEWSAPI RELATED ###
#Elias' key: 
ekey = '2bc85776a0c14af6b9937366ad683e2f'

# Install API 
#pip install newsapi-python

# Import Client
from newsapi import NewsApiClient

# Initialize Client (create object)
news = NewsApiClient(api_key = ekey)
print(type(news))

<class 'newsapi.newsapi_client.NewsApiClient'>


# Accessing NewsAPI Articles


## Accessing all data in newsapi

####    Search every article given a query and starting date

In [23]:
# Read in everything
url = ('https://newsapi.org/v2/everything?'
       'q=bitcoin&'
       'from=2018-01-09&'       #yyy-mm-dd
       'sortBy=publishedAt&'
       'apiKey=2bc85776a0c14af6b9937366ad683e2f')

everything = requests.get(url)

#pprint.pprint(everything.json())

# NOTE:
#### The data type of of the object being read is "class: 'requests.models.Response' "
##### Thus in order to access the content, you have to use the '.json()' method available for the object class



##### example of how to access JSON values

In [24]:
print(type(everything))
print(type(everything.json()))

# Accessing values
example = everything.json()['articles'][0]['description']
print(type(example), "\n", example, "\n\n\n")

<class 'requests.models.Response'>
<class 'dict'>
<class 'str'> 
 文|全天候科技 张少华 “基于区块链上的私有中央银行会被不同的国家当做中央银行来使用，以解决一些原本没办法解决的问题”，吴忌寒说，比特大陆非常有兴趣投资那些运用区块链技术的私有中央银行。 这是吴忌寒在3月7日举行的2018华盛顿DC区块链峰会上的演讲内容。他是比特币挖矿巨头比特大陆公司（Bitmain）的创始人，目前比特大陆公司掌握着全球比特币矿池60%的算力。在算力方面一家独大，让吴忌寒有能力影响比特币的命运，正是在他的主导下，比特币去年进行了轰轰烈烈的“硬分叉”运动。 吴忌寒在2018华盛顿DC区块链峰会上演… 





#### Get Breaking News (all types)

In [25]:
# Read top 20 breaking news and headlines
url = ('https://newsapi.org/v2/top-headlines?'
       'country=us&'
       'apiKey=2bc85776a0c14af6b9937366ad683e2f')
breaking = requests.get(url)
#pprint.pprint(breaking.json())

## Genearal newsapi navigation

#### Get Breaking news (by query = 'bitcoin')

In [26]:
headlines = news.get_top_headlines(q='bitcoin',                         # Topic -- with quotes (") for exact match, can handle logial searching AND, OR, NOT
                                    language='en')                      # Country
#print(headlines)

#### Get everything on a topic, from given sources/domains, within a given range

In [27]:
# Everything on a topic
all_articles = news.get_everything(q='bitcoin',                            # Topic
                                    sources='bbc-news,the-verge',          # From sources
                                    domains='bbc.co.uk,techcrunch.com',    # From domains (comma separated)
                                    from_parameter='2018-01-01',           # Within date range (yyyy-mm-dd)
                                    to='2018-03-08',
                                    language='en',                         # Language
                                    sort_by='publishedAt',                 # sort by relevancy = closely related to q come first; 
                                                                               # popularity = from popular sources/publishers come first; 
                                                                               # publishedAt = newest articles come first (defaul)
                                    page_size = 100,                        # allow for the number of results to return per page. 20 default, 100 max.
                                    page = 1)                               # Use this to page through the results.
print(type(all_articles))   
#pprint.pprint(all_articles.keys())
#print(all_articles)
#pprint.pprint(all_articles)

<class 'dict'>


#### Get sources for the above, can filter by category

In [28]:
# Get sources
sources = news.get_sources(category = 'technology')     # Sources index --> https://newsapi.org/sources
                                                        # Filter source by topic: business, entertainment, general, health, science, sports, technology. 
                                                        # Default: all categories
#print(type(sources))
#print(sources.keys())
#pprint.pprint(sources)

# JSON to Pandas DataFrame and Back

All of the prior were examples of reading, and exploring JSON data files.  For simplicity, it helps if they are converted to pandas dataframe

In [29]:
# allow work with the data type
import json
# since interested in 'everything' file:
print(type(everything))    #<class 'requests.models.Response'>

<class 'requests.models.Response'>


In [30]:
### Method one: by accessing text attribute ###

# create text object using .text method from response object
txt = everything.text # <class 'str'>

# create dictionary object from text object
obj = json.loads(txt)                   # <class 'dict'>

# convert dictionary to dataframe
df1 = pd.DataFrame(obj)                 # <class 'pandas.core.frame.DataFrame'>
print(df1.keys())
df1.head(3)

Index(['articles', 'status', 'totalResults'], dtype='object')


Unnamed: 0,articles,status,totalResults
0,"{'source': {'id': None, 'name': 'Qq.com'}, 'au...",ok,44832
1,"{'source': {'id': None, 'name': 'Youbrandinc.c...",ok,44832
2,"{'source': {'id': None, 'name': 'Interia.pl'},...",ok,44832


In [31]:
### Method 2: using methods to extract values as lists ###

# create list object from response object
obj = list(everything.json().items())   #everything.json()                is class: 'dict'
                                        #everything.json().items()        is class: 'dict_items'
                                        #list(everything.json().items())  is class: 'list'

# convert set of lists into dictionary
df = pd.DataFrame(dict(obj))            #<class 'pandas.core.frame.DataFrame'>

print(df.keys())
df.head(3)

Index(['articles', 'status', 'totalResults'], dtype='object')


Unnamed: 0,articles,status,totalResults
0,"{'source': {'id': None, 'name': 'Qq.com'}, 'au...",ok,44832
1,"{'source': {'id': None, 'name': 'Youbrandinc.c...",ok,44832
2,"{'source': {'id': None, 'name': 'Interia.pl'},...",ok,44832


## Write out to JSON

In [32]:
## Ensure file is writen do desired directory
import os
# check 
os.getcwd()
# change
#os.chdir("home/Users/yourname/folder1/folder2/etc")

'/home/elias/Desktop/IEOR 135/riskex/WebScraping'

In [33]:
### Method 3: serialize into JSON and write out
# want to write out 'everything' pass a dictionary as paramenter NOT a dataframe

serializeddata = json.dumps(obj)

## to write to a textfile named "whatever":
f = open("2mo_webapi_riskex.json", "w")
f.write(serializeddata)
f.close()


### If reading a JSON file

In [34]:
2mo_webapi_riskex.json create object
data_file = "2mo_webapi_riskex.json"

# extract JSON objects as lists
with open(data_file) as f:
    data = json.load(f) 

# convert set of lists into dictionary
df = pd.DataFrame(dict(data))

SyntaxError: invalid syntax (<ipython-input-34-1b2c6901c809>, line 1)

In [167]:
df.head(3)

Unnamed: 0,articles,status,totalResults
0,"{'source': {'id': None, 'name': 'Python.org'},...",ok,44813
1,"{'source': {'id': None, 'name': 'Apkmirror.com...",ok,44813
2,"{'source': {'id': None, 'name': 'Youbrandinc.c...",ok,44813
