# Data loading, storage, and file formats

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt

## JSON data

In [2]:
# Example JSON document held as a Python string; it contains a `null` value,
# a list of strings, and a list of nested objects.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

In [3]:
import json
# Decode the JSON text into Python objects (the recorded output shows the
# JSON null coming back as None).
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

In [4]:
# Serialize the decoded object back into a JSON string.
asjson = json.dumps(result)
# Build a frame from the list of sibling dicts; the explicit column list
# keeps only 'name' and 'age', so the 'pet' key is left out.
siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


### XML and HTML, Web scraping

#### Simple example

In [5]:
# Print the sample HTML file that the following cells parse.
!cat "DLSFF_Data/example.html"

<html>
<head>
</head>
<body>
This is a test page.<br>
<a href='http://www.postech.ac.kr'>POSTECH</a><br>
<a href='http://ime.postech.ac.kr'>Department of Industrial and Management Engineering</a><br>
<a href='http://aim.postech.ac.kr'>Analytics & Information Management Lab</a><br>
<br>
<br>
<table>
  <tr>
    <td>&nbsp;</td>
    <td>Knocky</td>
    <td>Flor</td>
    <td>Ella</td>
    <td>Juan</td>
  </tr>
  <tr>
    <td>Breed</td>
    <td>Jack Russell</td>
    <td>Poodle</td>
    <td>Streetdog</td>
    <td>Cocker Spaniel</td>
  </tr>
  <tr>
    <td>Age</td>
    <td>16</td>
    <td>9</td>
    <td>10</td>
    <td>5</td>
  </tr>
  <tr>
    <td>Owner</td>
    <td>Mother-in-law</td>
    <td>Me</td>
    <td>Me</td>
    <td>Sister-in-law</td>
  </tr>
  <tr>
    <td>Eating Habits</td>
    <td>Eats everyone's leftovers</td>
    <td>Nibbles at food</td>
    <td>Hearty eater</td>
    <td>Will eat till he explodes</td>
  </tr>
</table>
</body>
</html>


In [7]:
from lxml.html import parse
from urllib.request import urlopen

In [59]:
# NOTE(review): this cell's execution count (59) is out of sequence, and
# `table` is first assigned in a later cell (`table = tables[0]`). On a fresh
# top-to-bottom run this cell would raise NameError; consider moving it after
# that assignment.
table

<Element table at 0x1f5db8c9ae8>

In [11]:
# Parse the local sample page and keep its root element for the findall
# queries that follow.
parsed = parse("DLSFF_Data/example.html")
doc = parsed.getroot()

In [12]:
# Collect every <a> element found anywhere under the root.
links = doc.findall('.//a')
links

[<Element a at 0x1f5db8c96d8>,
 <Element a at 0x1f5db8c9728>,
 <Element a at 0x1f5db8c9778>]

In [13]:
# Take the first link and read its href attribute.
lnk = links[0]
lnk.get('href')

'http://www.postech.ac.kr'

In [14]:
# Text contained in the selected link element.
lnk.text_content()

'POSTECH'

In [15]:
# href attribute of every <a> element in the document.
urls = [anchor.get('href') for anchor in doc.findall('.//a')]
urls

['http://www.postech.ac.kr',
 'http://ime.postech.ac.kr',
 'http://aim.postech.ac.kr']

In [16]:
# Text content of every <a> element in the document.
lnks = [anchor.text_content() for anchor in doc.findall('.//a')]
lnks

['POSTECH',
 'Department of Industrial and Management Engineering',
 'Analytics & Information Management Lab']

In [17]:
# All <table> elements found anywhere under the root.
tables = doc.findall('.//table')

In [18]:
# Work with the first table and gather all of its <tr> rows.
table = tables[0]
rows = table.findall('.//tr')

In [19]:
def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]

In [20]:
# Cell texts of the third row of the table.
_unpack(rows[2], kind = 'td')

['Age', '16', '9', '10', '5']

[<Element tr at 0x1f5db8c9e08>,
 <Element tr at 0x1f5db8cc098>,
 <Element tr at 0x1f5db8cc138>,
 <Element tr at 0x1f5db8cc188>,
 <Element tr at 0x1f5db8cc1d8>]

In [24]:
from pandas.io.parsers import TextParser

def parse_table_data(table):
    """Convert an HTML table element into a DataFrame.

    Every <tr> under `table` is unpacked into a list of cell texts with
    `_unpack` (default kind 'td'). The list of rows is handed to pandas'
    TextParser without explicit column names, and the parsed chunk is
    returned.
    """
    cell_rows = []
    for tr in table.findall('.//tr'):
        cell_rows.append(_unpack(tr))
    parser = TextParser(cell_rows)
    return parser.get_chunk()

In [25]:
# Parse the sample table into a DataFrame and display it.
call_data = parse_table_data(table)
call_data

Unnamed: 0,Unnamed: 1,Knocky,Flor,Ella,Juan
0,Breed,Jack Russell,Poodle,Streetdog,Cocker Spaniel
1,Age,16,9,10,5
2,Owner,Mother-in-law,Me,Me,Sister-in-law
3,Eating Habits,Eats everyone's leftovers,Nibbles at food,Hearty eater,Will eat till he explodes


#### Scraping Apple’s call & put option prices

In [26]:
# Fetch the Yahoo Finance options page and parse it while the HTTP response
# is open. The `with` block closes the connection once parsing is done, and
# the timeout keeps a stalled request from hanging the notebook.
with urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options', timeout=30) as response:
    parsed = parse(response)
doc = parsed.getroot()

In [27]:
# Collect every <a> element on the fetched page; show only the first ten.
links = doc.findall('.//a')
links[:10]

[<Element a at 0x1f5db8d4db8>,
 <Element a at 0x1f5db8eddb8>,
 <Element a at 0x1f5db8edcc8>,
 <Element a at 0x1f5db8edf98>,
 <Element a at 0x1f5db8edd18>,
 <Element a at 0x1f5db8ede58>,
 <Element a at 0x1f5db8edea8>,
 <Element a at 0x1f5db8edef8>,
 <Element a at 0x1f5db8edf48>,
 <Element a at 0x1f5db902048>]

In [28]:
# Fourth link on the page and its href attribute.
lnk = links[3]
lnk.get('href')

'/quote/AAPL/key-statistics?p=AAPL'

In [29]:
# Text contained in the selected link element.
lnk.text_content()

'Statistics'

In [30]:
# Fifth link on the page and its href attribute.
lnk = links[4]
lnk.get('href')

'/quote/AAPL/history?p=AAPL'

In [31]:
# Text contained in the selected link element.
lnk.text_content()

'Historical Data'

In [32]:
# Sixth link on the page and its href attribute.
lnk = links[5]
lnk.get('href')

'/quote/AAPL/profile?p=AAPL'

In [33]:
# Text contained in the selected link element.
lnk.text_content()

'Profile'

In [34]:
# href attribute of every <a> element on the page; show only the first twenty.
urls = [anchor.get('href') for anchor in doc.findall('.//a')]
urls[:20]

['https://finance.yahoo.com/',
 'https://mail.yahoo.com/?.intl=us&.lang=en-US&.partner=none&.src=finance',
 '/quote/AAPL?p=AAPL',
 '/quote/AAPL/key-statistics?p=AAPL',
 '/quote/AAPL/history?p=AAPL',
 '/quote/AAPL/profile?p=AAPL',
 '/quote/AAPL/financials?p=AAPL',
 '/quote/AAPL/analysis?p=AAPL',
 '/quote/AAPL/options?p=AAPL',
 '/quote/AAPL/holders?p=AAPL',
 '/quote/AAPL/sustainability?p=AAPL',
 '/quote/AAPL/options?ltr=1&straddle=true',
 '/quote/AAPL200501C00115000?p=AAPL200501C00115000',
 '/quote/AAPL/options?strike=115&straddle=false',
 '/quote/AAPL200501C00120000?p=AAPL200501C00120000',
 '/quote/AAPL/options?strike=120&straddle=false',
 '/quote/AAPL200501C00125000?p=AAPL200501C00125000',
 '/quote/AAPL/options?strike=125&straddle=false',
 '/quote/AAPL200501C00130000?p=AAPL200501C00130000',
 '/quote/AAPL/options?strike=130&straddle=false']

In [35]:
# All <table> elements found on the fetched page.
tables = doc.findall('.//table')

In [36]:
# The first table is treated as the calls and the second as the puts.
calls = tables[0]
puts = tables[1]

In [37]:
# All <tr> rows of the calls table.
rows = calls.findall('.//tr')

In [38]:
# NOTE(review): this is an identical redefinition of the `_unpack` helper from
# the "Simple example" section; on a top-to-bottom run it is redundant.
def _unpack(row, kind='td'):
    """Return the text content of every `kind` element found under `row`."""
    # Search all descendants of `row` for elements with the requested tag.
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]

In [39]:
from pandas.io.parsers import TextParser

def parse_options_data(table):
    """Turn an options <table> element into a DataFrame.

    The first <tr> supplies the column names (its 'th' cells); every later
    <tr> supplies one data record (its 'td' cells). The records and names
    are handed to pandas' TextParser and the parsed chunk is returned.
    """
    all_rows = table.findall('.//tr')
    column_names = _unpack(all_rows[0], kind='th')
    records = []
    for tr in all_rows[1:]:
        records.append(_unpack(tr))
    parser = TextParser(records, names=column_names)
    return parser.get_chunk()

In [40]:
# Parse both option tables with the shared helper.
call_data = parse_options_data(calls)
put_data = parse_options_data(puts)

In [41]:
# First ten rows of the parsed calls table.
call_data[:10]

Unnamed: 0,Contract Name,Last Trade Date,Strike,Last Price,Bid,Ask,Change,% Change,Volume,Open Interest,Implied Volatility
0,AAPL200501C00115000,2020-04-28 3:55PM EDT,115.0,164.0,0.0,0.0,0.0,-,12,0,0.00%
1,AAPL200501C00120000,2020-04-29 1:57PM EDT,120.0,167.3,0.0,0.0,0.0,-,2,0,0.00%
2,AAPL200501C00125000,2020-04-29 3:55PM EDT,125.0,162.85,0.0,0.0,0.0,-,3,0,0.00%
3,AAPL200501C00130000,2020-03-27 5:13AM EDT,130.0,120.75,151.8,153.25,0.0,-,3,1,0.00%
4,AAPL200501C00135000,2020-04-30 1:13PM EDT,135.0,155.85,0.0,0.0,0.0,-,1,0,0.00%
5,AAPL200501C00150000,2020-04-06 12:11AM EDT,150.0,133.2,0.0,0.0,0.0,-,10,0,0.00%
6,AAPL200501C00160000,2020-04-06 12:11AM EDT,160.0,81.17,0.0,0.0,0.0,-,1,0,0.00%
7,AAPL200501C00165000,2020-04-14 11:58AM EDT,165.0,119.49,0.0,0.0,0.0,-,2,0,0.00%
8,AAPL200501C00175000,2020-04-27 12:11AM EDT,175.0,94.2,0.0,0.0,0.0,-,-,0,0.00%
9,AAPL200501C00180000,2020-04-29 3:41PM EDT,180.0,109.0,0.0,0.0,0.0,-,5,0,0.00%


In [42]:
# First ten rows of the parsed puts table.
put_data[:10]

Unnamed: 0,Contract Name,Last Trade Date,Strike,Last Price,Bid,Ask,Change,% Change,Volume,Open Interest,Implied Volatility
0,AAPL200501P00115000,2020-04-29 11:51AM EDT,115.0,0.01,0.0,0.0,0.0,-,1,0,50.00%
1,AAPL200501P00120000,2020-04-30 12:53PM EDT,120.0,0.01,0.0,0.0,0.0,-,34,0,50.00%
2,AAPL200501P00125000,2020-04-28 10:14AM EDT,125.0,0.01,0.0,0.0,0.0,-,1,0,50.00%
3,AAPL200501P00130000,2020-04-22 10:02AM EDT,130.0,0.01,0.0,0.0,0.0,-,1,0,50.00%
4,AAPL200501P00135000,2020-04-28 10:52AM EDT,135.0,0.01,0.0,0.0,0.0,-,1,0,50.00%
5,AAPL200501P00140000,2020-04-17 3:17PM EDT,140.0,0.01,0.0,0.0,0.0,-,12,0,50.00%
6,AAPL200501P00145000,2020-04-23 3:24PM EDT,145.0,0.04,0.0,0.0,0.0,-,2,0,50.00%
7,AAPL200501P00150000,2020-04-27 11:03AM EDT,150.0,0.01,0.0,0.0,0.0,-,25,0,50.00%
8,AAPL200501P00155000,2020-04-23 3:58PM EDT,155.0,0.01,0.0,0.0,0.0,-,17,0,50.00%
9,AAPL200501P00160000,2020-04-23 3:31PM EDT,160.0,0.01,0.0,0.0,0.0,-,17,0,50.00%


#### Parsing XML with lxml.objectify

In [43]:
# Show the first 21 lines of the XML file: the prolog, the root tag, and the
# first INDICATOR record.
!head -21 'DLSFF_Data/Performance_MNR.xml'

<?xml  version="1.0" encoding="ISO-8859-1"?>
<PERFORMANCE>
<INDICATOR>
  <INDICATOR_SEQ>28445</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>On-Time Performance (West of Hudson)</INDICATOR_NAME>
  <DESCRIPTION>Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. West of Hudson services include the Pascack Valley and Port Jervis lines. Metro-North Railroad contracts with New Jersey Transit to operate service on these lines.
</DESCRIPTION>
  <PERIOD_YEAR>2008</PERIOD_YEAR>
  <PERIOD_MONTH>1</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>95.00</YTD_TARGET>
  <YTD_ACTUAL>96.90</YTD_ACTUAL>
  <MONTHLY_TARGET>95.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL>96.90</MONTHLY_ACTUAL>
</INDICATOR>


In [44]:
from lxml import objectify

path = 'DLSFF_Data/Performance_MNR.xml'
# Open in binary mode so the parser decodes the bytes using the encoding
# declared in the XML prolog (ISO-8859-1) instead of the platform's default
# text encoding. The `with` block closes the file handle after parsing.
with open(path, 'rb') as xml_file:
    parsed = objectify.parse(xml_file)
root = parsed.getroot()

In [45]:
# One dict per <INDICATOR> record, mapping child tag -> converted Python value.
data = []

# Child tags that are left out of every record.
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
    el_data = {
        child.tag: child.pyval
        for child in elt.getchildren()
        if child.tag not in skip_fields
    }
    data.append(el_data)
data

[{'AGENCY_NAME': 'Metro-North Railroad',
  'INDICATOR_NAME': 'On-Time Performance (West of Hudson)',
  'DESCRIPTION': 'Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. West of Hudson services include the Pascack Valley and Port Jervis lines. Metro-North Railroad contracts with New Jersey Transit to operate service on these lines.\n',
  'PERIOD_YEAR': 2008,
  'PERIOD_MONTH': 1,
  'CATEGORY': 'Service Indicators',
  'FREQUENCY': 'M',
  'INDICATOR_UNIT': '%',
  'YTD_TARGET': 95.0,
  'YTD_ACTUAL': 96.9,
  'MONTHLY_TARGET': 95.0,
  'MONTHLY_ACTUAL': 96.9},
 {'AGENCY_NAME': 'Metro-North Railroad',
  'INDICATOR_NAME': 'On-Time Performance (West of Hudson)',
  'DESCRIPTION': 'Percent of commuter trains that arrive at their destinations within 5 minutes and 59 seconds of the scheduled time. West of Hudson services include the Pascack Valley and Port Jervis lines. Metro-North Railroad contracts with New Jersey Transit to operate 

In [46]:
# Tabulate the records: one row per INDICATOR dict, columns from the dict keys.
perf = DataFrame(data)
perf

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,%,95,96.9,95,96.9
1,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,%,95,96,95,95
2,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,%,95,96.3,95,96.9
3,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,%,95,96.8,95,98.3
4,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,%,95,96.6,95,95.8
...,...,...,...,...,...,...,...,...,...,...,...,...
643,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,8,Service Indicators,M,%,97,,97,
644,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,9,Service Indicators,M,%,97,,97,
645,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,10,Service Indicators,M,%,97,,97,
646,Metro-North Railroad,Escalator Availability,Percent of the time that escalators are operat...,2011,11,Service Indicators,M,%,97,,97,


### Reading Microsoft Excel Files

In [47]:
# Open the workbook so individual sheets can be parsed from it.
# NOTE(review): the recorded output for this cell is an ImportError for the
# optional 'xlrd' dependency, so an Excel reader engine must be installed
# before this cell can run.
xls_file=pd.ExcelFile('DLSFF_Data/ex1.xlsx')

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.

In [48]:
# Parse the sheet named 'Sheet1' from the workbook and display the result.
# NOTE(review): this rebinds `table`, which earlier cells use for an lxml
# <table> element.
table=xls_file.parse('Sheet1')
table

NameError: name 'xls_file' is not defined

### Interacting with HTML and Web APIs

In [49]:
import requests
import json

In [50]:
url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'
# requests has no default timeout; set one so an unresponsive server cannot
# block the notebook indefinitely.
resp = requests.get(url, timeout=30)
resp

<Response [200]>

In [51]:
# Surface HTTP errors (for example an API rate-limit response) here, rather
# than as a confusing failure when the payload is later tabulated.
resp.raise_for_status()
# Decode the JSON body with the response's own decoder.
data = resp.json()
data

[{'id': 76811,
  'node_id': 'MDU6TGFiZWw3NjgxMQ==',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Bug',
  'name': 'Bug',
  'color': 'e10c02',
  'default': False,
  'description': None},
 {'id': 76812,
  'node_id': 'MDU6TGFiZWw3NjgxMg==',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Enhancement',
  'name': 'Enhancement',
  'color': '4E9A06',
  'default': False,
  'description': None},
 {'id': 127681,
  'node_id': 'MDU6TGFiZWwxMjc2ODE=',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Refactor',
  'name': 'Refactor',
  'color': 'FCE94F',
  'default': False,
  'description': 'Internal refactoring of code'},
 {'id': 129350,
  'node_id': 'MDU6TGFiZWwxMjkzNTA=',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Build',
  'name': 'Build',
  'color': '75507B',
  'default': False,
  'description': 'Library building on various platforms'},
 {'id': 134699,
  'node_id': 'MDU6TGFiZWwxMzQ2OTk=',
  'url': 'https://api.github.com/repos

In [52]:
# Tabulate the decoded label records: one row per label dict.
issue_labels = DataFrame(data)
issue_labels

Unnamed: 0,id,node_id,url,name,color,default,description
0,76811,MDU6TGFiZWw3NjgxMQ==,https://api.github.com/repos/pandas-dev/pandas...,Bug,e10c02,False,
1,76812,MDU6TGFiZWw3NjgxMg==,https://api.github.com/repos/pandas-dev/pandas...,Enhancement,4E9A06,False,
2,127681,MDU6TGFiZWwxMjc2ODE=,https://api.github.com/repos/pandas-dev/pandas...,Refactor,FCE94F,False,Internal refactoring of code
3,129350,MDU6TGFiZWwxMjkzNTA=,https://api.github.com/repos/pandas-dev/pandas...,Build,75507B,False,Library building on various platforms
4,134699,MDU6TGFiZWwxMzQ2OTk=,https://api.github.com/repos/pandas-dev/pandas...,Docs,3465A4,False,
5,211840,MDU6TGFiZWwyMTE4NDA=,https://api.github.com/repos/pandas-dev/pandas...,Timeseries,AFEEEE,False,
6,233160,MDU6TGFiZWwyMzMxNjA=,https://api.github.com/repos/pandas-dev/pandas...,Groupby,729FCF,False,
7,2301354,MDU6TGFiZWwyMzAxMzU0,https://api.github.com/repos/pandas-dev/pandas...,IO Data,06909A,False,IO issues that don't fit into a more specific ...
8,2413328,MDU6TGFiZWwyNDEzMzI4,https://api.github.com/repos/pandas-dev/pandas...,Visualization,8AE234,False,
9,2822098,MDU6TGFiZWwyODIyMDk4,https://api.github.com/repos/pandas-dev/pandas...,Indexing,0b02e1,False,"Related to indexing on series/frames, not to i..."


In [55]:
import pymongo
# Client for a MongoDB server on localhost, port 27017.
# NOTE(review): the traceback recorded at the end of the notebook shows the
# connection being refused, so a server must be running locally for the
# cells that use this client.
con = pymongo.MongoClient('localhost', port=27017)

In [56]:
# Handle to the `labels` collection of the database named `db`.
labels = con.db.labels

In [57]:
import requests, json
url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'
# Set a timeout so an unresponsive server cannot block the notebook.
data = json.loads(requests.get(url, timeout=30).text)

# Store each label document in the collection. Collection.save() was
# deprecated in PyMongo 3 and removed in PyMongo 4; for documents without an
# `_id` (as returned by the API) insert_one() is the equivalent call.
for label in data:
    labels.insert_one(label)

  


ServerSelectionTimeoutError: localhost:27017: [WinError 10061] 대상 컴퓨터에서 연결을 거부했으므로 연결하지 못했습니다