_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 6 - Data Loading, Storage, and File Formats
### Part 1 - Reading and Writing Data in Text Format

In [1]:
import pandas as pd
import numpy as np

In [2]:
#!type examples\ex1.csv
#!cat examples/ex1.csv

In [3]:
df = pd.read_csv('examples/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
# for pandas csv is just a table with comma as separator
pd.read_table('examples/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


headerless csv

In [5]:
#!cat examples/ex2.csv
#!type examples\ex2.csv

In [6]:
# do not use first line as header
pd.read_csv('examples/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# explicit set the header
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [8]:
# explicit set the header and index
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [9]:
#!cat examples/csv_mindex.csv
#!type examples\csv_mindex.csv

In [10]:
# hierarchical index from multiple columns - by names
df = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [11]:
# hierarchical index from multiple columns - by numbers
df = pd.read_csv('examples/csv_mindex.csv', index_col=[0, 1])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [12]:
# custom delimiters
print(list(open('examples/ex3.txt')))

# regex could help
pd.read_table('examples/ex3.txt', sep='\s+')

['            A         B         C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb  0.927272  0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382  1.100491\n']


Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [13]:
# sometimes rows should be skipped
print(list(open('examples/ex4.csv')))

pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

['# hey!\n', 'a,b,c,d,message\n', '# just wanted to make things more difficult for you\n', '# who reads CSV files with computers, anyway?\n', '1,2,3,4,hello\n', '5,6,7,8,world\n', '9,10,11,12,foo']


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [14]:
# missing values: not present or signalled by sentinels as NA or NULL
print(list(open('examples/ex5.csv')))

result = pd.read_csv('examples/ex5.csv')
print('---')
print(result)
print('---')
print(pd.isnull(result))
print('---')
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
print(result)
print('diffent sentinels for columns')
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

['something,a,b,c,d,message\n', 'one,1,2,3,4,NA\n', 'two,5,6,,8,world\n', 'three,9,10,11,12,foo']
---
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
---
   something      a      b      c      d  message
0      False  False  False  False  False     True
1      False  False  False   True  False    False
2      False  False  False  False  False    False
---
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
diffent sentinels for columns


Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


Reading Text Files in Pieces

In [15]:
# show n rows from tables
pd.options.display.max_rows = 6

pd.read_csv('examples/ex6.csv')

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
...,...,...,...,...,...
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G
9999,-0.096376,-1.012999,-0.657431,-0.573315,0


In [16]:
# read only n lines from a csv
pd.read_csv('examples/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [17]:
# read a file in chunks
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
print('chunker is a', type(chunker))

tot_keys = pd.Series([])
for piece in chunker:
    tot_keys = tot_keys.add(piece['key'].value_counts(), fill_value=0)
tot_keys = tot_keys.sort_values(ascending=False)

pd.options.display.max_rows = None  # no limits
tot_keys[:10]

chunker is a <class 'pandas.io.parsers.TextFileReader'>


E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

Writing Data to Text Format

In [18]:
# DataFrame.to_csv() write the df in csv format
import sys

data = pd.read_csv('examples/ex5.csv')
data.to_csv(sys.stdout)

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [19]:
# to_csv() w/ custom separator, sentinel for missing values; w/o index, header
data.to_csv(sys.stdout, sep='|', na_rep='NULL', index=False, header=False)

one|1|2|3.0|4|NULL
two|5|6|NULL|8|world
three|9|10|11.0|12|foo


In [20]:
# to_csv() subset of columns
data.to_csv(sys.stdout, index=False, columns=['d', 'a', 'b'])

d,a,b
4,1,2
8,5,6
12,9,10


In [21]:
# Series.to_csv()
# as index, first 7 days from Jan 1, 2000
ts = pd.Series(range(7), index=pd.date_range('1/1/2000', periods=7))
ts.to_csv(sys.stdout)

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


Working with Delimited Formats

In [25]:
# !type examples\ex7.csv
# !cat examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


In [23]:
# csv reader()
import csv

with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))

header, values = lines[0], lines[1:]
print('header', header)
print('values', values)

data = {h: v for h, v in zip(header, zip(*values))}
data

header ['a', 'b', 'c']
values [['1', '2', '3'], ['1', '2', '3']]


{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

JSON Data

In [29]:
import json

In [37]:
# json.loads() to convert a json string to a python object (dict)
json_str = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [
   {"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
   {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}
 ]
}
"""

py_obj = json.loads(json_str)
py_obj

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 30, 'name': 'Scott', 'pets': ['Zeus', 'Zuko']},
  {'age': 38, 'name': 'Katie', 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [38]:
# json.dumps() to convert a python object to a json string
json_str = json.dumps(result)
json_str

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

In [45]:
# from list of dicts to data frame
siblings = pd.DataFrame(py_obj['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [49]:
# pandas.read_json() to read json from file to series or dataframe
# !cat examples/example.json
# !type examples\example.json

data = pd.read_json('examples/example.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [51]:
# dataframe and series have to_json() method to convert it to json
print(data.to_json())
print(data.to_json(orient='records'))

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


XML and HTML: Web Scraping

In [62]:
tables = pd.read_html('examples/fdic_failed_bank_list.html')
print(len(tables), len(tables[0]))
failures = tables[0]
print(failures.loc[len(tables[0])-1]['Bank Name'])
failures.head()

1 547
Bank of Honolulu


Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [63]:
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2015      8
2016      5
2004      4
2001      4
2007      3
2003      3
2000      2
Name: Closing Date, dtype: int64

Parsing XML with lxml.objectify

In [65]:
from lxml import objectify

path = 'datasets/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()

# here root (tagged PERFORMANCE) contains many INDICATOR elements

In [68]:
data = []

# not interested in this elements in INDICATOR
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']

# for each INDICATOR element in root
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
        data.append(el_data)

# the resulting data is pushed in a dataframe
perf = pd.DataFrame(data)
perf.head()

Unnamed: 0,AGENCY_NAME,CATEGORY,DESCRIPTION,FREQUENCY,INDICATOR_NAME,INDICATOR_UNIT,MONTHLY_ACTUAL,MONTHLY_TARGET,PERIOD_MONTH,PERIOD_YEAR,YTD_ACTUAL,YTD_TARGET
0,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
1,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
2,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
3,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
4,Metro-North Railroad,Service Indicators,Percent of commuter trains that arrive at thei...,M,On-Time Performance (West of Hudson),%,96.9,95,1,2008,96.9,95
