_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 6 - Data Loading, Storage, and File Formats

In [1]:
import pandas as pd
import numpy as np

### Part 2 - Binary Data Formats

In [2]:
# Round-trip a DataFrame through pandas' pickle serialization.
csv_path = 'examples/ex1.csv'
pkl_path = 'examples/ex1.pkl'

# Load the example CSV, show it, and persist it in pickle format.
df = pd.read_csv(csv_path)
print(df)
df.to_pickle(pkl_path)

# Read the pickle back and show it below a separator for comparison.
# NOTE: only unpickle files you created yourself or otherwise trust.
print('---')
pickle = pd.read_pickle(pkl_path)
print(pickle)

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
---
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo


Using HDF5 Format

In [3]:
# Store a random single-column frame (and its column as a Series) in an HDF5
# file, then read the frame back two different ways and compare the first value.
df = pd.DataFrame({'a': np.random.randn(100)})

# Address the scalar with one explicit (row label, column label) lookup rather
# than chained indexing that depends on positional fallback.
print('First element in df:', df.loc[0, 'a'])

# The context manager closes the HDF5 file even if a write or read fails.
with pd.HDFStore('examples/mydata.h5') as store:
    store['obj1'] = df
    store['obj1_col'] = df['a']
    print('First element in stored df:', store['obj1'].loc[0, 'a'])

# read hdf
df2 = pd.read_hdf('examples/mydata.h5', key='obj1')
print('First element in stored df2:', df2.loc[0, 'a'])

First element in df: -1.8291186965230715
First element in stored df: -1.8291186965230715
First element in stored df2: -1.8291186965230715


Reading Microsoft Excel Files

In [4]:
# Read the same sheet twice: once through a reusable ExcelFile handle and once
# directly from the path.
# The `with` block releases the workbook's file handle as soon as parsing is done.
with pd.ExcelFile('examples/ex1.xlsx') as xlsx:
    df = pd.read_excel(xlsx, sheet_name='Sheet1')
print(df)
print('---')
print(pd.read_excel('examples/ex1.xlsx', sheet_name='Sheet1'))

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
---
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo


### Part 3 - Interacting with Web APIs

In [5]:
import requests

# Fetch the current pandas issues from the GitHub REST API and tabulate a few
# fields of each issue.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
try:
    # Without a timeout the call can block forever on a stalled connection.
    response = requests.get(url, timeout=10)
    # Error responses (e.g. rate limiting) carry a JSON object, not a list of
    # issues; fail here with a clear HTTP error instead of a confusing KeyError.
    response.raise_for_status()
    data = response.json()
    print(data[0]['title'])

    issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
    print(issues)
except requests.exceptions.RequestException as ce:
    # RequestException is the common base of ConnectionError, Timeout and
    # HTTPError, so every network-level failure is reported the same way.
    print(ce)


BUG to_clipboard passes the wrong sep to to_csv
    number                                              title  \
0    21398    BUG to_clipboard passes the wrong sep to to_csv   
1    21397  DOC: Adding missing dependency sphinxcontrib-s...   
2    21396      Missing dependency for sphinxcontrib-spelling   
3    21394                 Bugfix timedelta notimplemented eq   
4    21393  TST: adding test cases for verifying correct v...   
5    21391  Error when unpickling subclass of pandas.DataF...   
6    21390  groupby on 2 categorical columns, when one cat...   
7    21389                    Release 0.23.1 backports part I   
8    21386  regression: bar plot with multi-column categor...   
9    21385  to_clipboard undocumentedly changed from using...   
10   21384  Accepts integer/float string with units and ra...   
11   21383     Overflow in to_datetime when using nanoseconds   
12   21381  TST : Adding new test case for pivot_table() w...   
13   21380  pandas 0.23 broke unary negati

### Part 4 - Interacting with Databases

In [7]:
import sqlite3

# (Re)create the `test` table in a local SQLite database and fill it with
# three sample rows.
# A single connection is opened and kept open: the following cell reuses `con`.
con = sqlite3.connect('mydata.sqlite')

# IF EXISTS keeps the cell runnable on a fresh database, where a plain
# DROP TABLE would raise "no such table".
query = 'DROP TABLE IF EXISTS test;'
con.execute(query)
con.commit()

query = 'CREATE TABLE test(a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);'
con.execute(query)
con.commit()


data = [
    ('Atlanta', 'Georgia', 1.25, 6),
    ('Tallahassee', 'Florida', 2.6, 3),
    ('Sacramento', 'California', 1.7, 5)
]

# Placeholders let sqlite3 bind the values instead of formatting them into SQL.
stmt = 'INSERT INTO test VALUES(?, ?, ?, ?)'
con.executemany(stmt, data)
con.commit()

In [8]:
# Pull every row of `test` through the connection opened in the previous cell.
cursor = con.execute('select * from test')
rows = cursor.fetchall()
print(rows)

# The first entry of each cursor.description tuple is the column name;
# use those names as the DataFrame header.
column_names = [column_info[0] for column_info in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
print(df)

[('Atlanta', 'Georgia', 1.25, 6), ('Tallahassee', 'Florida', 2.6, 3), ('Sacramento', 'California', 1.7, 5)]
             a           b     c  d
0      Atlanta     Georgia  1.25  6
1  Tallahassee     Florida  2.60  3
2   Sacramento  California  1.70  5


Simpler job with SQLAlchemy

In [9]:
import sqlalchemy as sqla

# Let pandas run the query through a SQLAlchemy engine and build the frame
# itself; the bare last expression is what the notebook displays.
db = sqla.create_engine('sqlite:///mydata.sqlite')
pd.read_sql(sql='select * from test', con=db)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
