# Chapter 6: Data Loading, Storage, and File Formats

## 6.1 Reading and Writing Data in Text Format

In [1]:
# Parsing functions in pandas (a reference list, not executable calls).
# The bare string below is the cell's last expression, so the notebook
# echoes its repr as the cell output.
"""
read_csv
read_excel
read_table
read_sas
read_sql
read_stata

"""

'\nread_csv\nread_excel\nread_table\nread_sas\nread_sql\nread_stata\n\n'

In [7]:
import pandas as pd

In [14]:
# Parse the comma-delimited example file into a DataFrame. The separator is
# spelled out for illustration; the file also parses without it (see the
# following read of the same file).
df = pd.read_csv('ex1.txt', sep=',')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [15]:

# Re-read the same file, this time using the 'message' column as the row
# index instead of the default integer index. Note that this rebinds `df`.
df = pd.read_csv('ex1.txt', index_col=['message'])
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [None]:
# Example value for the skiprows argument listed below: skiprows = [0, 2, 3]
# Frequently used read_csv / read_table arguments (reference list only; the
# bare string below is a note, not a call).
"""
path
sep/delimiter
header
index_col
names
skiprows
na_values
comment

"""

#### .to_csv() function

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Build a small time series: seven dates starting 1/1/2000 (date_range's
# default daily frequency) indexed against the integers 0..6.
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
# Pass `header` explicitly. Older pandas releases wrote a Series without a
# header row and warned that the default would change; newer releases write
# one. Pinning header=False keeps the headerless file on every version and
# silences the warning.
ts.to_csv('examples/tseries.csv', header=False)

  This is separate from the ipykernel package so we can avoid doing imports until


**Note:** HDF5 is a well-regarded file format intended for storing large quantities of scientific array data.

### Reading Excel Files

In [6]:
# Open the workbook as a pd.ExcelFile object. Despite the variable name,
# `path` holds that ExcelFile handle rather than a path string.
path = pd.ExcelFile('examples/AUS.xlsx')

In [7]:
# Load the 'Sheet1' worksheet from the workbook handle into a DataFrame.
aus_covid = pd.read_excel(path, sheet_name='Sheet1')

In [8]:
# Preview the first rows of the loaded sheet (five rows in the recorded output).
aus_covid.head()

Unnamed: 0.1,Unnamed: 0,Cases,Increase,Rate,log,Unnamed: 5
0,2020.03.17,455.0,,,2.658011,910.0
1,2020.03.18,596.0,141.0,0.30989,2.775246,
2,2020.03.19,756.0,160.0,0.268456,2.878522,
3,2020.03.20,928.0,172.0,0.227513,2.967548,1856.0
4,2020.03.21,1072.0,144.0,0.155172,3.030195,


In [11]:
# Round-trip an Excel workbook: read 'Sheet1' from ex1.xlsx and write it to
# a new file, ex2.xlsx.
# sheet_name is passed by keyword: recent pandas releases deprecate positional
# arguments to to_excel other than the target, and pandas 3.0 makes them
# keyword-only.
frame = pd.read_excel('examples/ex1.xlsx', sheet_name='Sheet1')
frame.to_excel('examples/ex2.xlsx', sheet_name='Sheet1')

## 6.3 Interacting with Web APIs

In [12]:
# Many websites have public APIs providing data feeds via JSON or some other format.
import requests

In [13]:
# Fetch the issue list for the pandas repository from GitHub's REST API.
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# requests applies no timeout by default, so bound the wait explicitly to keep
# the notebook from hanging on an unresponsive connection.
resp = requests.get(url, timeout=10)
# Fail here with a clear HTTPError on a 4xx/5xx reply (e.g. rate limiting)
# rather than later, when the JSON payload is indexed.
resp.raise_for_status()

In [14]:
# Echo the Response object to check the HTTP status of the request.
resp

<Response [200]>

In [16]:
# Decode the JSON body of the response; it is indexed below as a sequence of
# issue records.
data = resp.json()
# Show the title of the first issue returned.
data[0]['title']

'BUG: DataError: No numeric types to aggregate during pd.pivot_table'

In [17]:
# Tabulate the decoded issue records, keeping only the listed fields as
# columns, then preview the first rows.
issues = pd.DataFrame(
    data,
    columns=['number', 'title', 'labels', 'state'],
)
issues.head()

Unnamed: 0,number,title,labels,state
0,33515,BUG: DataError: No numeric types to aggregate ...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
1,33514,Partial fix for Issue #28792,[],open
2,33513,BUG: Fix Categorical.min / max bug,[],open
3,33512,TST/CLN: Clean categorical min / max tests,[],open
4,33511,CLN: remove unused util.hashing functions,[],open


## 6.4 Interacting with Databases

In [None]:
# skip for now