# Data Processing

![gif](imgs/DP001.gif)

## Import

In [1]:
import pandas as pd

## Text

In [2]:
# read_csv       — csv file, default separator is «,»
# read_table     — delimited text file, default separator is «\t»
# read_fwf       — for files with fixed-width columns
# read_clipboard — like «read_table», but reads from the clipboard

### with header

In [3]:
# Default behaviour: the first line of the file supplies the column names
# (a, b, c, d, message in the output below).
df = pd.read_csv('exs/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### w/o header

In [4]:
# ex2.csv has no header row, so with the defaults the first data row
# (1,2,3,4,hello) is consumed as the column names — one record is lost.
df = pd.read_csv('exs/ex2.csv')
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [5]:
# header=None: no line is treated as a header; every line is data and the
# columns receive integer labels (0..4 in the output below).
df = pd.read_csv('exs/ex2.csv', header=None)
df

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### custom header

In [6]:
# Provide the column labels ourselves because the file carries no header row.
df = pd.read_csv(
    'exs/ex2.csv',
    names=['a', 'b', 'c', 'd', 'M'],
)
df

Unnamed: 0,a,b,c,d,M
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### message column → index

In [7]:
# Column labels for the header-less file: a, b, c, d plus 'message'.
names = [*'abcd', 'message']

# index_col=4 picks the index by position: the fifth column ('message').
df = pd.read_csv('exs/ex2.csv', index_col=4, names=names)
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [8]:
# Same labels as before, built by concatenating two lists.
names = list('abcd') + ['message']

# index_col may also be given as a column label instead of a position.
df = pd.read_csv('exs/ex2.csv', index_col='message', names=names)
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


### read_table

In [9]:
# ex3.csv is not comma-separated, so the default «,» separator leaves every
# line in a single column named "A B C" (see the output below).
df = pd.read_csv('exs/ex3.csv')
df

Unnamed: 0,A B C
0,aaa -0.264438 -1.026059 -0.619500
1,bbb 0.927272 0.302904 -0.032399
2,ccc -0.264273 -0.386314 -0.217601
3,ddd -0.871858 -0.348382 1.100491


In [10]:
# read_table with its default separator gives the same single-column result
# here, so the fields in this file are not split by the default either.
df = pd.read_table('exs/ex3.csv')
df

Unnamed: 0,A B C
0,aaa -0.264438 -1.026059 -0.619500
1,bbb 0.927272 0.302904 -0.032399
2,ccc -0.264273 -0.386314 -0.217601
3,ddd -0.871858 -0.348382 1.100491


In [11]:
# Split fields on runs of whitespace. The separator is a regular expression,
# so it is written as a raw string: '\s' in a normal string literal is an
# invalid escape sequence and triggers a warning on modern Python.
# The header line has one field fewer than the data lines, so the first
# column (aaa, bbb, ...) ends up as the index in the output below.
df = pd.read_table('exs/ex3.csv', sep=r'\s+')
df

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


### skiprows

In [12]:
# Read with the defaults: the «#» comment lines in ex4.csv are parsed as the
# header and as ordinary data rows, which garbles the table (see output).
df = pd.read_csv('exs/ex4.csv')
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,# hey!
a,b,c,d,message
# just wanted to make things more difficult for you,,,,
# who reads CSV files with computers,anyway?,,,
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [13]:
# skiprows takes 0-based line numbers: lines 0, 2 and 3 are the «#» comment
# lines, while line 1 (a,b,c,d,message) is kept and becomes the header.
df = pd.read_csv('exs/ex4.csv', skiprows=[0,2,3])
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [14]:
# Alternative to skiprows: declare «#» as the comment character so the
# comment lines are dropped without listing their positions by hand.
df = pd.read_csv('exs/ex4.csv', comment='#')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## NA values

In [15]:
# With the defaults, the missing entries (row 0 «message», row 1 «c») are
# read as NaN; column c is shown as float (3.0, 11.0) as a result.
df = pd.read_csv('exs/ex5.csv')
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [16]:
# Boolean mask of the same shape as df: True where a value is missing.
df.isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [17]:
# na_values lists additional strings to be treated as NaN in every column.
# NOTE(review): the displayed result is identical to the default read above,
# so 'NULL' is presumably already recognised by default — confirm.
df = pd.read_csv('exs/ex5.csv', na_values=['NULL'])
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [18]:
# Per-column NaN markers: each key is a column name, each value the list of
# strings to treat as missing in that column only.
sentinels = {
    'message': ['foo', 'NA'],
    'something': ['two'],
}

df = pd.read_csv(
    'exs/ex5.csv',
    na_values=sentinels,
)
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


## Partially reading

In [19]:
# nrows=5: parse only the first five data rows instead of the whole file.
df = pd.read_csv('exs/ex6.csv', nrows=5)
df

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [20]:
# With chunksize set, read_csv returns a TextFileReader (see the repr below)
# rather than a DataFrame; rows are parsed only when chunks are requested.
# NOTE(review): the name `df` is misleading for a reader object, and the
# reader is not closed explicitly in the visible cells.
df = pd.read_csv('exs/ex6.csv', chunksize=1000)
df

<pandas.io.parsers.TextFileReader at 0x2db1718ba58>

In [21]:
# Pull the next rows from the reader; size=5 applies to this call only and
# yields 5 rows here even though the reader was created with chunksize=1000.
df.get_chunk(size=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
