## Introduction to Pandas - DataFrame

In [2]:
import pandas as pd
import numpy as np

In [3]:
# A Series can carry its own label through the `name` attribute,
# which is echoed in the last line of its printed form.
s = pd.Series(data=np.random.randn(5), name='My Attr')
print(s)

0    0.892028
1    1.000767
2    1.070785
3   -0.643793
4   -0.227195
Name: My Attr, dtype: float64


In [4]:
s.name

'My Attr'

### DataFrame

In [5]:
# Build a frame from a dict of Series: the dict keys become the column names
# and the row index is the union of the Series indexes, so 'one' (which has
# no label 'd') gets NaN in that row.
d = dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [6]:
df['one']

a     1
b     2
c     3
d   NaN
Name: one, dtype: float64

In [7]:
df.one

a     1
b     2
c     3
d   NaN
Name: one, dtype: float64

In [138]:
# Plain `df[...]` looks the key up among the column labels. 'd' is not a
# column of this frame (it only appears as a row label in the frames shown
# above), so the lookup fails with KeyError -- see the output below.
df['d']

KeyError: 'd'

In [23]:
df['one']

a     1
b     2
c     3
d   NaN
Name: one, dtype: float64

In [29]:
df.one

a     1
b     2
c     3
d   NaN
Name: one, dtype: float64

In [24]:
4 in df

False

In [25]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4
b,2.0,2
a,1.0,1


In [30]:
# `.iloc` only accepts integer positions; handing it the label 'd' is rejected
# with the TypeError shown below. Label lookups go through `.loc` instead.
df.iloc['d']

TypeError: cannot do label indexing on <class 'pandas.core.index.Index'> with these indexers [d] of <type 'str'>

In [26]:
# `index` has to be a collection of labels: a bare string is rejected with the
# TypeError shown below. Wrap it in a list (index=['d']) to select one row.
pd.DataFrame(d, index="d")

TypeError: Index(...) must be called with a collection of some kind, 'd' was passed

### How do we get a row by its index label in a DataFrame?

In [42]:
# Calling `.loc` with parentheses selects nothing: as the output shows, it
# just hands back a _LocIndexer object. Selection needs square brackets.
df.loc('d')

<pandas.core.indexing._LocIndexer at 0x85931d0>

In [40]:
# Label-based row selection: row 'd' comes back as a Series whose index is
# the frame's column names and whose name is the row label.
df.loc['d']

one   NaN
two     4
Name: d, dtype: float64

In [35]:
s = pd.Series(np.nan, index=[49,48,47,46,45, 1, 2, 3, 4, 5])
s

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
dtype: float64

In [36]:
# `.iloc` is positional: the first three rows, whatever their labels are.
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64

In [37]:
# `.loc` slices by label and includes the end point: every row from the start
# through the one labelled 3 -- eight rows here, not three.
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [27]:
df.iloc[1].name

'b'

In [38]:
# This cell originally used `s.ix[:3]`. `.ix` tried a label lookup first and
# fell back to positions, which made its result depend on the index type; it
# was deprecated in pandas 0.20 and removed in 1.0. On this integer-labelled
# Series it resolved as a label slice (the recorded output matches `.loc`),
# so `.loc` is the direct replacement: everything up to and including label 3.
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [28]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [140]:
# row names
df.index

Index([u'a', u'b', u'c', u'd'], dtype='object')

In [141]:
# column names
df.columns

Index([u'one', u'two'], dtype='object')

In [43]:
# A dict of equal-length lists works as well; with no index supplied the rows
# get the default integer labels 0..3.
d = dict(one=[1.0, 2.0, 3.0, 4.0], two=[4.0, 3.0, 2.0, 1.0])
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
0,1,4
1,2,3
2,3,2
3,4,1


In [143]:
# Build a DataFrame from a NumPy structured array: every named field becomes
# a column.
# The text field is declared as 'U10' (fixed-width text, up to 10 characters)
# instead of the legacy 'a10' alias, which NumPy has deprecated; on Python 3
# this keeps the column holding plain strings such as 'Hello'.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'U10')])
# print() call form: the Python 2 `print "..."` statement is a SyntaxError
# on Python 3.
print("data is " + str(data))
# Fill both records in place, one tuple per row in field order (A, B, C).
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]
pd.DataFrame(data)

data is [(0, 0.0, '') (0, 0.0, '')]


Unnamed: 0,A,B,C
0,1,2,Hello
1,2,3,World


In [8]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [9]:
# Derive a new column as the element-wise product of two existing ones; a NaN
# in either operand carries through (row 'd' in the output).
df['three'] = df['one'] * df['two']
# print() call form: the bare `print df` statement is a SyntaxError on
# Python 3.
print(df)

   one  two  three
a    1    1      1
b    2    2      4
c    3    3      9
d  NaN    4    NaN


In [10]:
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1,1.0,False
b,2.0,2,4.0,False
c,3.0,3,9.0,True
d,,4,,False


In [11]:
del df['two']
df

Unnamed: 0,one,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,,,False


In [12]:
three = df.pop('three')
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [15]:
df['foo'] = 'bar'
df

Unnamed: 0,one,bar,flag,foo
a,1.0,1.0,False,bar
b,2.0,2.0,False,bar
c,3.0,3.0,True,bar
d,,,False,bar


In [16]:
# insert a column
df.insert(1, 'bar', df['one'])
df

ValueError: cannot insert bar, already exists

In [153]:
# Select the row whose index *label* is 2 (`.loc` is label-based); the row is
# returned as a Series named after that label.
df.loc[2]

one        3
bar        3
flag    True
foo      bar
Name: 2, dtype: object

Data Alignment and Arithmetic

In [3]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0.544726,1.792757,0.733547,1.553217
1,0.349357,-1.034169,-2.943986,0.6571
2,-0.961967,-0.875005,0.74474,-0.373284
3,-0.426513,1.711684,-1.028556,-0.43345
4,0.043059,-0.351244,-0.415061,-1.417764
5,0.736079,-1.680911,0.027885,0.990395
6,0.485015,-0.208122,1.120541,0.12528
7,0.638441,0.917049,1.721679,1.659859
8,-0.523147,1.255374,1.840382,0.006407
9,0.066843,0.683169,0.862425,-1.336554


In [4]:
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
df2

Unnamed: 0,A,B,C
0,-1.657966,-0.93617,0.086368
1,0.347342,1.495324,0.292153
2,0.11085,1.157296,-0.15219
3,0.369644,1.415164,-0.685517
4,0.214712,0.437963,-0.847862
5,-1.408939,-0.018786,-0.327571
6,0.237839,-0.92998,-0.000968


In [5]:
# Arithmetic between two frames aligns on both row and column labels. The
# result spans the union of the labels, and any cell present in only one
# operand (column D, rows 7-9) comes out as NaN.
df + df2

Unnamed: 0,A,B,C,D
0,-1.11324,0.856587,0.819915,
1,0.696699,0.461155,-2.651833,
2,-0.851117,0.282292,0.59255,
3,-0.056869,3.126849,-1.714073,
4,0.257771,0.086719,-1.262923,
5,-0.67286,-1.699697,-0.299686,
6,0.722854,-1.138103,1.119573,
7,,,,
8,,,,
9,,,,


In [6]:
# Subtracting a single row (a Series) matches its index against the column
# labels and applies it to every row, which is why row 0 becomes all zeros.
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-0.195369,-2.826926,-3.677533,-0.896117
2,-1.506693,-2.667761,0.011193,-1.926501
3,-0.971239,-0.081072,-1.762103,-1.986668
4,-0.501666,-2.144,-1.148608,-2.970982
5,0.191354,-3.473668,-0.705662,-0.562822
6,-0.059711,-2.000879,0.386994,-1.427938
7,0.093715,-0.875707,0.988131,0.106642
8,-1.067872,-0.537383,1.106835,-1.54681
9,-0.477883,-1.109588,0.128878,-2.889771


In [7]:
# Use dates as the row labels: eight consecutive days starting 2016-01-08.
index = pd.date_range(start='1/8/2016', periods=8)
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=index,
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
2016-01-08,1.73437,0.17364,0.022324
2016-01-09,0.970906,1.180251,1.847922
2016-01-10,0.538901,0.269247,-1.097668
2016-01-11,-1.373993,-0.246839,0.55907
2016-01-12,0.61821,-1.968536,3.042382
2016-01-13,0.855625,-0.208851,1.304321
2016-01-14,-0.049068,0.116095,0.61347
2016-01-15,1.43944,-0.493175,-0.959416


In [8]:
type(df['A'])

pandas.core.series.Series

In [10]:
# Subtract column 'A' from every column: axis=0 lines the Series up with the
# frame's row labels, so column A itself becomes all zeros.
# (Older pandas spelled this `df - df['A']`; that form is deprecated.)
df.sub(df['A'], axis=0)

Unnamed: 0,A,B,C
2016-01-08,0,-1.560729,-1.712046
2016-01-09,0,0.209344,0.877016
2016-01-10,0,-0.269654,-1.636569
2016-01-11,0,1.127154,1.933062
2016-01-12,0,-2.586746,2.424172
2016-01-13,0,-1.064476,0.448695
2016-01-14,0,0.165164,0.662538
2016-01-15,0,-1.932616,-2.398857


In [17]:
# NOTE(review): the KeyError: 'A' recorded below looks like a stale-state
# artifact rather than an axis=1 result. This cell's execution count (17)
# directly follows the cell that rebinds `df` to the single-column BoolCol
# frame (16), so `df['A']` itself could not be found. Re-run the notebook in
# order to confirm what axis=1 produces on the A/B/C frame.
# (`df - df['A']` was the older, now-deprecated spelling of the axis=0 form.)
df.sub(df['A'], axis=1)

KeyError: 'A'

In [11]:
df * 5 + 2

Unnamed: 0,A,B,C
2016-01-08,10.671848,2.868202,2.111619
2016-01-09,6.854532,7.901254,11.239609
2016-01-10,4.694506,3.346234,-3.488342
2016-01-11,-4.869963,0.765807,4.795349
2016-01-12,5.091051,-7.842678,17.211912
2016-01-13,6.278125,0.955745,8.521603
2016-01-14,1.754658,2.580476,5.067348
2016-01-15,9.197202,-0.465876,-2.797081


In [12]:
1 / df

Unnamed: 0,A,B,C
2016-01-08,0.576578,5.759029,44.795295
2016-01-09,1.029966,0.847278,0.541148
2016-01-10,1.855628,3.714066,-0.911022
2016-01-11,-0.727806,-4.05123,1.788685
2016-01-12,1.617573,-0.507992,0.32869
2016-01-13,1.168736,-4.788103,0.766683
2016-01-14,-20.379682,8.613623,1.630072
2016-01-15,0.694714,-2.027677,-1.0423


Boolean Operators

In [13]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [14]:
df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool)
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [15]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [17]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [18]:
# Unary minus on a boolean frame yields the element-wise logical NOT
# (compare the output with df1 as displayed earlier).
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


Transposing

In [19]:
df[:5]

Unnamed: 0,A,B,C
2016-01-08,1.73437,0.17364,0.022324
2016-01-09,0.970906,1.180251,1.847922
2016-01-10,0.538901,0.269247,-1.097668
2016-01-11,-1.373993,-0.246839,0.55907
2016-01-12,0.61821,-1.968536,3.042382


In [20]:
# transpose: rows and columns swap, so the dates become the column labels
df[:5].T

Unnamed: 0,2016-01-08 00:00:00,2016-01-09 00:00:00,2016-01-10 00:00:00,2016-01-11 00:00:00,2016-01-12 00:00:00
A,1.73437,0.970906,0.538901,-1.373993,0.61821
B,0.17364,1.180251,0.269247,-0.246839,-1.968536
C,0.022324,1.847922,-1.097668,0.55907,3.042382


Interoperability with Numpy Functions

In [21]:
np.exp(df)

Unnamed: 0,A,B,C
2016-01-08,5.665355,1.189628,1.022575
2016-01-09,2.640336,3.25519,6.346617
2016-01-10,1.714122,1.308978,0.333648
2016-01-11,0.253094,0.781267,1.749045
2016-01-12,1.855604,0.139661,20.955106
2016-01-13,2.352845,0.811516,3.685184
2016-01-14,0.952116,1.123103,1.846828
2016-01-15,4.218335,0.610684,0.383116


In [22]:
np.asarray(df)

array([[ 1.7343696 ,  0.17364039,  0.02232377],
       [ 0.9709063 ,  1.18025073,  1.84792185],
       [ 0.53890112,  0.26924671, -1.09766833],
       [-1.37399265, -0.24683861,  0.55906983],
       [ 0.61821017, -1.96853569,  3.04238232],
       [ 0.85562504, -0.20885097,  1.30432052],
       [-0.04906848,  0.11609517,  0.61346966],
       [ 1.4394405 , -0.49317527, -0.95941622]])

Matrix Multiplication

In [23]:
df.T.dot(df)

Unnamed: 0,A,B,C
A,9.317642,-0.179939,2.058901
B,-0.179939,5.732017,-3.965722
C,2.058901,-3.965722,17.186915


GroupBy

In [16]:
# A one-column boolean frame on a non-default integer index (10..50), used by
# the following cells to demonstrate boolean-mask row selection.
df = pd.DataFrame({'BoolCol': [True, False, False, True, True]},
                  index=[10, 20, 30, 40, 50])
# print() call form: the bare `print df` statement is a SyntaxError on
# Python 3.
print(df)

   BoolCol
10    True
20   False
30   False
40    True
50    True


In [15]:
# Indexing a frame with a boolean Series keeps only the rows where the mask
# is True. print() call form: the bare `print ...` statement is a SyntaxError
# on Python 3.
print(df[df['BoolCol']])

   BoolCol
10    True
40    True
50    True


In [19]:
# Index labels of the rows whose BoolCol is True, as a plain Python list.
df.loc[df['BoolCol']].index.tolist()

[10, 40, 50]