# Pandas Documentation on DataFrame

In this notebook, you will work through the Pandas documentation on DataFrames.

## Imports

In [5]:
import numpy as np
import pandas as pd

## DataFrame

In this notebook, you are going to learn how to use `pandas.DataFrame` by typing the code from the Pandas documentation into this notebook.

* Go to the Pandas [DataFrame Documentation](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe).
* Type all of the code from that section of the documentation into this notebook and get it working.
* **To learn this API well, you must type the code rather than copy and pasting it**.
* Create a new cell in this section for each `In[]` prompt in the documentation.
* Ignore the cells in the **Grading** section below.
* No Markdown comments are needed.
* Skip the following sub-sections:
  - From structured or record array
  - Alternate Constructors
  - Assigning New Columns in Method Chains
  - Console display
  - DataFrame column attribute access and IPython completion

## Grading

In [6]:
s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])

In [7]:
s

a    0.349896
b   -1.271529
c   -1.354553
d    0.862540
e   -0.226642
dtype: float64

s.index

In [8]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [9]:
pd.Series(np.random.randn(5))

0   -0.499529
1    1.616056
2    0.429679
3    0.047368
4    0.043899
dtype: float64

In [10]:
d={'a':0.,'b':1.,'c':2.}

In [11]:
pd.Series(d)

a    0
b    1
c    2
dtype: float64

In [12]:
pd.Series(d, index=['b','c','d','a'])

b     1
c     2
d   NaN
a     0
dtype: float64

In [13]:
pd.Series(5., index=['a','b','c','d','e'])

a    5
b    5
c    5
d    5
e    5
dtype: float64

In [14]:
s[0]

0.34989625747158865

In [15]:
s[:3]

a    0.349896
b   -1.271529
c   -1.354553
dtype: float64

In [16]:
s[s>s.median()]

a    0.349896
d    0.862540
dtype: float64

In [17]:
s[[4,3,1]]

e   -0.226642
d    0.862540
b   -1.271529
dtype: float64

In [18]:
np.exp(s)

a    1.418920
b    0.280402
c    0.258063
d    2.369171
e    0.797207
dtype: float64

In [19]:
s['a']

0.34989625747158865

In [20]:
s['e'] = 12

In [21]:
s

a     0.349896
b    -1.271529
c    -1.354553
d     0.862540
e    12.000000
dtype: float64

In [22]:
'e' in s

True

In [23]:
'f' in s

False

In [24]:
d={'one' : pd.Series([1.,2.,3.,4.], index=['a','b','c','d']),
   'two' : pd.Series([1.,2.,3.,4.], index=['a','b','c','d'])}

In [25]:
df=pd.DataFrame(d)

In [26]:
df

Unnamed: 0,one,two
a,1,1
b,2,2
c,3,3
d,4,4


In [29]:
pd.DataFrame(d, index=['d','b','a'])

Unnamed: 0,one,two
d,4,4
b,2,2
a,1,1


In [31]:
pd.DataFrame(d, index=['a','b','a'],columns=['two','three'])

Unnamed: 0,two,three
a,1,
b,2,
a,1,


In [34]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [36]:
df.columns

Index(['one', 'two'], dtype='object')

In [37]:
d={'one':[1.,2.,3.,4.],
  'two' :[4.,3.,2.,1.]}

In [38]:
pd.DataFrame(d)

Unnamed: 0,one,two
0,1,4
1,2,3
2,3,2
3,4,1


In [39]:
pd.DataFrame(d, index=['a','b','c','d'])

Unnamed: 0,one,two
a,1,4
b,2,3
c,3,2
d,4,1


In [42]:
data = np.zeros((2,), dtype=[('A', 'i4'),('B','f4'),('C','a10')])

In [43]:
data[:]=[(1,2.,'Hello'),(2,3.,"World")]

In [44]:
pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2,b'Hello'
1,2,3,b'World'


In [45]:
pd.DataFrame(data,index=['first','second'])

Unnamed: 0,A,B,C
first,1,2,b'Hello'
second,2,3,b'World'


In [46]:
pd.DataFrame(data,columns=['C','A','B'])

Unnamed: 0,C,A,B
0,b'Hello',1,2
1,b'World',2,3


In [48]:
data2 = [{'a':1,'b':2},{'a':5,'b':10,'c':20}]

In [50]:
data2=[{'a':1,'b':2},{'a':5,'b':10,'c':20}]

In [51]:
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [52]:
pd.DataFrame(data2,index={'first','second'})

Unnamed: 0,a,b,c
second,1,2,
first,5,10,20.0


In [54]:
pd.DataFrame(data2,columns=['a','b'])

Unnamed: 0,a,b
0,1,2
1,5,10


In [55]:
pd.DataFrame({('a','b'):{('A','B'):1,('A','C'):2},
             ('a','a'): {('A','C'):3,('A','B'):4},
             ('a','c'): {('A','B'):5, ('A','C'):6},
             ('b','a'): {('A','C'):7, ('A','B'):8},
             ('b','b'): {('A','D'):9, ('A','B'):10}})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,c,a,b
A,B,4.0,1.0,5.0,8.0,10.0
A,C,3.0,2.0,6.0,7.0,
A,D,,,,,9.0


In [56]:
df['one']

a    1
b    2
c    3
d    4
Name: one, dtype: float64

In [57]:
df['three']=df['one'] * df['two']

In [58]:
df['flag']=df['one']>2

In [59]:
df

Unnamed: 0,one,two,three,flag
a,1,1,1,False
b,2,2,4,False
c,3,3,9,True
d,4,4,16,True


In [61]:
del df['two']

In [62]:
three = df.pop('three')

In [63]:
df

Unnamed: 0,one,flag
a,1,False
b,2,False
c,3,True
d,4,True


In [64]:
df['one_trunc'] = df['one'][:2]

In [65]:
df

Unnamed: 0,one,flag,one_trunc
a,1,False,1.0
b,2,False,2.0
c,3,True,
d,4,True,


In [66]:
df.insert(1,'bar',df['one'])

In [67]:
df

Unnamed: 0,one,bar,flag,one_trunc
a,1,1,False,1.0
b,2,2,False,2.0
c,3,3,True,
d,4,4,True,


In [54]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['A','B','C','D'])

In [56]:
df2 = pd.DataFrame(np.random.randn(7,3), columns=['A','B','C'])

In [57]:
df + df2

Unnamed: 0,A,B,C,D
0,-1.093325,-1.548602,0.457028,
1,0.814472,-1.194264,1.196727,
2,-0.27003,1.464778,-0.498705,
3,-0.091006,1.998862,2.048702,
4,0.901156,1.696842,-0.64005,
5,-0.50619,-1.191921,0.662982,
6,2.201591,0.27359,-2.309112,
7,,,,
8,,,,
9,,,,


In [59]:
df-df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,0.865818,1.902901,1.620972,-1.003181
2,0.720878,2.699835,-0.76523,-1.13154
3,0.765674,2.509923,1.214715,-1.222565
4,0.244294,1.504375,-1.612325,-0.370854
5,1.381779,2.248765,1.679489,-0.132285
6,2.599563,1.926052,-0.088687,0.306417
7,-0.273202,1.067887,-2.642898,-0.616478
8,0.002664,1.564766,0.619542,0.787381
9,0.578157,0.877959,-0.640105,0.937435


In [62]:
index = pd.date_range('1/1/2000', periods=8)

In [64]:
df = pd.DataFrame(np.random.randn(8,3),index=index,columns=list('ABC'))

In [65]:
df

Unnamed: 0,A,B,C
2000-01-01,0.480095,0.641914,-1.218566
2000-01-02,-1.371226,2.544081,-0.287181
2000-01-03,0.166508,0.008321,-0.136533
2000-01-04,-0.011782,-0.431178,-0.535045
2000-01-05,1.312651,-1.130508,-0.445269
2000-01-06,0.5235,0.184191,-0.337508
2000-01-07,-0.439535,-1.970023,-0.330147
2000-01-08,-0.03662,1.172507,1.022095


In [66]:
type(df['A'])

pandas.core.series.Series

In [67]:
df-df['A']

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00,2000-01-06 00:00:00,2000-01-07 00:00:00,2000-01-08 00:00:00,A,B,C
2000-01-01,,,,,,,,,,,
2000-01-02,,,,,,,,,,,
2000-01-03,,,,,,,,,,,
2000-01-04,,,,,,,,,,,
2000-01-05,,,,,,,,,,,
2000-01-06,,,,,,,,,,,
2000-01-07,,,,,,,,,,,
2000-01-08,,,,,,,,,,,


In [68]:
df * 5 + 2

Unnamed: 0,A,B,C
2000-01-01,4.400473,5.209572,-4.09283
2000-01-02,-4.856128,14.720406,0.564097
2000-01-03,2.832541,2.041605,1.317335
2000-01-04,1.94109,-0.155889,-0.675227
2000-01-05,8.563256,-3.652539,-0.226345
2000-01-06,4.617501,2.920956,0.312461
2000-01-07,-0.197675,-7.850117,0.349265
2000-01-08,1.816899,7.862537,7.110473


In [69]:
1/df

Unnamed: 0,A,B,C
2000-01-01,2.082923,1.55784,-0.820637
2000-01-02,-0.729275,0.393069,-3.48213
2000-01-03,6.005709,120.178087,-7.324239
2000-01-04,-84.875395,-2.319228,-1.869
2000-01-05,0.761817,-0.884558,-2.245833
2000-01-06,1.910219,5.429144,-2.962895
2000-01-07,-2.275132,-0.507608,-3.028954
2000-01-08,-27.307382,0.852873,0.978383


In [70]:
df ** 4

Unnamed: 0,A,B,C
2000-01-01,0.05312604,0.1697886,2.204937
2000-01-02,3.535377,41.89131,0.006802
2000-01-03,0.0007686754,4.794009e-09,0.000347
2000-01-04,1.92696e-08,0.03456415,0.081953
2000-01-05,2.968913,1.633407,0.039309
2000-01-06,0.07510481,0.001150998,0.012976
2000-01-07,0.03732275,15.0621,0.01188
2000-01-08,1.798373e-06,1.890003,1.091351


In [74]:
df1 = pd.DataFrame({'a':[1,0,1],'b':[0,1,1]}, dtype=bool)

In [76]:
df2 = pd.DataFrame({'a':[0,1,1],'b':[1,1,0]},dtype=bool)

In [77]:
df1&df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [78]:
df1|df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [79]:
df1^df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [80]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [81]:
df[:5].T

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00
A,0.480095,-1.371226,0.166508,-0.011782,1.312651
B,0.641914,2.544081,0.008321,-0.431178,-1.130508
C,-1.218566,-0.287181,-0.136533,-0.535045,-0.445269


In [82]:
np.exp(df)

Unnamed: 0,A,B,C
2000-01-01,1.616227,1.900115,0.295654
2000-01-02,0.253796,12.731526,0.750376
2000-01-03,1.181173,1.008356,0.872378
2000-01-04,0.988287,0.649743,0.585643
2000-01-05,3.716013,0.322869,0.640652
2000-01-06,1.687926,1.202246,0.713546
2000-01-07,0.644336,0.139454,0.718818
2000-01-08,0.964042,3.230082,2.77901


In [83]:
np.asarray(df)

array([[ 0.48009464,  0.64191439, -1.21856592],
       [-1.3712257 ,  2.54408127, -0.28718057],
       [ 0.16650825,  0.00832098, -0.13653295],
       [-0.01178198, -0.43117789, -0.53504543],
       [ 1.31265125, -1.13050783, -0.44526901],
       [ 0.5235003 ,  0.18419112, -0.33750773],
       [-0.43953495, -1.97002335, -0.33014693],
       [-0.03662013,  1.17250746,  1.02209465]])

In [84]:
df.T.dot(df)

Unnamed: 0,A,B,C
A,4.330252,-3.738446,-0.861154
B,-3.738446,13.638127,1.006763
C,-0.861154,1.006763,3.338141


In [87]:
s1 = pd.Series(np.arange(5,10))

In [88]:
s1.dot(s1)

255

In [105]:
df = pd.DataFrame({'foo1' : np.random.randn(5),
                   'foo2' : np.random.randn(5)})

In [106]:
df

Unnamed: 0,foo1,foo2
0,0.171834,-2.324822
1,0.255507,0.381363
2,0.13943,0.303211
3,-0.187266,0.525964
4,1.28125,-0.409387


In [107]:
df.foo1

0    0.171834
1    0.255507
2    0.139430
3   -0.187266
4    1.281250
Name: foo1, dtype: float64