# Pandas Documentation on DataFrame

In this notebook, you will work through the Pandas documentation on DataFrames.

## Imports

In [1]:
import numpy as np
import pandas as pd

## DataFrame

In this notebook, you are going to learn how to use `pandas.DataFrame` by typing the code from the Pandas documentation into this notebook.

* Go to the Pandas [DataFrame Documentation](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe).
* Type all of the code from that section of the documentation into this notebook and get it working.
* **To learn this API well, you must type the code rather than copy and pasting it**.
* Create a new cell in this section for each `In[]` prompt in the documentation.
* Ignore the cells in the **Grading** section below.
* No Markdown comments are needed.
* Skip the following sub-sections:
  - From structured or record array
  - Alternate Constructors
  - Assigning New Columns in Method Chains
  - Console display
  - DataFrame column attribute access and IPython completion

In [2]:
# From dict of Series or dicts
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

In [4]:
df = pd.DataFrame(d)

In [5]:
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [6]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4
b,2.0,2
a,1.0,1


In [7]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4,
b,2,
a,1,


In [8]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
df.columns

Index(['one', 'two'], dtype='object')

In [10]:
#From dict of ndarrays / lists
d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}

In [11]:
pd.DataFrame(d)

Unnamed: 0,one,two
0,1,4
1,2,3
2,3,2
3,4,1


In [12]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1,4
b,2,3
c,3,2
d,4,1


In [13]:
#From a list of dicts
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

In [14]:
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [15]:
pd.DataFrame(data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [16]:
pd.DataFrame(data2, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


In [17]:
#From a dict of tuples
pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
            ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
            ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
            ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
            ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,c,a,b
A,B,4.0,1.0,5.0,8.0,10.0
A,C,3.0,2.0,6.0,7.0,
A,D,,,,,9.0


In [18]:
# Column selection, addition, deletion
df['one']

a     1
b     2
c     3
d   NaN
Name: one, dtype: float64

In [19]:
df['three'] = df['one'] * df['two']

In [20]:
df['flag'] = df['one'] > 2

In [21]:
df

Unnamed: 0,one,two,three,flag
a,1.0,1,1.0,False
b,2.0,2,4.0,False
c,3.0,3,9.0,True
d,,4,,False


In [22]:
del df['two']

In [23]:
three = df.pop('three')

In [24]:
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [25]:
df['foo'] = 'bar'

In [26]:
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [27]:
df['one_trunc'] = df['one'][:2]

In [28]:
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [29]:
df.insert(1, 'bar', df['one'])

In [30]:
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


In [31]:
# Indexing / Selection
df.loc['b']

one              2
bar              2
flag         False
foo            bar
one_trunc        2
Name: b, dtype: object

In [34]:
df.iloc[2]

one             3
bar             3
flag         True
foo           bar
one_trunc     NaN
Name: c, dtype: object

In [36]:
# Data alignment and arithmetic
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])

In [37]:
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])

In [38]:
df + df2

Unnamed: 0,A,B,C,D
0,0.434323,-1.197688,-0.731445,
1,-2.045114,0.361957,0.934576,
2,2.283346,-1.661548,-0.365496,
3,0.689802,-1.569281,-0.394586,
4,2.174707,1.457089,-1.021049,
5,2.295179,2.627608,-2.394854,
6,0.774245,-2.303286,2.168623,
7,,,,
8,,,,
9,,,,


In [39]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-1.435274,1.653979,1.016708,0.762393
2,1.570049,0.468393,1.185907,-0.009244
3,-0.107869,-0.215297,2.064967,0.397877
4,1.448595,1.170715,0.344023,-0.674166
5,1.915559,2.412957,-1.333673,-1.081102
6,1.353978,0.151738,4.082581,0.084721
7,0.926132,-0.453094,0.155878,-1.115117
8,-1.067707,1.350968,-0.642044,0.085699
9,-0.69405,0.940699,1.945134,0.780209


In [41]:
index = pd.date_range('1/1/2000', periods=8)

In [42]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))

In [43]:
df

Unnamed: 0,A,B,C
2000-01-01,0.718672,0.372832,0.442254
2000-01-02,1.222787,-0.21229,-0.838569
2000-01-03,-0.29388,1.254302,-0.896642
2000-01-04,-0.04304,1.108561,1.344462
2000-01-05,1.070482,2.268161,-0.79466
2000-01-06,1.397188,0.93812,1.358135
2000-01-07,1.428773,-0.720463,0.652456
2000-01-08,0.929601,-0.579516,2.385747


In [44]:
type(df['A'])

pandas.core.series.Series

In [45]:
df - df['A']

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00,2000-01-06 00:00:00,2000-01-07 00:00:00,2000-01-08 00:00:00,A,B,C
2000-01-01,,,,,,,,,,,
2000-01-02,,,,,,,,,,,
2000-01-03,,,,,,,,,,,
2000-01-04,,,,,,,,,,,
2000-01-05,,,,,,,,,,,
2000-01-06,,,,,,,,,,,
2000-01-07,,,,,,,,,,,
2000-01-08,,,,,,,,,,,


In [46]:
df * 5 + 2

Unnamed: 0,A,B,C
2000-01-01,5.593358,3.864161,4.211269
2000-01-02,8.113935,0.93855,-2.192847
2000-01-03,0.530602,8.271508,-2.483208
2000-01-04,1.784799,7.542807,8.722309
2000-01-05,7.352411,13.340804,-1.9733
2000-01-06,8.985942,6.690598,8.790677
2000-01-07,9.143865,-1.602314,5.262279
2000-01-08,6.648007,-0.897581,13.928735


In [48]:
1 / df

Unnamed: 0,A,B,C
2000-01-01,1.391456,2.682172,2.261145
2000-01-02,0.817804,-4.710537,-1.192507
2000-01-03,-3.402753,0.797256,-1.115273
2000-01-04,-23.234099,0.90207,0.743792
2000-01-05,0.934159,0.440886,-1.2584
2000-01-06,0.715723,1.065962,0.736304
2000-01-07,0.699901,-1.387997,1.532671
2000-01-08,1.07573,-1.725578,0.419156


In [49]:
df ** 4

Unnamed: 0,A,B,C
2000-01-01,0.266761,0.019322,0.038255
2000-01-02,2.235647,0.002031,0.494489
2000-01-03,0.007459,2.475186,0.646361
2000-01-04,3e-06,1.510216,3.267336
2000-01-05,1.31316,26.466431,0.398773
2000-01-06,3.810833,0.774521,3.402297
2000-01-07,4.167282,0.26943,0.181219
2000-01-08,0.74677,0.112788,32.396457


In [50]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)

In [51]:
df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool)

In [52]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [53]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [54]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [55]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [56]:
# Transposing
df[:5].T

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00
A,0.718672,1.222787,-0.29388,-0.04304,1.070482
B,0.372832,-0.21229,1.254302,1.108561,2.268161
C,0.442254,-0.838569,-0.896642,1.344462,-0.79466


In [57]:
# DataFrame interoperability with NumPy functions
np.exp(df)

Unnamed: 0,A,B,C
2000-01-01,2.051706,1.451841,1.556211
2000-01-02,3.396641,0.80873,0.432329
2000-01-03,0.745366,3.505389,0.407937
2000-01-04,0.957873,3.029996,3.836121
2000-01-05,2.916785,9.661615,0.451735
2000-01-06,4.043814,2.555172,3.888935
2000-01-07,4.173575,0.486527,1.920251
2000-01-08,2.533499,0.560169,10.867177


In [58]:
np.asarray(df)

array([[ 0.71867151,  0.37283215,  0.44225389],
       [ 1.22278704, -0.21229003, -0.83856948],
       [-0.29387966,  1.25430151, -0.89664155],
       [-0.04304019,  1.10856139,  1.34446173],
       [ 1.07048211,  2.26816082, -0.79465992],
       [ 1.39718842,  0.93811966,  1.35813538],
       [ 1.42877292, -0.72046287,  0.65245586],
       [ 0.92960137, -0.57951611,  2.38574696]])

In [59]:
df.T.dot(df)

Unnamed: 0,A,B,C
A,8.103533,1.762691,3.694989
B,1.762691,9.865779,-1.672306
C,3.694989,-1.672306,12.103834


In [60]:
s1 = pd.Series(np.arange(5,10))

In [61]:
s1.dot(s1)

255

## Grading

YOUR ANSWER HERE