In [1]:
# http://pandas.pydata.org/pandas-docs/stable/dsintro.html
import numpy as np
import pandas as pd

## Series
Series is a one-dimensional **labeled array** capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The **axis labels** are collectively referred to as the **index**.

### from ndarray

In [3]:
# Build a labelled Series from a 1-D ndarray of five random draws; the index
# list supplies one label per element.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
# Show the Series first, then its Index object, each on its own line.
print(s, s.index, sep='\n')

a   -0.920897
b   -1.552979
c    0.141887
d    1.541013
e   -0.915602
dtype: float64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


In [4]:
# No index is passed here, so the Series gets the default integer labels 0..4.
pd.Series(data=np.random.randn(5))

0    0.386815
1   -0.700178
2   -0.266853
3    1.843512
4   -0.552224
dtype: float64

### from dict

In [5]:
# Dict keys become the index labels; dict values become the data.
d = dict(a=0.0, b=1.0, c=2.0)
pd.Series(data=d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [6]:
# An explicit index selects and orders the dict entries by label; 'd' is not a
# key of the dict, so its value comes out as NaN.
pd.Series(data=d, index=list('bcda'))

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [8]:
# A plain list of ints gives an integer Series with the default 0..n-1 index.
pd.Series(data=[1, 2, 3])

0    1
1    2
2    3
dtype: int64

### From scalar value
If data is a scalar value, an index must be provided. The value will be repeated to match the length of the index.

In [9]:
# The scalar 5.0 is broadcast to every label in the index.
pd.Series(data=5.0, index=list('abcde'))

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

### Series is ndarray-like
Series acts very similarly to an ndarray, and is a valid argument to most NumPy functions. However, **things like slicing also slice the index**.

In [19]:
# Rebind s to a small integer Series with string labels for the cells below.
s = pd.Series([0, 1, 2, 3, 4], index=['a', 'b', 'c', 'd', 'e'])
# Positional access goes through .iloc: with a non-integer index, a bare s[1]
# relies on a deprecated fallback from label lookup to position lookup.
print(type(s.iloc[1]))  # a single element comes back as a NumPy scalar
print(type(s[:3]))      # a slice comes back as a Series (its index is sliced too)

<class 'numpy.int64'>
<class 'pandas.core.series.Series'>


In [20]:
# Boolean-mask indexing, as with an ndarray: keep only the elements strictly
# greater than the median.
s[s.gt(s.median())]

d    3
e    4
dtype: int64

In [21]:
# Select by position, in the order given (positions 4, 3, 1 -> labels e, d, b).
# .iloc makes the positional intent explicit; a bare s[[4, 3, 1]] on this
# string-labelled Series depends on a deprecated positional fallback.
s.iloc[[4, 3, 1]]

e    4
d    3
b    1
dtype: int64

In [22]:
# A Series can be passed straight to a NumPy ufunc; the result is again a
# Series carrying the same a..e labels.
np.square(s)

a     0
b     1
c     4
d     9
e    16
dtype: int64

### Series is dict-like
A Series is like a fixed-size dict in that you can **get and set values by index label**

In [23]:
# Dict-style lookup by index label; 'a' labels the first element (value 0).
s['a']

0

In [26]:
# The label 'a' is also reachable as an attribute; the element is a NumPy
# integer scalar, not a Python int.
type(s.a)

numpy.int64

In [27]:
# Like a dict, `in` tests membership among the index labels, not the values.
'a' in s

True

### Vectorized operations and label alignment with Series

In [28]:
# Elementwise addition; no explicit loop over the values is needed.
s + s

a    0
b    2
c    4
d    6
e    8
dtype: int64

In [29]:
# The scalar is applied to every element; labels are preserved.
s * 2

a    0
b    2
c    4
d    6
e    8
dtype: int64

In [30]:
# NumPy ufuncs are vectorized over a Series as well (same call as shown in the
# ndarray-like section).
np.square(s)

a     0
b     1
c     4
d     9
e    16
dtype: int64

In [33]:
# The operands are aligned by label, not by position. 'a' is missing from
# s[1:] and 'e' is missing from s[:-1], so those two labels come out as NaN,
# and the result is float64 because NaN cannot be stored in an int64 Series.
s[1:] + s[:-1]

a    NaN
b    2.0
c    4.0
d    6.0
e    NaN
dtype: float64

In [36]:
# No name was given when s was constructed, so this evaluates to None and the
# cell displays nothing.
s.name

## DataFrame 
**DataFrame** is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a **spreadsheet** or **SQL table**, or **a dict of Series** objects.

### From dict of 1d array like objects
For example, a dict of 1-D ndarrays, lists, dicts, or Series.

In [39]:
# Each dict entry becomes one column. The two Series have different indexes,
# so the frame's index is their union and 'one' has no value (NaN) at 'd'.
d = {
    'one': pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    'two': pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [40]:
# An explicit index picks out rows by label and fixes their order.
pd.DataFrame(data=d, index=list('dba'))

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [41]:
# `columns` selects which dict keys to use; 'three' is not a key of d, so that
# column is entirely NaN.
pd.DataFrame(data=d, index=list('dba'), columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [42]:
# The row labels of df, as an Index object.
df.index # row labels

Index(['a', 'b', 'c', 'd'], dtype='object')

In [43]:
# The column labels of df, as an Index object.
df.columns # column labels

Index(['one', 'two'], dtype='object')

In [44]:
# From a dict of equal-length lists: each key becomes a column and, with no
# index given, rows are labelled 0..3. Note that this rebinds d.
d = {
    'one': [1.0, 2.0, 3.0, 4.0],
    'two': [4.0, 3.0, 2.0, 1.0],
}
pd.DataFrame(data=d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [45]:
# Same data, with row labels supplied explicitly instead of the default 0..3.
pd.DataFrame(data=d, index=list('abcd'))

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [46]:
# df is still the frame built from the dict of Series; rebinding d afterwards
# did not change it.
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [53]:
# A slice inside [] selects rows, not columns: positions 1 and 2, i.e. the
# rows labelled 'b' and 'c'.
df[1:3]

Unnamed: 0,one,two
b,2.0,2.0
c,3.0,3.0


In [48]:
# Attribute access returns the column 'one' as a Series named after the column.
df.one

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [49]:
# A scalar key inside [] is looked up among the column labels ('one', 'two');
# it is never treated as a row position, so 0 is not found. The KeyError is the
# point of this cell, so catch and display it; that way the notebook still runs
# from top to bottom without stopping here.
try:
    df[0]
except KeyError as err:
    print('KeyError:', err)

KeyError: 0