In [1]:
# pandas intro
# source: http://pandas.pydata.org/pandas-docs/stable/10min.html

In [2]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Build a Series from a plain list; with no index given pandas assigns the
# default integer index. np.nan stands in for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [4]:
# Six consecutive daily timestamps starting 2013-01-01; this DatetimeIndex
# serves as the row index of the DataFrame `df` created in the next cell.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of standard-normal samples: rows labelled by `dates`, columns 'A'..'D'.
# The samples come from a locally seeded generator so the frame (and every
# result derived from it) is identical on each Restart & Run All, without
# touching NumPy's global random state.
# NOTE: outputs saved from an earlier unseeded run will differ until the
# notebook is re-executed.
df = pd.DataFrame(np.random.RandomState(0).randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.794155,0.057361,-0.568897,0.688838
2013-01-02,2.059673,-1.872538,-0.450496,-0.667979
2013-01-03,1.35542,0.814284,0.572758,0.433346
2013-01-04,1.882907,0.086599,-1.002096,-0.443995
2013-01-05,-1.127852,0.390151,-0.903296,0.74727
2013-01-06,-0.370765,0.716415,-0.856431,1.295398


In [7]:
# Build a frame from a mapping of column name -> column data. Scalar entries
# (A, B, F) are repeated for every row; the array-like entries (C, D, E)
# supply the four rows. Each column keeps its own dtype.
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.array([3, 3, 3, 3], dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [8]:
# Per-column dtypes of df2: each column carries its own dtype, determined by
# the kind of value it was built from.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [None]:
# Viewing data

In [10]:
# Peek at either end of the frame: head() for the first rows, tail(2) for the
# last two. Only a cell's final expression is rendered, so head() is evaluated
# here but just the tail(2) rows appear in the output.
df.head(n=5)
df.tail(n=2)

Unnamed: 0,A,B,C,D
2013-01-05,-1.127852,0.390151,-0.903296,0.74727
2013-01-06,-0.370765,0.716415,-0.856431,1.295398


In [11]:
# Inspecting the parts of df, one per cell: first the row labels.
# .index is the DatetimeIndex that was passed as `index=dates` at construction.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Column labels of df ('A'..'D', as given via `columns=` at construction).
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
# The frame's data as a plain 2-D NumPy array (labels are dropped).
df.values

array([[ 0.79415488,  0.05736115, -0.5688972 ,  0.68883842],
       [ 2.05967256, -1.87253842, -0.45049553, -0.66797868],
       [ 1.35542021,  0.81428398,  0.5727585 ,  0.43334604],
       [ 1.88290675,  0.08659897, -1.00209564, -0.44399475],
       [-1.12785187,  0.3901511 , -0.90329621,  0.74726954],
       [-0.37076508,  0.71641488, -0.85643061,  1.29539799]])

In [15]:
# stats

In [16]:
# Quick statistical summary per column: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.76559,0.032045,-0.534743,0.342146
std,1.276573,0.983825,0.581817,0.753617
min,-1.127852,-1.872538,-1.002096,-0.667979
25%,-0.079535,0.064671,-0.89158,-0.22466
50%,1.074788,0.238375,-0.712664,0.561092
75%,1.751035,0.634849,-0.480096,0.732662
max,2.059673,0.814284,0.572758,1.295398


In [17]:
# Swap rows and columns: the dates become column labels, 'A'..'D' become the index.
df.transpose()

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.794155,2.059673,1.35542,1.882907,-1.127852,-0.370765
B,0.057361,-1.872538,0.814284,0.086599,0.390151,0.716415
C,-0.568897,-0.450496,0.572758,-1.002096,-0.903296,-0.856431
D,0.688838,-0.667979,0.433346,-0.443995,0.74727,1.295398


In [28]:
# Sort along the column axis by label, descending: columns come out as D, C, B, A
# while the row order is left as it was.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.688838,-0.568897,0.057361,0.794155
2013-01-02,-0.667979,-0.450496,-1.872538,2.059673
2013-01-03,0.433346,0.572758,0.814284,1.35542
2013-01-04,-0.443995,-1.002096,0.086599,1.882907
2013-01-05,0.74727,-0.903296,0.390151,-1.127852
2013-01-06,1.295398,-0.856431,0.716415,-0.370765


In [29]:
# Sort along the row axis by index label, descending: latest date first.
df.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-0.370765,0.716415,-0.856431,1.295398
2013-01-05,-1.127852,0.390151,-0.903296,0.74727
2013-01-04,1.882907,0.086599,-1.002096,-0.443995
2013-01-03,1.35542,0.814284,0.572758,0.433346
2013-01-02,2.059673,-1.872538,-0.450496,-0.667979
2013-01-01,0.794155,0.057361,-0.568897,0.688838


In [30]:
# Reorder whole rows by the values in column 'C', largest first.
df.sort_values('C', ascending=False)

Unnamed: 0,A,B,C,D
2013-01-03,1.35542,0.814284,0.572758,0.433346
2013-01-02,2.059673,-1.872538,-0.450496,-0.667979
2013-01-01,0.794155,0.057361,-0.568897,0.688838
2013-01-06,-0.370765,0.716415,-0.856431,1.295398
2013-01-05,-1.127852,0.390151,-0.903296,0.74727
2013-01-04,1.882907,0.086599,-1.002096,-0.443995


In [None]:
# Selection
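# Note: while standard Python / NumPy expressions for selecting and setting are
# intuitive and come in handy for interactive work, for production code we recommend
# the optimized pandas data access methods: .at, .iat, .loc, .iloc and .ix
# (.ix is deprecated since pandas 0.20 and removed in 1.0 — prefer .loc / .iloc).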

In [None]:
# Progress marker: worked through the tutorial up to here (Selection section still to do).