# Object Creation

In [4]:
# Pandas demo: show a hosted Plotly figure at the top of the notebook.
# NOTE(review): tls.embed presumably fetches the plot at this URL and renders
# it inline, which would need network access — confirm before relying on it.
import plotly.tools as tls
tls.embed('https://plot.ly/~chris/7365')

Creating a Series by passing a list of values, letting pandas create a default integer index:

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# np.nan is NumPy's "not a number" (NaN) marker, used here for a missing value.
# No index is passed, so pandas assigns the default integer index 0..5.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:

In [23]:
# Six consecutive daily timestamps starting at 2016-01-01.
# Gotcha noted by the author: the keyword is `periods`; misspelling it as
# `period` gave "ValueError: Must specify two of start, end, or periods".
dates = pd.date_range(start='20160101', periods=6)
dates

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

In [44]:
# 4x6 frame of random normal draws: rows are labelled by list('MTWT') and
# columns by the six timestamps in `dates`.
# Note that the row label 'T' appears twice in this index.
df = pd.DataFrame(
    data=np.random.randn(4, 6),
    index=list('MTWT'),
    columns=dates,
)
df

Unnamed: 0,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00
M,1.585902,-0.241186,-0.008384,1.470767,0.479194,1.73184
T,2.321499,-0.443847,-1.850088,0.614447,-0.746913,0.947189
W,0.520307,1.277056,-0.779309,1.149619,0.836865,1.165001
T,-0.093841,1.41321,1.181843,0.988841,0.255439,1.317878


In [45]:
# Switch the columns and the index: the dates now label the rows and
# 'A'..'D' label the columns. This rebinds `df` to a new 6x4 frame.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2016-01-01,0.699853,-0.825632,-0.798779,1.574983
2016-01-02,0.967239,0.329266,0.788252,0.32409
2016-01-03,1.106569,-1.186498,0.149952,0.455777
2016-01-04,-0.055588,0.022692,-0.811055,0.494865
2016-01-05,0.195484,0.137291,1.672395,-0.457097
2016-01-06,1.261299,0.796316,2.091776,-0.543367


Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [46]:
# Build a frame from a dict: each key becomes a column name and each value
# supplies that column's data over the four-row index 0..3.
# In the output recorded in this notebook the columns are listed in
# lexicographical order rather than in the order the keys are written here.
column_data = {
    'Version': 1.,
    'Date': pd.Timestamp('20160102'),
    'Ratio': pd.Series(1, index=list(range(4)), dtype='float32'),
    'Rate': np.array([3] * 4, dtype='int32'),
    'Category': pd.Categorical(['Apple', 'HP', 'Microsoft', 'IBM']),
    'Stock': 'In stock',
}
df2 = pd.DataFrame(column_data)
df2

Unnamed: 0,Category,Date,Rate,Ratio,Stock,Version
0,Apple,2016-01-02,3,1.0,In stock,1.0
1,HP,2016-01-02,3,1.0,In stock,1.0
2,Microsoft,2016-01-02,3,1.0,In stock,1.0
3,IBM,2016-01-02,3,1.0,In stock,1.0


Having specific dtypes

In [47]:
# Per-column dtypes of df2; each column keeps the dtype it was built with.
df2.dtypes

Category          category
Date        datetime64[ns]
Rate                 int32
Ratio              float32
Stock               object
Version            float64
dtype: object

If using IPython, tab completion for column names (as well as public attributes) is automatically enabled: type `df2.` and press Tab to see the available completions.

# Viewing Data

See the top & bottom rows of the frame

In [48]:
# First three rows of df.
df.head(3)

Unnamed: 0,A,B,C,D
2016-01-01,0.699853,-0.825632,-0.798779,1.574983
2016-01-02,0.967239,0.329266,0.788252,0.32409
2016-01-03,1.106569,-1.186498,0.149952,0.455777


In [49]:
# Last two rows of df.
df.tail(2)

Unnamed: 0,A,B,C,D
2016-01-05,0.195484,0.137291,1.672395,-0.457097
2016-01-06,1.261299,0.796316,2.091776,-0.543367


Display the index, columns, and the underlying numpy data

In [50]:
# Row labels of df: the daily DatetimeIndex it was built with.
df.index

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

In [51]:
# Column labels of df.
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [53]:
# The underlying data as a plain NumPy array, without row or column labels.
df.values

array([[ 0.6998529 , -0.82563242, -0.79877905,  1.57498347],
       [ 0.96723928,  0.32926592,  0.78825244,  0.32408975],
       [ 1.10656865, -1.1864981 ,  0.14995211,  0.45577722],
       [-0.05558778,  0.02269174, -0.81105532,  0.49486507],
       [ 0.195484  ,  0.13729109,  1.67239482, -0.45709735],
       [ 1.26129911,  0.79631553,  2.09177588, -0.54336717]])

In [57]:
# describe() shows a quick statistical summary of each column.
# The parentheses matter: a bare `df.describe` only references the method
# instead of calling it, so the summary table below is not produced.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.695809,-0.121094,0.515423,0.308208
std,0.524808,0.743419,1.226226,0.771033
min,-0.055588,-1.186498,-0.811055,-0.543367
25%,0.321576,-0.613551,-0.561596,-0.261801
50%,0.833546,0.079991,0.469102,0.389933
75%,1.071736,0.281272,1.451359,0.485093
max,1.261299,0.796316,2.091776,1.574983


Transposing your data

In [60]:
# Original table, shown for comparison with its transpose.
df

Unnamed: 0,A,B,C,D
2016-01-01,0.699853,-0.825632,-0.798779,1.574983
2016-01-02,0.967239,0.329266,0.788252,0.32409
2016-01-03,1.106569,-1.186498,0.149952,0.455777
2016-01-04,-0.055588,0.022692,-0.811055,0.494865
2016-01-05,0.195484,0.137291,1.672395,-0.457097
2016-01-06,1.261299,0.796316,2.091776,-0.543367


In [61]:
# After transposition: the columns A..D become rows and the dates become columns.
df.T

Unnamed: 0,2016-01-01 00:00:00,2016-01-02 00:00:00,2016-01-03 00:00:00,2016-01-04 00:00:00,2016-01-05 00:00:00,2016-01-06 00:00:00
A,0.699853,0.967239,1.106569,-0.055588,0.195484,1.261299
B,-0.825632,0.329266,-1.186498,0.022692,0.137291,0.796316
C,-0.798779,0.788252,0.149952,-0.811055,1.672395,2.091776
D,1.574983,0.32409,0.455777,0.494865,-0.457097,-0.543367


Sorting by an axis

In [63]:
# Sort along axis=1 (the column labels) in descending order: D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2016-01-01,1.574983,-0.798779,-0.825632,0.699853
2016-01-02,0.32409,0.788252,0.329266,0.967239
2016-01-03,0.455777,0.149952,-1.186498,1.106569
2016-01-04,0.494865,-0.811055,0.022692,-0.055588
2016-01-05,-0.457097,1.672395,0.137291,0.195484
2016-01-06,-0.543367,2.091776,0.796316,1.261299


Sorting by values

In [64]:
# Reorder the rows by the values in column 'B', smallest first.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2016-01-03,1.106569,-1.186498,0.149952,0.455777
2016-01-01,0.699853,-0.825632,-0.798779,1.574983
2016-01-04,-0.055588,0.022692,-0.811055,0.494865
2016-01-05,0.195484,0.137291,1.672395,-0.457097
2016-01-02,0.967239,0.329266,0.788252,0.32409
2016-01-06,1.261299,0.796316,2.091776,-0.543367


In [66]:
# Same sort on column 'B', but largest value first.
df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2016-01-06,1.261299,0.796316,2.091776,-0.543367
2016-01-02,0.967239,0.329266,0.788252,0.32409
2016-01-05,0.195484,0.137291,1.672395,-0.457097
2016-01-04,-0.055588,0.022692,-0.811055,0.494865
2016-01-01,0.699853,-0.825632,-0.798779,1.574983
2016-01-03,1.106569,-1.186498,0.149952,0.455777


# Selection

Getting
------------
Selecting a single column, which yields a Series, equivalent to df.A

In [68]:
# Selects column 'A' as a Series; it is displayed together with its date index.
df['A']

2016-01-01    0.699853
2016-01-02    0.967239
2016-01-03    1.106569
2016-01-04   -0.055588
2016-01-05    0.195484
2016-01-06    1.261299
Freq: D, Name: A, dtype: float64

Selecting via [ ], which slices the rows.

In [69]:
# A positional slice inside [] selects rows: here the first three.
df[0:3]

Unnamed: 0,A,B,C,D
2016-01-01,0.699853,-0.825632,-0.798779,1.574983
2016-01-02,0.967239,0.329266,0.788252,0.32409
2016-01-03,1.106569,-1.186498,0.149952,0.455777


In [71]:
# Date strings can slice the rows too; they must be written as quoted strings.
# Both endpoints are included in the result shown below.
df['20160102':'20160104']

Unnamed: 0,A,B,C,D
2016-01-02,0.967239,0.329266,0.788252,0.32409
2016-01-03,1.106569,-1.186498,0.149952,0.455777
2016-01-04,-0.055588,0.022692,-0.811055,0.494865


Selection by Label
-------
For getting a cross section using a label

In [73]:
# Cross-section for a single row label: dates[1] is the second date, 2016-01-02.
# The position is written as a plain 1 because a zero-prefixed literal such as
# 01 is a SyntaxError on Python 3 (Python 2 read it as octal 1).
df.loc[dates[1]]

A    0.967239
B    0.329266
C    0.788252
D    0.324090
Name: 2016-01-02 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [74]:
# All rows (:), restricted to the columns labelled 'A' and 'B'.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2016-01-01,0.699853,-0.825632
2016-01-02,0.967239,0.329266
2016-01-03,1.106569,-1.186498
2016-01-04,-0.055588,0.022692
2016-01-05,0.195484,0.137291
2016-01-06,1.261299,0.796316


Label slicing: both endpoints are included.



In [76]:
# Rows labelled 2016-01-02 through 2016-01-04 (inclusive), columns 'A' and 'B'.
df.loc['20160102':'20160104',['A','B']]

Unnamed: 0,A,B
2016-01-02,0.967239,0.329266
2016-01-03,1.106569,-1.186498
2016-01-04,-0.055588,0.022692


Reduction in the dimensions of the returned object

In [77]:
# Select the values at 2016-01-02 in columns 'A' and 'B'.
# A single row label with a list of columns gives a Series, not a DataFrame.
df.loc['20160102',['A','B']]

A    0.967239
B    0.329266
Name: 2016-01-02 00:00:00, dtype: float64

For getting a scalar value

In [79]:
# One row label and one column label together yield a single scalar.
df.loc[dates[0],'A']

0.69985290496301211

For getting fast access to a scalar (equiv to the prior method)

In [80]:
# .at looks up one scalar by row and column label; same value as .loc gives.
df.at[dates[0],'A']

0.69985290496301211

...See Pandas_Data_2