This is a short introduction to pandas, geared mainly toward new users. More complex recipes are available in the [Cookbook](http://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html#cookbook).

### Object Creation

In [1]:
import pandas as pd
import numpy as np

# Build a Series from a plain Python list; pandas supplies the default
# integer index. The np.nan entry makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 68, 8, 1])
s

0     1.0
1     3.0
2     5.0
3     NaN
4    68.0
5     8.0
6     1.0
dtype: float64

In [2]:
# First step towards a DataFrame with a datetime index:
# pd.date_range() yields six consecutive days starting on 2019-06-01.
dates = pd.date_range(start='20190601', periods=6)
dates

DatetimeIndex(['2019-06-01', '2019-06-02', '2019-06-03', '2019-06-04',
               '2019-06-05', '2019-06-06'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# 6x4 frame of standard-normal random values, indexed by `dates` with columns A-D.
# NOTE(review): no random seed is set in the visible cells, so the values change on every run.
date_df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [4]:
# Show the random-valued frame (rows labelled by date, columns A-D).
date_df

Unnamed: 0,A,B,C,D
2019-06-01,1.283453,1.062953,0.812585,-1.484148
2019-06-02,1.880243,-0.99226,-1.486853,-1.007034
2019-06-03,-0.097054,0.313709,-0.468998,-0.319139
2019-06-04,-0.927449,-0.72303,0.027718,-0.075901
2019-06-05,0.341502,-0.817393,-0.280935,-0.54693
2019-06-06,-0.676519,0.503469,0.405988,0.581358


In [18]:
# A wider frame: a 6x6 block of standard-normal draws gives six columns (A-F),
# with the rows labelled by the `dates` index.
date_df2 = pd.DataFrame(
    data=np.random.randn(6, 6),
    index=dates,
    columns=list('ABCDEF'),
)
date_df2

Unnamed: 0,A,B,C,D,E,F
2019-06-01,0.097439,-0.034041,-2.099028,-0.637316,1.922408,0.686636
2019-06-02,1.341363,1.817084,-0.24059,-0.726681,0.540367,-1.085983
2019-06-03,0.959534,-0.288851,-1.209702,0.182261,0.202367,-0.066258
2019-06-04,1.131026,0.344924,-0.43636,0.855684,1.214946,0.479336
2019-06-05,0.903707,-1.697083,1.136747,0.934143,-1.699898,-0.658816
2019-06-06,1.291794,-0.49465,-0.314539,-1.271031,-2.528389,0.413623


In [27]:
# Build a DataFrame from a mapping of column name -> value.
# Scalars (A, B, F) are broadcast to every row; the array-likes
# (C, D, E, G) each supply four entries and fix the number of rows.
df = pd.DataFrame(dict(
    A=1,
    B=pd.Timestamp('20190601'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
    G=pd.date_range('20190601', periods=4),
))
df

Unnamed: 0,A,B,C,D,E,F,G
0,1,2019-06-01,1.0,3,test,foo,2019-06-01
1,1,2019-06-01,1.0,3,train,foo,2019-06-02
2,1,2019-06-01,1.0,3,test,foo,2019-06-03
3,1,2019-06-01,1.0,3,train,foo,2019-06-04


In [29]:
# Each column keeps its own dtype.
df.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G    datetime64[ns]
dtype: object

In [30]:
# Confirm the type of the object.
type(df)

pandas.core.frame.DataFrame

In [33]:
# Select the row whose index label is 0; the row comes back as a Series keyed by column name.
df.loc[0]

A                      1
B    2019-06-01 00:00:00
C                      1
D                      3
E                   test
F                    foo
G    2019-06-01 00:00:00
Name: 0, dtype: object

In [34]:
# Slice rows by label: .loc includes both endpoints, so this returns labels 0, 1 and 2.
df.loc[0:2]

Unnamed: 0,A,B,C,D,E,F,G
0,1,2019-06-01,1.0,3,test,foo,2019-06-01
1,1,2019-06-01,1.0,3,train,foo,2019-06-02
2,1,2019-06-01,1.0,3,test,foo,2019-06-03


### Viewing Data

In [40]:
# head() and tail() return the first/last rows (5 by default).
# df has only 4 rows, so both calls print the entire frame here.
print(df.head())
print(df.tail())

   A          B    C  D      E    F          G
0  1 2019-06-01  1.0  3   test  foo 2019-06-01
1  1 2019-06-01  1.0  3  train  foo 2019-06-02
2  1 2019-06-01  1.0  3   test  foo 2019-06-03
3  1 2019-06-01  1.0  3  train  foo 2019-06-04
   A          B    C  D      E    F          G
0  1 2019-06-01  1.0  3   test  foo 2019-06-01
1  1 2019-06-01  1.0  3  train  foo 2019-06-02
2  1 2019-06-01  1.0  3   test  foo 2019-06-03
3  1 2019-06-01  1.0  3  train  foo 2019-06-04


In [41]:
# Display the row index (the labels 0-3)
df.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [42]:
# Display the column labels
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [43]:
# (number of rows, number of columns)
df.shape

(4, 7)

In [46]:
'''
df.to_numpy() gives a NumPy representation of the underlying data.
The returned array holds only the values; the index and column labels are not included.

NumPy arrays have one dtype for the entire array.
Pandas DataFrames have one dtype per column.
'''
date_df.to_numpy()

array([[ 1.28345313,  1.06295291,  0.81258538, -1.48414782],
       [ 1.88024307, -0.9922603 , -1.48685269, -1.00703391],
       [-0.09705441,  0.3137089 , -0.46899807, -0.31913915],
       [-0.92744899, -0.72302973,  0.02771811, -0.07590131],
       [ 0.34150216, -0.81739301, -0.28093462, -0.54692973],
       [-0.67651861,  0.50346911,  0.40598827,  0.58135808]])

In [48]:
date_df  # unlike to_numpy(), displaying the DataFrame keeps the index and column labels

Unnamed: 0,A,B,C,D
2019-06-01,1.283453,1.062953,0.812585,-1.484148
2019-06-02,1.880243,-0.99226,-1.486853,-1.007034
2019-06-03,-0.097054,0.313709,-0.468998,-0.319139
2019-06-04,-0.927449,-0.72303,0.027718,-0.075901
2019-06-05,0.341502,-0.817393,-0.280935,-0.54693
2019-06-06,-0.676519,0.503469,0.405988,0.581358


In [49]:
# Summary statistics; by default only the numeric columns (here A, C and D) are summarised
df.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [50]:
# Transpose: the row labels become the columns and the column labels become the rows
df.T

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2019-06-01 00:00:00,2019-06-01 00:00:00,2019-06-01 00:00:00,2019-06-01 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo
G,2019-06-01 00:00:00,2019-06-02 00:00:00,2019-06-03 00:00:00,2019-06-04 00:00:00


In [51]:
# Sort the columns by label in descending order (axis=1 targets the column labels)
df.sort_index(axis=1, ascending=False)

Unnamed: 0,G,F,E,D,C,B,A
0,2019-06-01,foo,test,3,1.0,2019-06-01,1
1,2019-06-02,foo,train,3,1.0,2019-06-01,1
2,2019-06-03,foo,test,3,1.0,2019-06-01,1
3,2019-06-04,foo,train,3,1.0,2019-06-01,1


In [52]:
# Sort the rows by index label in descending order (axis=0 targets the row index)
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
3,1,2019-06-01,1.0,3,train,foo,2019-06-04
2,1,2019-06-01,1.0,3,test,foo,2019-06-03
1,1,2019-06-01,1.0,3,train,foo,2019-06-02
0,1,2019-06-01,1.0,3,test,foo,2019-06-01


In [54]:
# Sort rows by the values in column B.
# Every B value is identical in this frame, so the row order is unchanged.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D,E,F,G
0,1,2019-06-01,1.0,3,test,foo,2019-06-01
1,1,2019-06-01,1.0,3,train,foo,2019-06-02
2,1,2019-06-01,1.0,3,test,foo,2019-06-03
3,1,2019-06-01,1.0,3,train,foo,2019-06-04


### Selection