In [2]:
import pandas as pd
import numpy as np

## Object Creation

In [3]:
# Build a Series from a plain list; pandas supplies a default integer index.
# Mixing integers with a string leaves the values with the generic object dtype.
series = pd.Series(data=[1, 3, 5, 6, "hello", 5, 5, 5])
series

0        1
1        3
2        5
3        6
4    hello
5        5
6        5
7        5
dtype: object

## Creating a DataFrame by passing a numpy array

In [4]:
# Six consecutive calendar days starting on 2013-01-01 (daily frequency).
dates = pd.date_range(start="2013-01-01", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Draw the 6x4 block of standard-normal samples from a locally seeded generator
# so that "Restart Kernel -> Run All" rebuilds exactly the same frame each time.
# The previous unseeded np.random.randn call gave different numbers on every run.
# A private RandomState is used so the global numpy random state is not touched.
# NOTE: outputs recorded before this change came from an unseeded run; re-run
# the notebook to refresh them.
rng = np.random.RandomState(42)
dataframe = pd.DataFrame(
    rng.randn(6, 4),
    index=dates,
    columns=["First", "Second", "Third", "Fourth"],
)
dataframe

Unnamed: 0,First,Second,Third,Fourth
2013-01-01,-1.842115,-0.026576,1.35647,0.121193
2013-01-02,0.541807,-1.106518,-0.306958,-1.794347
2013-01-03,0.366455,0.095784,0.262932,2.393236
2013-01-04,1.028147,-0.275259,0.577324,-0.089614
2013-01-05,1.44377,0.239411,-1.453753,1.67602
2013-01-06,-0.676326,1.379437,0.365095,-1.334908


## Creating a DataFrame by passing a dict of objects

In [6]:
# Build a frame from a dict mapping column name -> column values.
# Scalar entries ("A", "B", "F") are repeated for every row; the array-like
# entries ("C", "D", "E") each supply four values.
dataframe2 = pd.DataFrame(
    {
        "A": 1,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "Foo",
    }
)
dataframe2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,Foo
1,1,2013-01-02,1.0,3,train,Foo
2,1,2013-01-02,1.0,3,test,Foo
3,1,2013-01-02,1.0,3,train,Foo


In [7]:
# Each column of the frame keeps its own dtype.
dataframe2.dtypes

A             int64
B    datetime64[ns]
C           float64
D             int32
E          category
F            object
dtype: object

## Viewing Data

In [8]:
# View the top rows of the frame (five rows, the default for head).
dataframe.head(n=5)

Unnamed: 0,First,Second,Third,Fourth
2013-01-01,-1.842115,-0.026576,1.35647,0.121193
2013-01-02,0.541807,-1.106518,-0.306958,-1.794347
2013-01-03,0.366455,0.095784,0.262932,2.393236
2013-01-04,1.028147,-0.275259,0.577324,-0.089614
2013-01-05,1.44377,0.239411,-1.453753,1.67602


In [9]:
# View the bottom rows of the frame (five rows, the default for tail).
dataframe.tail(n=5)

Unnamed: 0,First,Second,Third,Fourth
2013-01-02,0.541807,-1.106518,-0.306958,-1.794347
2013-01-03,0.366455,0.095784,0.262932,2.393236
2013-01-04,1.028147,-0.275259,0.577324,-0.089614
2013-01-05,1.44377,0.239411,-1.453753,1.67602
2013-01-06,-0.676326,1.379437,0.365095,-1.334908


In [10]:
# Display the index (row labels); the following cells show the columns
# and the underlying numpy data.
dataframe.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# Column labels of the frame.
dataframe.columns

Index(['First', 'Second', 'Third', 'Fourth'], dtype='object')

In [12]:
# The frame's data as a plain 2-D numpy array (row and column labels are dropped).
# NOTE(review): recent pandas documentation recommends DataFrame.to_numpy() over
# .values — confirm the installed pandas version (>= 0.24) before switching.
dataframe.values

array([[-1.84211515, -0.02657621,  1.35647009,  0.12119265],
       [ 0.54180719, -1.10651827, -0.30695799, -1.79434684],
       [ 0.36645459,  0.09578358,  0.26293236,  2.39323609],
       [ 1.02814714, -0.27525884,  0.57732375, -0.08961441],
       [ 1.44376997,  0.23941063, -1.4537528 ,  1.67601952],
       [-0.67632605,  1.37943655,  0.36509543, -1.33490821]])

In [13]:
# describe() shows a quick statistical summary of each column
# (count, mean, std, min, quartiles, max).
dataframe.describe()

Unnamed: 0,First,Second,Third,Fourth
count,6.0,6.0,6.0,6.0
mean,0.143623,0.051046,0.133518,0.16193
std,1.20801,0.806791,0.946192,1.637295
min,-1.842115,-1.106518,-1.453753,-1.794347
25%,-0.415631,-0.213088,-0.164485,-1.023585
50%,0.454131,0.034604,0.314014,0.015789
75%,0.906562,0.203504,0.524267,1.287313
max,1.44377,1.379437,1.35647,2.393236


In [14]:
# Transpose the frame: column labels become the row labels and the dates
# become the column labels.
dataframe.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
First,-1.842115,0.541807,0.366455,1.028147,1.44377,-0.676326
Second,-0.026576,-1.106518,0.095784,-0.275259,0.239411,1.379437
Third,1.35647,-0.306958,0.262932,0.577324,-1.453753,0.365095
Fourth,0.121193,-1.794347,2.393236,-0.089614,1.67602,-1.334908


In [15]:
# Sort along the column axis: reorder the columns by their labels,
# in descending order.
dataframe.sort_index(axis="columns", ascending=False)

Unnamed: 0,Third,Second,Fourth,First
2013-01-01,1.35647,-0.026576,0.121193,-1.842115
2013-01-02,-0.306958,-1.106518,-1.794347,0.541807
2013-01-03,0.262932,0.095784,2.393236,0.366455
2013-01-04,0.577324,-0.275259,-0.089614,1.028147
2013-01-05,-1.453753,0.239411,1.67602,1.44377
2013-01-06,0.365095,1.379437,-1.334908,-0.676326


In [16]:
# Order the rows by the values in the "Second" column, smallest first.
dataframe.sort_values("Second", ascending=True)

Unnamed: 0,First,Second,Third,Fourth
2013-01-02,0.541807,-1.106518,-0.306958,-1.794347
2013-01-04,1.028147,-0.275259,0.577324,-0.089614
2013-01-01,-1.842115,-0.026576,1.35647,0.121193
2013-01-03,0.366455,0.095784,0.262932,2.393236
2013-01-05,1.44377,0.239411,-1.453753,1.67602
2013-01-06,-0.676326,1.379437,0.365095,-1.334908


## Selection

In [17]:
# .at, .iat, .loc and .iloc (.ix is deprecated and was removed in pandas 1.0)

In [18]:
# Selecting a single column, which yields a Series; equivalent to dataframe.First
dataframe["First"]

2013-01-01   -1.842115
2013-01-02    0.541807
2013-01-03    0.366455
2013-01-04    1.028147
2013-01-05    1.443770
2013-01-06   -0.676326
Freq: D, Name: First, dtype: float64

In [19]:
# Slicing with [] selects rows by position; here the first three rows.
dataframe[:3]

Unnamed: 0,First,Second,Third,Fourth
2013-01-01,-1.842115,-0.026576,1.35647,0.121193
2013-01-02,0.541807,-1.106518,-0.306958,-1.794347
2013-01-03,0.366455,0.095784,0.262932,2.393236


In [20]:
# Selection by label: the row whose index label is the first date,
# returned as a Series indexed by column name.
dataframe.loc[dates[0]]

First    -1.842115
Second   -0.026576
Third     1.356470
Fourth    0.121193
Name: 2013-01-01 00:00:00, dtype: float64

In [21]:
# Selecting on multiple axes by label: every row (":") and the two named columns.
dataframe.loc[:,["First","Second"]]

Unnamed: 0,First,Second
2013-01-01,-1.842115,-0.026576
2013-01-02,0.541807,-1.106518
2013-01-03,0.366455,0.095784
2013-01-04,1.028147,-0.275259
2013-01-05,1.44377,0.239411
2013-01-06,-0.676326,1.379437


In [22]:
# Label slicing: unlike positional slices, both endpoints are included,
# so the row for 2013-01-04 is part of the result.
dataframe.loc["2013-01-01":"2013-01-04",["First","Second"]]

Unnamed: 0,First,Second
2013-01-01,-1.842115,-0.026576
2013-01-02,0.541807,-1.106518
2013-01-03,0.366455,0.095784
2013-01-04,1.028147,-0.275259


In [23]:
# A row label together with a column label selects a single scalar value.
dataframe.loc["2013-01-01","First"]

-1.8421151511996043

In [24]:
# .at gives fast access to a single scalar
# (equivalent to the .loc lookup in the previous cell).
dataframe.at["2013-01-01","First"]

-1.8421151511996043

In [25]:
# Selection by Position

In [26]:
# A single integer position selects one row (position 3, i.e. the fourth row)
# and returns it as a Series.
dataframe.iloc[3]

First     1.028147
Second   -0.275259
Third     0.577324
Fourth   -0.089614
Name: 2013-01-04 00:00:00, dtype: float64

In [27]:
# Integer slices on both axes: rows at positions 3-4 and columns at positions 0-1
# (the end of a positional slice is excluded).
dataframe.iloc[3:5,0:2]

Unnamed: 0,First,Second
2013-01-04,1.028147,-0.275259
2013-01-05,1.44377,0.239411


In [28]:
# Lists of integer positions pick specific rows (1, 2, 4) and columns (0, 2).
dataframe.iloc[[1,2,4],[0,2]]

Unnamed: 0,First,Third
2013-01-02,0.541807,-0.306958
2013-01-03,0.366455,0.262932
2013-01-05,1.44377,-1.453753


In [29]:
# Slice rows explicitly by position (rows 1 and 2) while keeping every column (":").
dataframe.iloc[1:3,:]

Unnamed: 0,First,Second,Third,Fourth
2013-01-02,0.541807,-1.106518,-0.306958,-1.794347
2013-01-03,0.366455,0.095784,0.262932,2.393236


In [30]:
# Read a single value explicitly by integer position (row 1, column 1).
dataframe.iloc[1,1]

-1.1065182724598308

In [31]:
# Boolean Indexing

In [39]:
# Boolean mask over the whole frame: values greater than zero are kept,
# all other cells come back as missing (NaN).
dataframe[dataframe > 0]

Unnamed: 0,First,Second,Third,Fourth
2013-01-01,,,1.35647,0.121193
2013-01-02,0.541807,,,
2013-01-03,0.366455,0.095784,0.262932,2.393236
2013-01-04,1.028147,,0.577324,
2013-01-05,1.44377,0.239411,,1.67602
2013-01-06,,1.379437,0.365095,


In [41]:
# Using the isin() method for filtering:
# first derive a copy of `dataframe` with an extra string column "Fifth".
# NOTE: this rebinds `dataframe2`, which earlier in the notebook held the
# dict-built frame; `dataframe` itself is left unmodified.
dataframe2 = dataframe.assign(
    Fifth=["one", "one", "two", "three", "four", "three"]
)
dataframe2

Unnamed: 0,First,Second,Third,Fourth,Fifth
2013-01-01,-1.842115,-0.026576,1.35647,0.121193,one
2013-01-02,0.541807,-1.106518,-0.306958,-1.794347,one
2013-01-03,0.366455,0.095784,0.262932,2.393236,two
2013-01-04,1.028147,-0.275259,0.577324,-0.089614,three
2013-01-05,1.44377,0.239411,-1.453753,1.67602,four
2013-01-06,-0.676326,1.379437,0.365095,-1.334908,three


In [43]:
# Keep only the rows whose "Fifth" value is one of the listed labels.
dataframe2[dataframe2["Fifth"].isin(["two","four"])]

Unnamed: 0,First,Second,Third,Fourth,Fifth
2013-01-03,0.366455,0.095784,0.262932,2.393236,two
2013-01-05,1.44377,0.239411,-1.453753,1.67602,four
