# pandas User Guide

### 10 minutes to pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A Series is a one-dimensional labelled array. No index is passed here, so
# pandas assigns the default integer labels; the np.nan entry makes the
# integer inputs come out as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive calendar days starting on 2013-01-01; the next cell uses
# them as a row index.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Date-indexed 6x4 frame of standard-normal draws, one column per letter A-D.
# NOTE(review): the generator is not seeded, so every run yields different
# numbers; the values printed below come from one particular execution.
# NOTE(review): this cell carries execution count 18 while its neighbours are
# 3 and 5, i.e. it was re-run out of order.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.162679,-1.649026,-1.041572,-2.236693
2013-01-02,-2.172921,1.741841,1.136509,-1.05874
2013-01-03,-1.860195,0.252737,-1.375827,0.08435
2013-01-04,-1.262883,-0.330754,-0.584355,0.525323
2013-01-05,1.264089,-1.521646,-0.778519,1.099233
2013-01-06,-0.050749,-0.100869,-0.914942,-0.76508


In [5]:
# A dict of column-like objects also builds a DataFrame: the scalar entries
# ("A", "B", "F") are repeated for every row, and each column keeps its own
# dtype.
# NOTE(review): this rebinds `df`, replacing the date-indexed random frame
# created in the previous cell. Later cells whose saved output shows a date
# index were evidently run after that earlier cell had been re-executed.
df = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 2),
        "F": "foo",
    }
)
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# Each column keeps its own dtype; .dtypes lists them as a Series indexed by
# column name.
df.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
# Columns can also be read with attribute access (df.A is the same column as
# df["A"]). Only the last expression in a cell is displayed, so just df.D
# appears below; the results of the first three lines are discarded.
df.A
df.B
df.C
df.D


0    3
1    3
2    3
3    3
Name: D, dtype: int32

In [8]:
# head() returns the first n rows (n defaults to 5); this frame has only four
# rows, so all of them are shown.
df.head(n=5)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
# Last three rows of the frame.
df.tail(n=3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [10]:
# df.index holds the row labels and df.columns the column labels.
# Only the last expression in a cell is displayed, so the output below is
# df.columns; run df.index in its own cell to see the row labels.
df.index # row labels (result not displayed here)
# OR
df.columns # column labels

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [11]:
# Convert the frame to a NumPy array. The columns here have mixed dtypes, so
# the result is a single array with dtype=object.
df.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [12]:
# Quick summary statistics. Only the numeric columns (A, C and D) appear in
# the output below; B, E and F are left out of the summary.
df.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [13]:
# Transpose the frame: the column labels become the row index and the row
# labels become the columns.
df.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [15]:
# Sort along axis 1, i.e. reorder the columns by their labels, in descending
# label order (F ... A).
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2013-01-02,1.0
1,foo,train,3,1.0,2013-01-02,1.0
2,foo,test,3,1.0,2013-01-02,1.0
3,foo,train,3,1.0,2013-01-02,1.0


In [16]:
# Sort along axis 0, i.e. reorder the rows by their index labels.
# ascending=False yields descending order (3, 2, 1, 0); sort_index has no
# `descending` keyword.
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
0,1.0,2013-01-02,1.0,3,test,foo


In [27]:
# Sort the rows by the values in column A, smallest first (ascending is the
# default and is spelled out here).
# NOTE(review): the saved output below has a date index and columns A-D, so
# this cell was run against the date-indexed random frame rather than the
# mixed-dtype frame that a cell further up assigns to `df`.
df.sort_values(by="A", ascending=True)

Unnamed: 0,A,B,C,D
2013-01-02,-2.172921,1.741841,1.136509,-1.05874
2013-01-03,-1.860195,0.252737,-1.375827,0.08435
2013-01-04,-1.262883,-0.330754,-0.584355,0.525323
2013-01-01,-0.162679,-1.649026,-1.041572,-2.236693
2013-01-06,-0.050749,-0.100869,-0.914942,-0.76508
2013-01-05,1.264089,-1.521646,-0.778519,1.099233


In [32]:
# Single brackets with one label return a Series (1-D); a list of labels
# returns a DataFrame (2-D). Only the last expression is displayed, so the
# output below is the two-column frame.
df["A"] # 1D
df[["A", "B"]] # 2D

Unnamed: 0,A,B
2013-01-01,-0.162679,-1.649026
2013-01-02,-2.172921,1.741841
2013-01-03,-1.860195,0.252737
2013-01-04,-1.262883,-0.330754
2013-01-05,1.264089,-1.521646
2013-01-06,-0.050749,-0.100869


# Data Slicing
### Select rows and columns with `[]` slices, `.loc` (labels), and `.iloc` (integer positions)

In [46]:
# Slicing with [] selects rows by position; 0:6 covers all six rows here.
df[0:6]

Unnamed: 0,A,B,C,D
2013-01-01,-0.162679,-1.649026,-1.041572,-2.236693
2013-01-02,-2.172921,1.741841,1.136509,-1.05874
2013-01-03,-1.860195,0.252737,-1.375827,0.08435
2013-01-04,-1.262883,-0.330754,-0.584355,0.525323
2013-01-05,1.264089,-1.521646,-0.778519,1.099233
2013-01-06,-0.050749,-0.100869,-0.914942,-0.76508


In [54]:
# iloc selects by integer position: the slice before the comma picks rows and
# the slice after it picks columns. Open-ended slices (0:) run to the end, so
# here they are equivalent to 0:6 for rows and 0:4 for columns.
df.iloc[0:, 0:] # iloc = integer-location based indexing

Unnamed: 0,A,B,C,D
2013-01-01,-0.162679,-1.649026,-1.041572,-2.236693
2013-01-02,-2.172921,1.741841,1.136509,-1.05874
2013-01-03,-1.860195,0.252737,-1.375827,0.08435
2013-01-04,-1.262883,-0.330754,-0.584355,0.525323
2013-01-05,1.264089,-1.521646,-0.778519,1.099233
2013-01-06,-0.050749,-0.100869,-0.914942,-0.76508


In [57]:
# List the column labels; these are the names that can be used with loc.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [48]:
# loc selects by label: ":" keeps every row and the list picks columns A and B
# by name.
df.loc[:, ["A", "B"]] # label-based selection by column name

Unnamed: 0,A,B
2013-01-01,-0.162679,-1.649026
2013-01-02,-2.172921,1.741841
2013-01-03,-1.860195,0.252737
2013-01-04,-1.262883,-0.330754
2013-01-05,1.264089,-1.521646
2013-01-06,-0.050749,-0.100869


In [58]:
# A single integer selects one row by position and returns it as a Series.
df.iloc[3] # position 3 = fourth row (counting starts at 0)

A   -1.262883
B   -0.330754
C   -0.584355
D    0.525323
Name: 2013-01-04 00:00:00, dtype: float64

In [59]:
# Rows at positions 3 and 4, columns at positions 0 and 1 (the end of each
# slice is exclusive).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.262883,-0.330754
2013-01-05,1.264089,-1.521646
