# 10 Minutes to pandas
## Object Creation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range('20171122', periods=6)

In [4]:
dates

DatetimeIndex(['2017-11-22', '2017-11-23', '2017-11-24', '2017-11-25',
               '2017-11-26', '2017-11-27'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of draws from np.random.randn: one row per entry in `dates`,
# columns labelled A-D. No seed is set, so the values differ on each fresh run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2017-11-22,-0.833324,-0.527916,0.342883,0.24918
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-24,-1.380897,0.727351,0.757998,-0.164632
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-27,1.176051,0.694408,0.252426,-0.656584


In [6]:
# Build a frame from a dict of columns. The scalar entries (A, B, F) are
# repeated for each of the four rows, as the displayed table shows.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20171122'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2017-11-22,1.0,3,test,foo
1,1.0,2017-11-22,1.0,3,train,foo
2,1.0,2017-11-22,1.0,3,test,foo
3,1.0,2017-11-22,1.0,3,train,foo


In [7]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
# df2.<TAB>

## Viewing Data

In [9]:
df.head()

Unnamed: 0,A,B,C,D
2017-11-22,-0.833324,-0.527916,0.342883,0.24918
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-24,-1.380897,0.727351,0.757998,-0.164632
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-26,0.644822,0.965369,-0.822161,0.504486


In [10]:
df.tail(3)

Unnamed: 0,A,B,C,D
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-27,1.176051,0.694408,0.252426,-0.656584


In [11]:
df.index

DatetimeIndex(['2017-11-22', '2017-11-23', '2017-11-24', '2017-11-25',
               '2017-11-26', '2017-11-27'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
df.values

array([[-0.83332446, -0.52791596,  0.34288327,  0.24917962],
       [ 1.79640851,  2.10990427, -0.32607905,  0.32287554],
       [-1.38089707,  0.72735145,  0.75799765, -0.16463176],
       [-1.45877466, -0.40635416,  0.73579093,  0.20816773],
       [ 0.64482185,  0.96536928, -0.82216068,  0.50448645],
       [ 1.17605101,  0.69440818,  0.25242599, -0.65658403]])

In [14]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.009286,0.593794,0.15681,0.077249
std,1.396767,0.971921,0.621607,0.421089
min,-1.458775,-0.527916,-0.822161,-0.656584
25%,-1.244004,-0.131164,-0.181453,-0.071432
50%,-0.094251,0.71088,0.297655,0.228674
75%,1.043244,0.905865,0.637564,0.304452
max,1.796409,2.109904,0.757998,0.504486


In [15]:
df.T

Unnamed: 0,2017-11-22 00:00:00,2017-11-23 00:00:00,2017-11-24 00:00:00,2017-11-25 00:00:00,2017-11-26 00:00:00,2017-11-27 00:00:00
A,-0.833324,1.796409,-1.380897,-1.458775,0.644822,1.176051
B,-0.527916,2.109904,0.727351,-0.406354,0.965369,0.694408
C,0.342883,-0.326079,0.757998,0.735791,-0.822161,0.252426
D,0.24918,0.322876,-0.164632,0.208168,0.504486,-0.656584


In [16]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2017-11-22,0.24918,0.342883,-0.527916,-0.833324
2017-11-23,0.322876,-0.326079,2.109904,1.796409
2017-11-24,-0.164632,0.757998,0.727351,-1.380897
2017-11-25,0.208168,0.735791,-0.406354,-1.458775
2017-11-26,0.504486,-0.822161,0.965369,0.644822
2017-11-27,-0.656584,0.252426,0.694408,1.176051


In [17]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2017-11-22,-0.833324,-0.527916,0.342883,0.24918
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-27,1.176051,0.694408,0.252426,-0.656584
2017-11-24,-1.380897,0.727351,0.757998,-0.164632
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-23,1.796409,2.109904,-0.326079,0.322876


## Selection

In [18]:
df.loc[:]

Unnamed: 0,A,B,C,D
2017-11-22,-0.833324,-0.527916,0.342883,0.24918
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-24,-1.380897,0.727351,0.757998,-0.164632
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-27,1.176051,0.694408,0.252426,-0.656584


### Getting

In [19]:
df['A']

2017-11-22   -0.833324
2017-11-23    1.796409
2017-11-24   -1.380897
2017-11-25   -1.458775
2017-11-26    0.644822
2017-11-27    1.176051
Freq: D, Name: A, dtype: float64

In [20]:
df[0:3]

Unnamed: 0,A,B,C,D
2017-11-22,-0.833324,-0.527916,0.342883,0.24918
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-24,-1.380897,0.727351,0.757998,-0.164632


In [21]:
df['20171125':'20171128']

Unnamed: 0,A,B,C,D
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-27,1.176051,0.694408,0.252426,-0.656584


### Selection by Label

In [22]:
df.loc[dates[0]]

A   -0.833324
B   -0.527916
C    0.342883
D    0.249180
Name: 2017-11-22 00:00:00, dtype: float64

In [23]:
df.loc[:,['A', 'B']]

Unnamed: 0,A,B
2017-11-22,-0.833324,-0.527916
2017-11-23,1.796409,2.109904
2017-11-24,-1.380897,0.727351
2017-11-25,-1.458775,-0.406354
2017-11-26,0.644822,0.965369
2017-11-27,1.176051,0.694408


In [24]:
df.loc['20171125':'20171128',['A', 'B']]

Unnamed: 0,A,B
2017-11-25,-1.458775,-0.406354
2017-11-26,0.644822,0.965369
2017-11-27,1.176051,0.694408


In [25]:
df.loc['20171125',['A', 'B']]

A   -1.458775
B   -0.406354
Name: 2017-11-25 00:00:00, dtype: float64

In [26]:
df.loc[dates[0],['A', 'B']]

A   -0.833324
B   -0.527916
Name: 2017-11-22 00:00:00, dtype: float64

In [27]:
df.at[dates[0],'A']

-0.83332446422912831

### Selection by Position

In [28]:
df.iloc[3]

A   -1.458775
B   -0.406354
C    0.735791
D    0.208168
Name: 2017-11-25 00:00:00, dtype: float64

In [29]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2017-11-25,-1.458775,-0.406354
2017-11-26,0.644822,0.965369


In [30]:
df.iloc[[1, 2, 4],[0, 2]]

Unnamed: 0,A,C
2017-11-23,1.796409,-0.326079
2017-11-24,-1.380897,0.757998
2017-11-26,0.644822,-0.822161


In [31]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-24,-1.380897,0.727351,0.757998,-0.164632


In [32]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2017-11-22,-0.527916,0.342883
2017-11-23,2.109904,-0.326079
2017-11-24,0.727351,0.757998
2017-11-25,-0.406354,0.735791
2017-11-26,0.965369,-0.822161
2017-11-27,0.694408,0.252426


In [33]:
df.iloc[1, 1]

2.1099042744964378

In [34]:
df.iat[1, 1]

2.1099042744964378

### Boolean Indexing

In [35]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-27,1.176051,0.694408,0.252426,-0.656584


In [36]:
df[df > 0]

Unnamed: 0,A,B,C,D
2017-11-22,,,0.342883,0.24918
2017-11-23,1.796409,2.109904,,0.322876
2017-11-24,,0.727351,0.757998,
2017-11-25,,,0.735791,0.208168
2017-11-26,0.644822,0.965369,,0.504486
2017-11-27,1.176051,0.694408,0.252426,


In [37]:
df2 = df.copy()

In [38]:
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2017-11-22,-0.833324,-0.527916,0.342883,0.24918,one
2017-11-23,1.796409,2.109904,-0.326079,0.322876,one
2017-11-24,-1.380897,0.727351,0.757998,-0.164632,two
2017-11-25,-1.458775,-0.406354,0.735791,0.208168,three
2017-11-26,0.644822,0.965369,-0.822161,0.504486,four
2017-11-27,1.176051,0.694408,0.252426,-0.656584,three


In [39]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2017-11-24,-1.380897,0.727351,0.757998,-0.164632,two
2017-11-26,0.644822,0.965369,-0.822161,0.504486,four


### Setting

In [40]:
# Integers 1..6 labelled with six consecutive days starting 2017-12-01.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range(start='20171201', periods=6),
)
s1

2017-12-01    1
2017-12-02    2
2017-12-03    3
2017-12-04    4
2017-12-05    5
2017-12-06    6
Freq: D, dtype: int64

In [41]:
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D
2017-11-22,0.0,-0.527916,0.342883,0.24918
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-24,-1.380897,0.727351,0.757998,-0.164632
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-27,1.176051,0.694408,0.252426,-0.656584


In [42]:
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D
2017-11-22,0.0,0.0,0.342883,0.24918
2017-11-23,1.796409,2.109904,-0.326079,0.322876
2017-11-24,-1.380897,0.727351,0.757998,-0.164632
2017-11-25,-1.458775,-0.406354,0.735791,0.208168
2017-11-26,0.644822,0.965369,-0.822161,0.504486
2017-11-27,1.176051,0.694408,0.252426,-0.656584


In [43]:
# Overwrite every row of column D with 5, supplying the values as a NumPy array
# whose length matches the frame.
# NOTE(review): the captured output below shows D as integer `5`. Newer pandas
# releases reportedly write `.loc[:, col] = ...` in place and would keep the
# float column (rendering `5.0`) — confirm against the pandas version in use.
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D
2017-11-22,0.0,0.0,0.342883,5
2017-11-23,1.796409,2.109904,-0.326079,5
2017-11-24,-1.380897,0.727351,0.757998,5
2017-11-25,-1.458775,-0.406354,0.735791,5
2017-11-26,0.644822,0.965369,-0.822161,5
2017-11-27,1.176051,0.694408,0.252426,5


In [44]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2017-11-22,0.0,0.0,-0.342883,-5
2017-11-23,-1.796409,-2.109904,-0.326079,-5
2017-11-24,-1.380897,-0.727351,-0.757998,-5
2017-11-25,-1.458775,-0.406354,-0.735791,-5
2017-11-26,-0.644822,-0.965369,-0.822161,-5
2017-11-27,-1.176051,-0.694408,-0.252426,-5


## Missing Data

In [45]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,E
2017-11-22,0.0,0.0,0.342883,5,
2017-11-23,1.796409,2.109904,-0.326079,5,
2017-11-24,-1.380897,0.727351,0.757998,5,
2017-11-25,-1.458775,-0.406354,0.735791,5,


In [46]:
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2017-11-22,0.0,0.0,0.342883,5,1.0
2017-11-23,1.796409,2.109904,-0.326079,5,1.0
2017-11-24,-1.380897,0.727351,0.757998,5,
2017-11-25,-1.458775,-0.406354,0.735791,5,


In [47]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2017-11-22,0.0,0.0,0.342883,5,1.0
2017-11-23,1.796409,2.109904,-0.326079,5,1.0


In [48]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2017-11-22,0.0,0.0,0.342883,5,1.0
2017-11-23,1.796409,2.109904,-0.326079,5,1.0
2017-11-24,-1.380897,0.727351,0.757998,5,5.0
2017-11-25,-1.458775,-0.406354,0.735791,5,5.0


In [49]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2017-11-22,False,False,False,False,False
2017-11-23,False,False,False,False,False
2017-11-24,False,False,False,False,True
2017-11-25,False,False,False,False,True


## Operations

### Stats

In [50]:
df.mean()

A    0.129602
B    0.681780
C    0.156810
D    5.000000
dtype: float64

In [51]:
# Mean across the columns: one value per row (date), as the output shows.
df.mean(axis=1)

2017-11-22    1.335721
2017-11-23    2.145058
2017-11-24    1.276113
2017-11-25    0.967666
2017-11-26    1.447008
2017-11-27    1.780721
Freq: D, dtype: float64

In [52]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2017-11-22    NaN
2017-11-23    NaN
2017-11-24    1.0
2017-11-25    3.0
2017-11-26    5.0
2017-11-27    NaN
Freq: D, dtype: float64

In [53]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2017-11-22,,,,
2017-11-23,,,,
2017-11-24,-2.380897,-0.272649,-0.242002,4.0
2017-11-25,-4.458775,-3.406354,-2.264209,2.0
2017-11-26,-4.355178,-4.034631,-5.822161,0.0
2017-11-27,,,,


### Apply

In [54]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2017-11-22,0.0,0.0,0.342883,5
2017-11-23,1.796409,2.109904,0.016804,10
2017-11-24,0.415511,2.837256,0.774802,15
2017-11-25,-1.043263,2.430902,1.510593,20
2017-11-26,-0.398441,3.396271,0.688432,25
2017-11-27,0.77761,4.090679,0.940858,30


In [55]:
df.apply(lambda x: x.max() - x.min())

A    3.255183
B    2.516258
C    1.580158
D    0.000000
dtype: float64

### Histogramming

In [56]:
# Ten random integers from 0 to 6 (the upper bound 7 is exclusive).
# No seed is set, so the values differ on each fresh run.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    6
1    4
2    4
3    0
4    3
5    6
6    3
7    1
8    1
9    2
dtype: int64

In [57]:
s.value_counts()

6    2
4    2
3    2
1    2
2    1
0    1
dtype: int64

### String Methods

In [58]:
# Mixed-case strings with one missing entry; the output shows that
# .str.lower() lowercases the strings and leaves the NaN in place.
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
)
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Merge