In [1]:
import numpy as np
import pandas as pd

## Object creation

In [3]:
a = pd.Series([2,3,4,6, np.nan, 7, 3,3,])
a

0    2.0
1    3.0
2    4.0
3    6.0
4    NaN
5    7.0
6    3.0
7    3.0
dtype: float64

In [6]:
dates = pd.date_range('20220222', periods = 4)
dates

DatetimeIndex(['2022-02-22', '2022-02-23', '2022-02-24', '2022-02-25'], dtype='datetime64[ns]', freq='D')

In [10]:
df = pd.DataFrame(np.random.randn(4,5), index = dates, columns = list('ABCDE'))
df

Unnamed: 0,A,B,C,D,E
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391
2022-02-25,-1.792777,-0.285709,-0.771742,0.234596,0.629214


In [24]:
df1 = pd.DataFrame({
    'A': 1,
    'B': np.array([3]*4),
    'C': pd.Series(1, index = list(range(4))),
    'D': pd.Timestamp(20220222),
    'E': 'foo',
    'F': pd.Categorical(['ok', 'ok', 'okay', 'good'])
})
df1

Unnamed: 0,A,B,C,D,E,F
0,1,3,1,1970-01-01 00:00:00.020220222,foo,ok
1,1,3,1,1970-01-01 00:00:00.020220222,foo,ok
2,1,3,1,1970-01-01 00:00:00.020220222,foo,okay
3,1,3,1,1970-01-01 00:00:00.020220222,foo,good


In [25]:
df1.dtypes

A             int64
B             int32
C             int64
D    datetime64[ns]
E            object
F          category
dtype: object

In [26]:
df1.head()

Unnamed: 0,A,B,C,D,E,F
0,1,3,1,1970-01-01 00:00:00.020220222,foo,ok
1,1,3,1,1970-01-01 00:00:00.020220222,foo,ok
2,1,3,1,1970-01-01 00:00:00.020220222,foo,okay
3,1,3,1,1970-01-01 00:00:00.020220222,foo,good


In [27]:
df1.tail()

Unnamed: 0,A,B,C,D,E,F
0,1,3,1,1970-01-01 00:00:00.020220222,foo,ok
1,1,3,1,1970-01-01 00:00:00.020220222,foo,ok
2,1,3,1,1970-01-01 00:00:00.020220222,foo,okay
3,1,3,1,1970-01-01 00:00:00.020220222,foo,good


In [28]:
df.index

DatetimeIndex(['2022-02-22', '2022-02-23', '2022-02-24', '2022-02-25'], dtype='datetime64[ns]', freq='D')

In [29]:
df.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [30]:
df.to_numpy()

array([[ 1.33104047,  0.97227272, -0.38347966, -0.85821273, -0.8819277 ],
       [-0.32927386, -0.61565612, -1.25753267, -0.68247406,  1.01746139],
       [ 2.58728916, -0.84684243,  1.02463791, -0.02744541, -1.03390995],
       [-1.79277744, -0.28570934, -0.77174173,  0.23459566,  0.62921362]])

In [31]:
df.describe()

Unnamed: 0,A,B,C,D,E
count,4.0,4.0,4.0,4.0,4.0
mean,0.44907,-0.193984,-0.347029,-0.333384,-0.067291
std,1.913248,0.810884,0.981868,0.520739,1.0424
min,-1.792777,-0.846842,-1.257533,-0.858213,-1.03391
25%,-0.69515,-0.673453,-0.893189,-0.726409,-0.919923
50%,0.500883,-0.450683,-0.577611,-0.35496,-0.126357
75%,1.645103,0.028786,-0.03145,0.038065,0.726276
max,2.587289,0.972273,1.024638,0.234596,1.017461


In [32]:
df.T

Unnamed: 0,2022-02-22,2022-02-23,2022-02-24,2022-02-25
A,1.33104,-0.329274,2.587289,-1.792777
B,0.972273,-0.615656,-0.846842,-0.285709
C,-0.38348,-1.257533,1.024638,-0.771742
D,-0.858213,-0.682474,-0.027445,0.234596
E,-0.881928,1.017461,-1.03391,0.629214


In [36]:
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,E,D,C,B,A
2022-02-22,-0.881928,-0.858213,-0.38348,0.972273,1.33104
2022-02-23,1.017461,-0.682474,-1.257533,-0.615656,-0.329274
2022-02-24,-1.03391,-0.027445,1.024638,-0.846842,2.587289
2022-02-25,0.629214,0.234596,-0.771742,-0.285709,-1.792777


In [41]:
df.sort_index(axis = 0, ascending = False)

Unnamed: 0,A,B,C,D,E
2022-02-25,-1.792777,-0.285709,-0.771742,0.234596,0.629214
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928


In [46]:
df.sort_values(by = 'E')

Unnamed: 0,A,B,C,D,E
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928
2022-02-25,-1.792777,-0.285709,-0.771742,0.234596,0.629214
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461


## Selection

In [50]:
print(df.head(3))
df.shape

                   A         B         C         D         E
2022-02-22  1.331040  0.972273 -0.383480 -0.858213 -0.881928
2022-02-23 -0.329274 -0.615656 -1.257533 -0.682474  1.017461
2022-02-24  2.587289 -0.846842  1.024638 -0.027445 -1.033910


(4, 5)

In [53]:
df['A']

2022-02-22    1.331040
2022-02-23   -0.329274
2022-02-24    2.587289
2022-02-25   -1.792777
Freq: D, Name: A, dtype: float64

In [56]:
df[0:2]

Unnamed: 0,A,B,C,D,E
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461


In [59]:
df.loc[dates[0:2]]

Unnamed: 0,A,B,C,D,E
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461


In [65]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2022-02-22,1.33104,0.972273
2022-02-23,-0.329274,-0.615656
2022-02-24,2.587289,-0.846842
2022-02-25,-1.792777,-0.285709


In [69]:
df.loc['20220222':'20220223', ['A', 'E']]

Unnamed: 0,A,E
2022-02-22,1.33104,-0.881928
2022-02-23,-0.329274,1.017461


In [70]:
df.loc[dates[0], 'A']

1.331040466344079

In [72]:
df.at[dates[0], 'B']

0.9722727150237331

## Selection by position

In [73]:
df.iloc[3]

A   -1.792777
B   -0.285709
C   -0.771742
D    0.234596
E    0.629214
Name: 2022-02-25 00:00:00, dtype: float64

In [76]:
df.iloc[2:4, 0:2]

Unnamed: 0,A,B
2022-02-24,2.587289,-0.846842
2022-02-25,-1.792777,-0.285709


In [78]:
df.iloc[[1,2,3,3,3,3], [0,2]]

Unnamed: 0,A,C
2022-02-23,-0.329274,-1.257533
2022-02-24,2.587289,1.024638
2022-02-25,-1.792777,-0.771742
2022-02-25,-1.792777,-0.771742
2022-02-25,-1.792777,-0.771742
2022-02-25,-1.792777,-0.771742


In [79]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D,E
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391


In [82]:
df.iloc[:, 0:2]

Unnamed: 0,A,B
2022-02-22,1.33104,0.972273
2022-02-23,-0.329274,-0.615656
2022-02-24,2.587289,-0.846842
2022-02-25,-1.792777,-0.285709


In [83]:
df.iloc[1,1]

-0.6156561168141156

In [84]:
df.iat[1,1]

-0.6156561168141156

## Boolean indexing

In [85]:
df['A'] >0

2022-02-22     True
2022-02-23    False
2022-02-24     True
2022-02-25    False
Freq: D, Name: A, dtype: bool

In [86]:
df[df['A'] >0]

Unnamed: 0,A,B,C,D,E
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391


In [89]:
df2 = df.copy()
df2.head()

Unnamed: 0,A,B,C,D,E
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391
2022-02-25,-1.792777,-0.285709,-0.771742,0.234596,0.629214


In [90]:
df2.shape

(4, 5)

In [92]:
df2['F'] = ['one','two','nine', 'seven']
df2

Unnamed: 0,A,B,C,D,E,F
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928,one
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461,two
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391,nine
2022-02-25,-1.792777,-0.285709,-0.771742,0.234596,0.629214,seven


In [95]:
df2['F'].isin(['nine'])

2022-02-22    False
2022-02-23    False
2022-02-24     True
2022-02-25    False
Freq: D, Name: F, dtype: bool

In [97]:
df2[df2['F'].isin(['one', 'two'])]

Unnamed: 0,A,B,C,D,E,F
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928,one
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461,two


## Setting

In [99]:
s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range('20220222', periods = 6))

In [100]:
df['G'] = s1

In [101]:
df

Unnamed: 0,A,B,C,D,E,G
2022-02-22,1.33104,0.972273,-0.38348,-0.858213,-0.881928,1
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461,2
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391,3
2022-02-25,-1.792777,-0.285709,-0.771742,0.234596,0.629214,4


In [104]:
df.at[dates[0], "A"] = 0

In [108]:
df

Unnamed: 0,A,B,C,D,E,G
2022-02-22,0.0,0.972273,-0.38348,-0.858213,-0.881928,1
2022-02-23,-0.329274,-0.615656,-1.257533,-0.682474,1.017461,2
2022-02-24,2.587289,-0.846842,1.024638,-0.027445,-1.03391,3
2022-02-25,-1.792777,-0.285709,-0.771742,0.234596,0.629214,4


In [111]:
df.iat[0,0]

0.0

In [112]:
df.iat[0,1] = 1

In [117]:
df.loc[:, 'D'] = np.array([5] * len(df))

In [125]:
df2 = df.copy()

In [127]:
df2[df2>0] = -df2

In [128]:
df2

Unnamed: 0,A,B,C,D,E,G
2022-02-22,0.0,-1.0,-0.38348,-5,-0.881928,-1
2022-02-23,-0.329274,-0.615656,-1.257533,-5,-1.017461,-2
2022-02-24,-2.587289,-0.846842,-1.024638,-5,-1.03391,-3
2022-02-25,-1.792777,-0.285709,-0.771742,-5,-0.629214,-4


## Missing data

In [187]:
df3 = df.copy()

In [188]:
df3 = df.reindex(index = dates[0:5], columns = list(df.columns) + ['H'] )

In [189]:
df3.loc[dates[0]: dates[1], 'H'] = 1

In [190]:
df3

Unnamed: 0,A,B,C,D,E,G,H
2022-02-22,0.0,1.0,-0.38348,5,-0.881928,1,1.0
2022-02-23,-0.329274,-0.615656,-1.257533,5,1.017461,2,1.0
2022-02-24,2.587289,-0.846842,1.024638,5,-1.03391,3,
2022-02-25,-1.792777,-0.285709,-0.771742,5,0.629214,4,


In [192]:
df3.dropna(how = 'any')

Unnamed: 0,A,B,C,D,E,G,H
2022-02-22,0.0,1.0,-0.38348,5,-0.881928,1,1.0
2022-02-23,-0.329274,-0.615656,-1.257533,5,1.017461,2,1.0


In [193]:
df3.fillna(value = 500)

Unnamed: 0,A,B,C,D,E,G,H
2022-02-22,0.0,1.0,-0.38348,5,-0.881928,1,1.0
2022-02-23,-0.329274,-0.615656,-1.257533,5,1.017461,2,1.0
2022-02-24,2.587289,-0.846842,1.024638,5,-1.03391,3,500.0
2022-02-25,-1.792777,-0.285709,-0.771742,5,0.629214,4,500.0


In [194]:
pd.isna(df3)

Unnamed: 0,A,B,C,D,E,G,H
2022-02-22,False,False,False,False,False,False,False
2022-02-23,False,False,False,False,False,False,False
2022-02-24,False,False,False,False,False,False,True
2022-02-25,False,False,False,False,False,False,True


## Stats

In [196]:
df.mean()

A    0.116309
B   -0.187052
C   -0.347029
D    5.000000
E   -0.067291
G    2.500000
dtype: float64

In [197]:
df.mean(1)

2022-02-22    0.955765
2022-02-23    0.969166
2022-02-24    1.621862
2022-02-25    1.129831
Freq: D, dtype: float64

In [207]:
s = pd.Series([1, 3, 5, np.nan], index=dates).shift(2)
s

2022-02-22    NaN
2022-02-23    NaN
2022-02-24    1.0
2022-02-25    3.0
Freq: D, dtype: float64

In [208]:
df.sub(s, axis = 'index')

Unnamed: 0,A,B,C,D,E,G
2022-02-22,,,,,,
2022-02-23,,,,,,
2022-02-24,1.587289,-1.846842,0.024638,4.0,-2.03391,2.0
2022-02-25,-4.792777,-3.285709,-3.771742,2.0,-2.370786,1.0


In [210]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,E,G
2022-02-22,0.0,1.0,-0.38348,5,-0.881928,1
2022-02-23,-0.329274,0.384344,-1.641012,10,0.135534,3
2022-02-24,2.258015,-0.462499,-0.616374,15,-0.898376,6
2022-02-25,0.465238,-0.748208,-1.388116,20,-0.269163,10


In [211]:
df.apply(lambda x: x.max() -x.min())

A    4.380067
B    1.846842
C    2.282171
D    0.000000
E    2.051371
G    3.000000
dtype: float64

In [212]:
df.apply(lambda x:x.max())

A    2.587289
B    1.000000
C    1.024638
D    5.000000
E    1.017461
G    4.000000
dtype: float64

## Histogramming

In [216]:
s = pd.Series(np.random.randint(0,7, size = 12))
s

0     1
1     2
2     2
3     0
4     3
5     3
6     5
7     1
8     1
9     5
10    5
11    4
dtype: int32

In [217]:
s.value_counts()

1    3
5    3
2    2
3    2
0    1
4    1
dtype: int64

In [220]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'dog'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6     dog
dtype: object

In [219]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6     dog
dtype: object

## Merge

In [224]:
df = pd.DataFrame(np.random.randn(11,4))
df

Unnamed: 0,0,1,2,3
0,0.10934,-0.445395,-1.268148,-0.522612
1,1.24529,-1.083627,0.439259,0.689589
2,-1.401183,1.087502,2.387266,-0.021953
3,-1.044383,0.576713,1.898864,1.359016
4,-0.072932,0.422696,-0.713917,0.952043
5,1.373862,-0.369932,-0.84531,-1.87164
6,-0.676709,0.992725,0.291615,-0.132909
7,-0.750712,-0.744416,-0.332289,1.016649
8,1.490205,0.189281,-1.025338,0.631108
9,1.31039,0.147416,0.721146,0.204342


In [226]:
pieces =  [df[:2], df[2:9], df[9:]]
pieces

[         0         1         2         3
 0  0.10934 -0.445395 -1.268148 -0.522612
 1  1.24529 -1.083627  0.439259  0.689589,
           0         1         2         3
 2 -1.401183  1.087502  2.387266 -0.021953
 3 -1.044383  0.576713  1.898864  1.359016
 4 -0.072932  0.422696 -0.713917  0.952043
 5  1.373862 -0.369932 -0.845310 -1.871640
 6 -0.676709  0.992725  0.291615 -0.132909
 7 -0.750712 -0.744416 -0.332289  1.016649
 8  1.490205  0.189281 -1.025338  0.631108,
            0         1         2         3
 9   1.310390  0.147416  0.721146  0.204342
 10 -0.524533 -1.484152  1.586943 -0.564833]

In [227]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.10934,-0.445395,-1.268148,-0.522612
1,1.24529,-1.083627,0.439259,0.689589
2,-1.401183,1.087502,2.387266,-0.021953
3,-1.044383,0.576713,1.898864,1.359016
4,-0.072932,0.422696,-0.713917,0.952043
5,1.373862,-0.369932,-0.84531,-1.87164
6,-0.676709,0.992725,0.291615,-0.132909
7,-0.750712,-0.744416,-0.332289,1.016649
8,1.490205,0.189281,-1.025338,0.631108
9,1.31039,0.147416,0.721146,0.204342


In [228]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [229]:
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [230]:
pd.merge(left, right, on = 'key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


## Grouping

In [232]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.370276,0.290362
1,bar,one,1.160408,-0.792568
2,foo,two,-0.13263,0.181319
3,bar,three,0.651648,-1.482973
4,foo,two,1.301262,0.200005
5,bar,two,0.519067,-1.433988
6,foo,one,-0.501263,0.818614
7,foo,three,0.612446,2.690533


In [234]:
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,2.331123,-3.709529
foo,0.909539,4.180833


In [235]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.160408,-0.792568
bar,three,0.651648,-1.482973
bar,two,0.519067,-1.433988
foo,one,-0.871538,1.108976
foo,three,0.612446,2.690533
foo,two,1.168631,0.381324


## Reshaping

In [237]:
tuples = list(
    zip(
        *[
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
    )
)
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [239]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [240]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.049694,1.168174
bar,two,-0.770454,-1.464116
baz,one,-1.402159,-1.010946
baz,two,1.772272,-1.599681
foo,one,0.284989,-1.062372
foo,two,0.470624,-0.926545
qux,one,0.780557,1.089586
qux,two,-0.323948,-1.391903


In [244]:
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.049694,1.168174
bar,two,-0.770454,-1.464116
baz,one,-1.402159,-1.010946
baz,two,1.772272,-1.599681


In [247]:
stacked = df2.stack()
stacked

first  second   
bar    one     A    0.049694
               B    1.168174
       two     A   -0.770454
               B   -1.464116
baz    one     A   -1.402159
               B   -1.010946
       two     A    1.772272
               B   -1.599681
dtype: float64

In [248]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.049694,1.168174
bar,two,-0.770454,-1.464116
baz,one,-1.402159,-1.010946
baz,two,1.772272,-1.599681


In [249]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.049694,-0.770454
bar,B,1.168174,-1.464116
baz,A,-1.402159,1.772272
baz,B,-1.010946,-1.599681


In [250]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.049694,-1.402159
one,B,1.168174,-1.010946
two,A,-0.770454,1.772272
two,B,-1.464116,-1.599681


## Pivot tables

In [251]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.514404,-0.187388
1,one,B,foo,1.05775,1.183442
2,two,C,foo,-1.899667,0.195577
3,three,A,bar,-1.19842,-2.503649
4,one,B,bar,1.694591,1.017238
5,one,C,bar,-0.025455,-0.401853
6,two,A,foo,-0.979552,2.036017
7,three,B,foo,-0.827325,-1.76302
8,one,C,foo,-0.197484,0.849015
9,one,A,bar,-1.326551,0.554795


In [253]:
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-1.326551,0.514404
one,B,1.694591,1.05775
one,C,-0.025455,-0.197484
three,A,-1.19842,
three,B,,-0.827325
three,C,-0.311079,
two,A,,-0.979552
two,B,1.15305,
two,C,,-1.899667


## Time series

In [257]:
rng = pd.date_range("1/1/2012", periods=100, freq="S")
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [256]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts

2012-01-01 00:00:00    176
2012-01-01 00:00:01     50
2012-01-01 00:00:02    150
2012-01-01 00:00:03    483
2012-01-01 00:00:04    162
                      ... 
2012-01-01 00:01:35    102
2012-01-01 00:01:36    497
2012-01-01 00:01:37     59
2012-01-01 00:01:38    438
2012-01-01 00:01:39    490
Freq: S, Length: 100, dtype: int32

In [258]:
ts.resample('5Min').sum()

2012-01-01    23601
Freq: 5T, dtype: int32

In [259]:
rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")

In [261]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts

2012-03-06   -0.519400
2012-03-07    0.534945
2012-03-08   -0.466615
2012-03-09   -0.545919
2012-03-10    1.753281
Freq: D, dtype: float64

In [263]:
ts_utc = ts.tz_localize("UTC")
ts_utc

2012-03-06 00:00:00+00:00   -0.519400
2012-03-07 00:00:00+00:00    0.534945
2012-03-08 00:00:00+00:00   -0.466615
2012-03-09 00:00:00+00:00   -0.545919
2012-03-10 00:00:00+00:00    1.753281
Freq: D, dtype: float64

In [264]:
ts_utc.tz_convert("US/Eastern")

2012-03-05 19:00:00-05:00   -0.519400
2012-03-06 19:00:00-05:00    0.534945
2012-03-07 19:00:00-05:00   -0.466615
2012-03-08 19:00:00-05:00   -0.545919
2012-03-09 19:00:00-05:00    1.753281
Freq: D, dtype: float64

In [265]:
rng = pd.date_range("1/1/2012", periods=5, freq="M")

In [267]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2012-01-31    1.847439
2012-02-29    0.582173
2012-03-31    0.172062
2012-04-30   -1.349362
2012-05-31    0.276011
Freq: M, dtype: float64

In [269]:
ps = ts.to_period()
ps

2012-01    1.847439
2012-02    0.582173
2012-03    0.172062
2012-04   -1.349362
2012-05    0.276011
Freq: M, dtype: float64

In [270]:
ps.to_timestamp()

2012-01-01    1.847439
2012-02-01    0.582173
2012-03-01    0.172062
2012-04-01   -1.349362
2012-05-01    0.276011
Freq: MS, dtype: float64

In [271]:
prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV")
ts = pd.Series(np.random.randn(len(prng)), prng)

ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9
ts.head()

1990-03-01 09:00    0.781555
1990-06-01 09:00    1.142050
1990-09-01 09:00   -0.115313
1990-12-01 09:00   -1.525530
1991-03-01 09:00   -0.650719
Freq: H, dtype: float64

## Categoricals

In [272]:
df = pd.DataFrame(
    {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [273]:
df['grade'] = df['raw_grade'].astype('category')

In [274]:
df['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [275]:
df["grade"].cat.categories = ["very good", "good", "very bad"]
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [276]:
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"]
)
df

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bad


In [277]:
df['grade']

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): ['very bad', 'bad', 'medium', 'good', 'very good']

In [278]:
df.sort_values(by = 'grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [279]:
df.groupby('grade').size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64