# pandas 基础

---

导入库文件

In [1]:
import numpy as np
import pandas as pd

### 数据结构

##### Series

创建 Series

In [2]:
s = pd.Series(np.arange(5))
print(s)
s = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)
print(s.index)

d = {'a': 0, 'b': 1, 'c':2}
s = pd.Series(d)
print(s)
s = pd.Series(d, index = ['d', 'c', 'b', 'a'])
print(s)
s = pd.Series(5, index = ['a', 'b', 'c', 'd', 'e'])
print(s)

0    0
1    1
2    2
3    3
4    4
dtype: int64
a    0
b    1
c    2
d    3
e    4
dtype: int64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
a    0
b    1
c    2
dtype: int64
d    NaN
c    2.0
b    1.0
a    0.0
dtype: float64
a    5
b    5
c    5
d    5
e    5
dtype: int64


取值

In [3]:
s = pd.Series(np.arange(5) + 10, index = ['a', 'b', 'c', 'd', 'e'])
# index
print(s)
print(s[1])
print(s[:3])
print(s[s > s.median()])
print(s[[3, 1]])

# dict
print(s['b'])
print('c' in s)
print(s.get('f', np.nan))

a    10
b    11
c    12
d    13
e    14
dtype: int64
11
a    10
b    11
c    12
dtype: int64
d    13
e    14
dtype: int64
d    13
b    11
dtype: int64
11
True
nan


数值操作

In [4]:
s = pd.Series(np.arange(5), index = ['a', 'b', 'c', 'd', 'e'])
print(s + s)
print(s * 3)
print(np.exp(s))
print(s[1:])
# 没有相同 labels 时为空
print(s[1:] + s[:-1])

a    0
b    2
c    4
d    6
e    8
dtype: int64
a     0
b     3
c     6
d     9
e    12
dtype: int64
a     1.000000
b     2.718282
c     7.389056
d    20.085537
e    54.598150
dtype: float64
b    1
c    2
d    3
e    4
dtype: int64
a    NaN
b    2.0
c    4.0
d    6.0
e    NaN
dtype: float64


##### DataFrame

创建

In [5]:
# index
d = {'one': pd.Series([1, 2, 3], index = ['a', 'b', 'c']),
     'two': pd.Series([1, 2, 3, 4], index = ['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print(df)
df = pd.DataFrame(d, index = ['d', 'b', 'a'])
print(df)
df = pd.DataFrame(d, index = ['d', 'b', 'a'], columns = ['two', 'three'])
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
   one  two
d  NaN    4
b  2.0    2
a  1.0    1
   two three
d    4   NaN
b    2   NaN
a    1   NaN


In [6]:
# dict/list
d = {'one': [1, 2, 3, 4],
     'two': [4, 3, 2, 1]}

df = pd.DataFrame(d)
print(df)
df = pd.DataFrame(d, index = ['a', 'b', 'c', 'd'])
print(df)

   one  two
0    1    4
1    2    3
2    3    2
3    4    1
   one  two
a    1    4
b    2    3
c    3    2
d    4    1


In [7]:
# structured array
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')])
data[:] = [(1, 2., 'Hello'), (2, 3., 'Hello')]
print(data)
print(pd.DataFrame(data))
print(pd.DataFrame(data, index=['first', 'second']))
print(pd.DataFrame(data, columns=['C', 'B', 'A']))

[(1, 2., b'Hello') (2, 3., b'Hello')]
   A    B         C
0  1  2.0  b'Hello'
1  2  3.0  b'Hello'
        A    B         C
first   1  2.0  b'Hello'
second  2  3.0  b'Hello'
          C    B  A
0  b'Hello'  2.0  1
1  b'Hello'  3.0  2


In [8]:
# list of dicts
data = [{'a':  1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
print(pd.DataFrame(data))
print(pd.DataFrame(data, index=['first', 'second']))
print(pd.DataFrame(data, columns=['a', 'b']))

   a   b     c
0  1   2   NaN
1  5  10  20.0
        a   b     c
first   1   2   NaN
second  5  10  20.0
   a   b
0  1   2
1  5  10


In [9]:
# dict of tuples
pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
              ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
              ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
              ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
              ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [10]:
# from_dict 可设置 orient 转换行和列
pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]), orient='index')

Unnamed: 0,0,1,2
A,1,2,3
B,4,5,6


In [11]:
# from_records 可使用 index 设置列
data = np.array([(1, 2., b'Hello'), (2, 3., b'World')], dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])
print(data)
print(pd.DataFrame(data, index=['x', 'y']))
print(pd.DataFrame.from_records(data))
print(pd.DataFrame.from_records(data, index=['x', 'y']))
print(pd.DataFrame.from_records(data, index='B'))
print(pd.DataFrame.from_records(data, index='C'))

[(1, 2., b'Hello') (2, 3., b'World')]
   A    B         C
x  1  2.0  b'Hello'
y  2  3.0  b'World'
   A    B         C
0  1  2.0  b'Hello'
1  2  3.0  b'World'
   A    B         C
x  1  2.0  b'Hello'
y  2  3.0  b'World'
     A         C
B               
2.0  1  b'Hello'
3.0  2  b'World'
          A    B
C               
b'Hello'  1  2.0
b'World'  2  3.0


### 数据操作

In [40]:
df = pd.DataFrame({'one': [1, 2, 3], 
                   'two': [1, 2, 3], 
                   'three': [2, 4, 9]}, index=['a', 'b', 'c'])
print(df)

df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 1
print(df)

del df['two']
print(df)
three = df.pop('three')
print(df)

df['foo'] = 'bar'
print(df)

df['one_trunc'] = df['one'][:2]
print(df)

df.insert(1, 'bar', df['one'])
print(df)
df.at['a', 'one'] = 5
print(df['one']['a'])
print(df)

   one  two  three
a    1    1      2
b    2    2      4
c    3    3      9
   one  two  three   flag
a    1    1      1  False
b    2    2      4   True
c    3    3      9   True
   one  three   flag
a    1      1  False
b    2      4   True
c    3      9   True
   one   flag
a    1  False
b    2   True
c    3   True
   one   flag  foo
a    1  False  bar
b    2   True  bar
c    3   True  bar
   one   flag  foo  one_trunc
a    1  False  bar        1.0
b    2   True  bar        2.0
c    3   True  bar        NaN
   one  bar   flag  foo  one_trunc
a    1    1  False  bar        1.0
b    2    2   True  bar        2.0
c    3    3   True  bar        NaN
5
   one  bar   flag  foo  one_trunc
a    5    1  False  bar        1.0
b    2    2   True  bar        2.0
c    3    3   True  bar        NaN


In [13]:
df = pd.DataFrame({'one': [1, 2, 3], 
                   'two': [1, 2, 3], 
                   'three': [2, 4, 9]}, index=['a', 'b', 'c'])

dff = df.assign(other=df['one'] / df['three'])
print(df)
print(dff)
dff = df.assign(other=lambda x: x['one'] + 1)
print(df)
print(dff)

   one  two  three
a    1    1      2
b    2    2      4
c    3    3      9
   one  two  three     other
a    1    1      2  0.500000
b    2    2      4  0.500000
c    3    3      9  0.333333
   one  two  three
a    1    1      2
b    2    2      4
c    3    3      9
   one  two  three  other
a    1    1      2      2
b    2    2      4      3
c    3    3      9      4


In [14]:
df = pd.DataFrame({'one': [1, 2, 3], 
                   'two': [1, 2, 3], 
                   'three': [2, 4, 9]}, index=['a', 'b', 'c'])

print(df)
print(df.loc['b'])
print(df.iloc[2])
print(df.loc['c', 'three'])
print(df.loc['a':'b'])

   one  two  three
a    1    1      2
b    2    2      4
c    3    3      9
one      2
two      2
three    4
Name: b, dtype: int64
one      3
two      3
three    9
Name: c, dtype: int64
9
   one  two  three
a    1    1      2
b    2    2      4


In [15]:
df = pd.DataFrame({'one': [1, 2, 3], 
                   'two': [1, 2, 3], 
                   'three': [2, 4, 9]}, index=['a', 'b', 'c'])

print(df)
print(df[df.one > 1])
print(df[df > 1])
print(df[df.two.isin([1, 3])])

   one  two  three
a    1    1      2
b    2    2      4
c    3    3      9
   one  two  three
b    2    2      4
c    3    3      9
   one  two  three
a  NaN  NaN      2
b  2.0  2.0      4
c  3.0  3.0      9
   one  two  three
a    1    1      2
c    3    3      9


### 算术运算

In [16]:
df = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['a', 'b', 'c'])

print(df + df2)
print(df - df.iloc[0])
print(df * 5 + 2)
print(1 / df)
print(df ** 4)

          a         b         c   d   e
0  3.330750  2.085646  4.134636 NaN NaN
1  2.522848 -1.417129  0.054945 NaN NaN
2 -1.615018  0.710973  0.273414 NaN NaN
3 -0.877516 -2.186180 -2.070944 NaN NaN
4  0.065184 -0.611641 -0.981919 NaN NaN
5  0.111324 -1.800228 -0.384548 NaN NaN
6  1.266283 -0.811280 -1.975692 NaN NaN
7       NaN       NaN       NaN NaN NaN
8       NaN       NaN       NaN NaN NaN
9       NaN       NaN       NaN NaN NaN
          a         b         c         d         e
0  0.000000  0.000000  0.000000  0.000000  0.000000
1  0.273008 -4.092552 -1.929524 -1.861572  2.033297
2 -1.376759 -2.965392 -1.821963 -3.969247  1.824151
3 -0.394945 -3.302974 -2.252613 -1.263425  0.675627
4 -0.665777 -3.099852 -1.915899 -0.217539  1.318769
5  0.752806 -4.868927 -0.774219 -1.909848  2.505260
6 -0.100766 -3.391954 -2.407273 -2.018001  2.319248
7 -0.770945 -3.520691 -2.250355  0.100463  1.357207
8 -1.183263 -4.727772 -0.467288 -1.126362  1.923501
9 -0.765361 -3.435360 -2.289108 -1.67099

**包含 date 的运算**

会列向广播

In [17]:
index = pd.date_range('1/1/2000', periods = 8)

df = pd.DataFrame(np.random.randn(8, 3), index = index, columns = list('ABC'))
print(df)

print(df - df['A'])

                   A         B         C
2000-01-01  0.692553  0.349052 -0.304831
2000-01-02 -0.564191 -0.434350  0.808060
2000-01-03 -0.730110 -0.454349 -0.366816
2000-01-04 -0.610859 -0.075399  0.130657
2000-01-05 -0.816259 -1.719771 -0.845364
2000-01-06  0.042271 -0.395275 -0.339231
2000-01-07  0.822979 -0.829899 -0.029342
2000-01-08 -0.026568 -1.166015  0.012877
            2000-01-01 00:00:00  2000-01-02 00:00:00  2000-01-03 00:00:00  \
2000-01-01                  NaN                  NaN                  NaN   
2000-01-02                  NaN                  NaN                  NaN   
2000-01-03                  NaN                  NaN                  NaN   
2000-01-04                  NaN                  NaN                  NaN   
2000-01-05                  NaN                  NaN                  NaN   
2000-01-06                  NaN                  NaN                  NaN   
2000-01-07                  NaN                  NaN                  NaN   
2000-01-08     

### 逻辑运算

In [18]:
df1 = pd.DataFrame({'a': [3, 0, -1], 'b': [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool)

print(df1)
print(df1 & df2)
print(df1 | df2)
print(df1 ^ df2)
print(-df1)

       a      b
0   True  False
1  False   True
2   True   True
       a      b
0  False  False
1  False   True
2   True  False
      a     b
0  True  True
1  True  True
2  True  True
       a      b
0   True   True
1   True  False
2  False   True
       a      b
0  False   True
1   True  False
2  False  False


### 使用 numpy 库

In [19]:
df = pd.DataFrame(np.random.randn(3, 5))
print(df)
print(df.T)
print(np.exp(df))
print(np.asarray(df))

s = pd.Series([1, 2, 3, 4, 5])
print(np.exp(s))

          0         1         2         3         4
0  2.087496 -0.478904 -0.333340  1.174382 -0.287025
1  0.307408 -0.743478  0.890287 -0.547813 -1.662365
2  0.433934 -0.622469 -0.036222 -1.220431 -0.935433
          0         1         2
0  2.087496  0.307408  0.433934
1 -0.478904 -0.743478 -0.622469
2 -0.333340  0.890287 -0.036222
3  1.174382 -0.547813 -1.220431
4 -0.287025 -1.662365 -0.935433
          0         1         2         3         4
0  8.064694  0.619462  0.716527  3.236143  0.750493
1  1.359896  0.475457  2.435829  0.578213  0.189690
2  1.543317  0.536618  0.964426  0.295103  0.392416
[[ 2.08749578 -0.4789036  -0.33333966  1.1743822  -0.28702489]
 [ 0.30740832 -0.74347849  0.8902872  -0.54781276 -1.66236498]
 [ 0.43393405 -0.62246883 -0.03622247 -1.22043084 -0.93543344]]
0      2.718282
1      7.389056
2     20.085537
3     54.598150
4    148.413159
dtype: float64


### 丢失的数据

In [20]:
df = pd.DataFrame({'one': [1, 2, np.nan],
                   'two': [1, 2, 3],
                   'three': [1, np.nan, 3]})

print(df)
print(df.dropna())
print(df.dropna(how='any'))
print(df.fillna(5))
print(df.isna())

   one  two  three
0  1.0    1    1.0
1  2.0    2    NaN
2  NaN    3    3.0
   one  two  three
0  1.0    1    1.0
   one  two  three
0  1.0    1    1.0
   one  two  three
0  1.0    1    1.0
1  2.0    2    5.0
2  5.0    3    3.0
     one    two  three
0  False  False  False
1  False  False   True
2   True  False  False


### 其他操作

In [21]:
df = pd.DataFrame(np.arange(15).reshape(5, 3), columns=list('ABC'))

print(df)
print(df.mean())
print(df.mean(1))
print(df.sum())
print(df.sum(1))

    A   B   C
0   0   1   2
1   3   4   5
2   6   7   8
3   9  10  11
4  12  13  14
A    6.0
B    7.0
C    8.0
dtype: float64
0     1.0
1     4.0
2     7.0
3    10.0
4    13.0
dtype: float64
A    30
B    35
C    40
dtype: int64
0     3
1    12
2    21
3    30
4    39
dtype: int64


apply: 调用函数

In [22]:
df = pd.DataFrame(np.arange(15).reshape(3, 5), columns=list('ABCDE'))

print(df)
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))

    A   B   C   D   E
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
    A   B   C   D   E
0   0   1   2   3   4
1   5   7   9  11  13
2  15  18  21  24  27
A    10
B    10
C    10
D    10
E    10
dtype: int64


直方图函数

In [23]:
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
print(s.value_counts())

0    0
1    4
2    0
3    0
4    0
5    4
6    4
7    4
8    0
9    1
dtype: int64
0    5
4    4
1    1
dtype: int64


连接函数(concat, merge, append)

In [24]:
df1 = pd.DataFrame(np.random.randn(10, 3))
print(df1)

df2 = pd.DataFrame(np.random.randn(10, 3))
print(df2)
print(pd.concat([df1[:3], df2[1:6]]))

df1 = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
df2 = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [3, 5]})
print(df1)
print(df2)
print(pd.merge(df1, df2, on='key'))

df1 = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
df2 = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [3, 5]})
print(df1)
print(df2)
print(pd.merge(df1, df2, on='key'))

df = pd.DataFrame(np.random.randn(8, 3), columns=list('ABC'))
print(df)
print(df.append(df.iloc[3], ignore_index=True))

          0         1         2
0  0.389654  0.132951 -0.185719
1 -0.031104  0.780390  0.895148
2  0.432392 -0.141363  0.584487
3 -0.377385 -0.241167  0.282097
4 -1.039993  0.246469  0.954342
5 -0.759267 -0.525595  0.724355
6  0.560931  0.447554 -1.209913
7 -0.625240  0.866905 -0.509506
8  0.107855 -0.410119  0.693797
9 -0.339627  0.612623  1.348514
          0         1         2
0  0.870342  0.114908 -0.273670
1  1.588141  2.923471 -0.121900
2 -0.532022 -1.105307 -1.523883
3  0.438919  0.736406 -0.400885
4  0.048989  1.071322 -1.818710
5  1.439038  1.535364  1.778150
6  0.174681  1.572112 -0.448438
7  0.833963 -1.007743  0.036044
8  0.118418 -2.479949 -1.421893
9  0.207663 -2.013950 -0.410850
          0         1         2
0  0.389654  0.132951 -0.185719
1 -0.031104  0.780390  0.895148
2  0.432392 -0.141363  0.584487
1  1.588141  2.923471 -0.121900
2 -0.532022 -1.105307 -1.523883
3  0.438919  0.736406 -0.400885
4  0.048989  1.071322 -1.818710
5  1.439038  1.535364  1.778150
   key  

Grouping

In [25]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'tow', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})

print(df)
print(df.groupby('A').sum())
print(df.groupby(['A', 'B']).sum())

     A      B         C         D
0  foo    one  1.265666 -0.214716
1  bar    one -0.272539  0.435138
2  foo    two  1.902522 -1.873425
3  bar  three  0.798464 -0.522836
4  foo    two  0.838822 -0.352077
5  bar    tow  0.503512 -0.896307
6  foo    one  1.499019 -1.350517
7  bar  three  0.626966  1.083584
            C         D
A                      
bar  1.656402  0.099579
foo  5.506029 -3.790735
                  C         D
A   B                        
bar one   -0.272539  0.435138
    three  1.425430  0.560748
    tow    0.503512 -0.896307
foo one    2.764685 -1.565233
    two    2.741344 -2.225502


### Reshaping

1. Stack

In [26]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'tow', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})

print(df)
print(df.stack())
print(df.stack().unstack())

     A      B         C         D
0  foo    one  1.988716  0.824695
1  bar    one  0.160458  0.342379
2  foo    two  0.189523  0.218679
3  bar  three -0.606517  0.930221
4  foo    two  0.946979 -0.629034
5  bar    tow  0.070013 -0.580997
6  foo    one  1.012384  0.015211
7  bar  three -0.471639 -0.176326
0  A          foo
   B          one
   C      1.98872
   D     0.824695
1  A          bar
   B          one
   C     0.160458
   D     0.342379
2  A          foo
   B          two
   C     0.189523
   D     0.218679
3  A          bar
   B        three
   C    -0.606517
   D     0.930221
4  A          foo
   B          two
   C     0.946979
   D    -0.629034
5  A          bar
   B          tow
   C    0.0700134
   D    -0.580997
6  A          foo
   B          one
   C      1.01238
   D    0.0152106
7  A          bar
   B        three
   C    -0.471639
   D    -0.176326
dtype: object
     A      B          C          D
0  foo    one    1.98872   0.824695
1  bar    one   0.160458   0.342

2. Pivot tables

In [27]:
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
print(df)
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))

        A  B    C         D         E
0     one  A  foo -1.585687  0.186302
1     one  B  foo  0.442664  0.379674
2     two  C  foo  2.449075 -0.480867
3   three  A  bar  0.582720  0.459319
4     one  B  bar  0.778986 -0.066089
5     one  C  bar -0.129267  0.747121
6     two  A  foo  1.512353  0.028118
7   three  B  foo -0.563416  1.922253
8     one  C  foo  0.039638  1.719558
9     one  A  bar  1.463450  2.058046
10    two  B  bar -0.084120 -0.017062
11  three  C  bar -1.137139 -0.689691
C             bar       foo
A     B                    
one   A  1.463450 -1.585687
      B  0.778986  0.442664
      C -0.129267  0.039638
three A  0.582720       NaN
      B       NaN -0.563416
      C -1.137139       NaN
two   A       NaN  1.512353
      B -0.084120       NaN
      C       NaN  2.449075


### Time Series

In [28]:
rng = pd.date_range('1/1/2000', periods = 100, freq = 'D')

ts = pd.Series(np.random.randint(0, 500, len(rng)), index = rng)
print(ts)
print(ts.tz_localize('UTC').tz_convert('US/Eastern'))
print(ts.to_period())

2000-01-01    341
2000-01-02    382
2000-01-03    392
2000-01-04      1
2000-01-05     18
             ... 
2000-04-05    253
2000-04-06    367
2000-04-07    393
2000-04-08    101
2000-04-09    493
Freq: D, Length: 100, dtype: int64
1999-12-31 19:00:00-05:00    341
2000-01-01 19:00:00-05:00    382
2000-01-02 19:00:00-05:00    392
2000-01-03 19:00:00-05:00      1
2000-01-04 19:00:00-05:00     18
                            ... 
2000-04-04 20:00:00-04:00    253
2000-04-05 20:00:00-04:00    367
2000-04-06 20:00:00-04:00    393
2000-04-07 20:00:00-04:00    101
2000-04-08 20:00:00-04:00    493
Freq: D, Length: 100, dtype: int64
2000-01-01    341
2000-01-02    382
2000-01-03    392
2000-01-04      1
2000-01-05     18
             ... 
2000-04-05    253
2000-04-06    367
2000-04-07    393
2000-04-08    101
2000-04-09    493
Freq: D, Length: 100, dtype: int64


### Categoricals

In [29]:
df = pd.DataFrame({'id': [1, 2, 3, 4, 5, 6],
                   'raw_grade': ['a', 'b', 'b', 'a', 'a', 'e']})
print(df)
df['grade'] = df['raw_grade'].astype('category')
print(df)
df['grade'].cat.categories = ["very good", "good", "very bad"]
print(df)
df['grade'] = df['grade'].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
print(df)
print(df.sort_values(by='grade'))

   id raw_grade
0   1         a
1   2         b
2   3         b
3   4         a
4   5         a
5   6         e
   id raw_grade grade
0   1         a     a
1   2         b     b
2   3         b     b
3   4         a     a
4   5         a     a
5   6         e     e
   id raw_grade      grade
0   1         a  very good
1   2         b       good
2   3         b       good
3   4         a  very good
4   5         a  very good
5   6         e   very bad
   id raw_grade      grade
0   1         a  very good
1   2         b       good
2   3         b       good
3   4         a  very good
4   5         a  very good
5   6         e   very bad
   id raw_grade      grade
5   6         e   very bad
1   2         b       good
2   3         b       good
0   1         a  very good
3   4         a  very good
4   5         a  very good


### Plotting

In [30]:
ts = pd.Series(np.random.randn(1000), index = pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
print(ts)

ts.plot()

2000-01-01    -2.131697
2000-01-02    -3.229068
2000-01-03    -3.085780
2000-01-04    -3.916855
2000-01-05    -4.568948
                ...    
2002-09-22    27.626399
2002-09-23    27.720935
2002-09-24    27.964200
2002-09-25    27.100834
2002-09-26    27.580475
Freq: D, Length: 1000, dtype: float64


<matplotlib.axes._subplots.AxesSubplot at 0x11233f4d0>