# Series 基本概念及创建

In [1]:
import numpy as np
import pandas as pd

In [2]:
# 创建Series
ar = np.random.rand(5)
s = pd.Series(ar)
print(ar, type(ar))
print(s, type(s))
print("----------------")
print(list(s.index))
print(s.values, type(s.values))

[0.03648525 0.00764529 0.40628105 0.40834876 0.27007441] <class 'numpy.ndarray'>
0    0.036485
1    0.007645
2    0.406281
3    0.408349
4    0.270074
dtype: float64 <class 'pandas.core.series.Series'>
----------------
[0, 1, 2, 3, 4]
[0.03648525 0.00764529 0.40628105 0.40834876 0.27007441] <class 'numpy.ndarray'>


In [3]:
# 创建series ，字典创建, key 是index,values是values
dic = {'a':1, 'b':2, 'c':3}
s =pd.Series(dic)
print(s, type(s))

a    1
b    2
c    3
dtype: int64 <class 'pandas.core.series.Series'>


In [4]:
# 通过数组创建
arr = np.random.rand(5)
s = pd.Series(arr, index = list('abcde'), name = 'test')
print(s)
s1 = s.rename("hahah")
print(s1)

a    0.059796
b    0.489866
c    0.773100
d    0.300860
e    0.297685
Name: test, dtype: float64
a    0.059796
b    0.489866
c    0.773100
d    0.300860
e    0.297685
Name: hahah, dtype: float64


In [5]:
# 通过标量创建
s = pd.Series(100, index = range(4))
print(s)

0    100
1    100
2    100
3    100
dtype: int64


# Series 索引

In [6]:
# 下标索引
import numpy as np
import pandas as pd

In [7]:
s = pd.Series(np.random.rand(10))
print(s)
print(s[5], type(s[5]))

0    0.308567
1    0.907527
2    0.305534
3    0.874218
4    0.106907
5    0.354304
6    0.727857
7    0.934227
8    0.351439
9    0.067371
dtype: float64
0.3543036084842405 <class 'numpy.float64'>


In [8]:
# 标签索引
s = pd.Series(np.random.rand(5), index = list('abcde'))
print(s)
print(s['a'], type(['a']), s['a'].dtype)
print(s[['a','b']])

a    0.131514
b    0.229659
c    0.776905
d    0.264497
e    0.479370
dtype: float64
0.13151431046305817 <class 'list'> float64
a    0.131514
b    0.229659
dtype: float64


In [9]:
# 切片
s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index = list('abcde'))
print(s1)
print(s2)
print("------------------------------")
print(s1[0:3])   # 下标索引，左闭右开
print(s2['a':'c'])   # 标签索引，左闭右闭
print("-----------------------------")
# print(s1[-1])   # 下标索引不支持-1
print(s1[::-1])  # 但是切片可以这么写
print(s2[::2])

0    0.779041
1    0.043249
2    0.151760
3    0.803278
4    0.529603
dtype: float64
a    0.540085
b    0.868815
c    0.833208
d    0.981977
e    0.935819
dtype: float64
------------------------------
0    0.779041
1    0.043249
2    0.151760
dtype: float64
a    0.540085
b    0.868815
c    0.833208
dtype: float64
-----------------------------
4    0.529603
3    0.803278
2    0.151760
1    0.043249
0    0.779041
dtype: float64
a    0.540085
c    0.833208
e    0.935819
dtype: float64


In [10]:
# 布尔型索引
s = pd.Series(np.random.rand(3))* 100
s[4] = None
print(s)
bs1 = s > 50
bs2 = s.isnull()
bs3 = s.notnull()
print(bs1, bs2, bs3)

0    72.325077
1    28.429398
2    17.997942
4         None
dtype: object
0     True
1    False
2    False
4    False
dtype: bool 0    False
1    False
2    False
4     True
dtype: bool 0     True
1     True
2     True
4    False
dtype: bool


# Series 基本技巧

In [11]:
# 数据查看
s = pd.Series(np.random.rand(15))
print(s.head(), s.tail())

0    0.871993
1    0.367156
2    0.200023
3    0.614152
4    0.386234
dtype: float64 10    0.412134
11    0.039241
12    0.100908
13    0.522112
14    0.021295
dtype: float64


In [12]:
# 重新索引 reindex
s = pd.Series(np.random.rand(5), index = list('abced'))
print(s)
s1 = s.reindex(list('abfgh'))
print(s1)
s2 = s.reindex(list('abcegh'), fill_value= 0)
print(s2)

a    0.536742
b    0.830291
c    0.834003
e    0.713438
d    0.905088
dtype: float64
a    0.536742
b    0.830291
f         NaN
g         NaN
h         NaN
dtype: float64
a    0.536742
b    0.830291
c    0.834003
e    0.713438
g    0.000000
h    0.000000
dtype: float64


In [13]:
# 对齐
s1 = pd.Series(np.random.rand(3), index = ['jack','marry','tom'])
s2 = pd.Series(np.random.rand(3), index = ['tom','marry','jack'])
print(s1, s2)
print(s1 +s2)

jack     0.709762
marry    0.827546
tom      0.614935
dtype: float64 tom      0.247796
marry    0.817671
jack     0.956413
dtype: float64
jack     1.666175
marry    1.645217
tom      0.862730
dtype: float64


In [14]:
# 删除 drop
s = pd.Series(np.random.rand(5), index = list('abcde'))
print(s)
s.drop('c', inplace = True)
print(s)
s.drop(['a','b'], inplace = True)
print(s)

a    0.780984
b    0.446524
c    0.043825
d    0.838782
e    0.815261
dtype: float64
a    0.780984
b    0.446524
d    0.838782
e    0.815261
dtype: float64
d    0.838782
e    0.815261
dtype: float64


In [15]:
# 添加
s1 = pd.Series(np.random.rand(5), index = list('abcde'))
s2 = pd.Series([1,2,3], index = ['w','e','r'])
print(s1, s2)
s1['m'] = 100
print(s1)

s3 = s1.append(s2)
print(s3)

a    0.236625
b    0.907888
c    0.620888
d    0.360470
e    0.658794
dtype: float64 w    1
e    2
r    3
dtype: int64
a      0.236625
b      0.907888
c      0.620888
d      0.360470
e      0.658794
m    100.000000
dtype: float64
a      0.236625
b      0.907888
c      0.620888
d      0.360470
e      0.658794
m    100.000000
w      1.000000
e      2.000000
r      3.000000
dtype: float64


# Dataframe基本概念及创建

In [16]:
# 创建dataframe

In [17]:
data = {
    'name':['jack','tom','marry'],
    'age':[18,19,20],
    'gender':['m','m','f']
}
frame = pd.DataFrame(data)
print(frame)
print(type(frame))
print(frame.index)
print(frame.columns)
print(frame.values)

    name  age gender
0   jack   18      m
1    tom   19      m
2  marry   20      f
<class 'pandas.core.frame.DataFrame'>
RangeIndex(start=0, stop=3, step=1)
Index(['name', 'age', 'gender'], dtype='object')
[['jack' 18 'm']
 ['tom' 19 'm']
 ['marry' 20 'f']]


In [18]:
# dataframe创建方法1，由数组或者list组成的字典,行列需要保持一致
data1 = {'a':[1,2,3],
        'b':[3,4,5],
        'c':[5,6,7]}
data2 = {'one':np.random.rand(3),
        'two':np.random.rand(3)}    
print(data1)
print(data2)
df1 = pd.DataFrame(data1, index = list('abc'))
df2 = pd.DataFrame(data2)
print(df1)
print(df2)
print('--------------------------------------------')
df1 = pd.DataFrame(data1, columns=['a','b','c','d'])   
df2 = pd.DataFrame(data1, columns=['b','c'])
print(df1)
print(df2)

{'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}
{'one': array([0.33186715, 0.45049646, 0.57541203]), 'two': array([0.73108042, 0.78541325, 0.14811496])}
   a  b  c
a  1  3  5
b  2  4  6
c  3  5  7
        one       two
0  0.331867  0.731080
1  0.450496  0.785413
2  0.575412  0.148115
--------------------------------------------
   a  b  c    d
0  1  3  5  NaN
1  2  4  6  NaN
2  3  5  7  NaN
   b  c
0  3  5
1  4  6
2  5  7


In [19]:
# 通过二维数组直接创建
ar = np.random.rand(9).reshape(3,3)
print(ar)
df1 = pd.DataFrame(ar, index = ['a','b','c'], columns= ['one','two','three'])
print(df1)

[[0.64589771 0.76928951 0.01681971]
 [0.28541003 0.77355782 0.65787579]
 [0.65015962 0.53014779 0.42642407]]
        one       two     three
a  0.645898  0.769290  0.016820
b  0.285410  0.773558  0.657876
c  0.650160  0.530148  0.426424


In [20]:
# 由字典组成的列表生成, 列表里每一个字典都是一行数据
dic = [{'one':1, 'two':2}, {"one":3, "two":4, "three":5}]
df = pd.DataFrame(dic)
df

Unnamed: 0,one,two,three
0,1,2,
1,3,4,5.0


In [21]:
# 由字典组成的字典, 第一层key会变成columns，第二层key会变成index
data = {
    'jack':{'math':90, 'eng':100, 'art':100},
    'bob':{'math':90, 'eng':89, 'art':99},
    'marry':{'math':90, 'eng':87, 'art':78}
}
df = pd.DataFrame(data)
df

Unnamed: 0,jack,bob,marry
math,90,90,90
eng,100,89,87
art,100,99,78


# Dataframe的索引

In [22]:
# 选择行与列
df = pd.DataFrame(np.random.rand(12).reshape(3,4) *100,
                 index = ['one','two','three'],
                  columns = ['a','b','c','d']
                 )
print(df)
print("-----------------------------")
data1 = df['a']  # 选择一列,这时返回的series
data1_1 = df[['a']]  # 选择一列，这时返回的是dataframe
data2 = df[['a','b']]  # 选择多列
print(data1)
print(data1_1)
print(data2)
print("-------------------------")
# 选择行
data3 = df.loc['one']   # 选择一行，这时返回的是series
data3_1 = df.loc[['one']]  # 选择一行，这时返回的是dataframe
data4 = df.loc[['three','two']]  # 选择多行
print(data3)
print(data3_1)
print(data4)

# 切片
df[:2]  # 这时候索引的是行，而不是列  ， 但为避免混淆这种方式不建议用，索引行还是用.loc吧

               a          b          c          d
one    59.464747  53.566742  81.556387  96.249826
two    97.052670  50.908406  80.905425  40.341066
three  91.228230  71.307045  76.841470  94.049859
-----------------------------
one      59.464747
two      97.052670
three    91.228230
Name: a, dtype: float64
               a
one    59.464747
two    97.052670
three  91.228230
               a          b
one    59.464747  53.566742
two    97.052670  50.908406
three  91.228230  71.307045
-------------------------
a    59.464747
b    53.566742
c    81.556387
d    96.249826
Name: one, dtype: float64
             a          b          c          d
one  59.464747  53.566742  81.556387  96.249826
              a          b          c          d
three  91.22823  71.307045  76.841470  94.049859
two    97.05267  50.908406  80.905425  40.341066


Unnamed: 0,a,b,c,d
one,59.464747,53.566742,81.556387,96.249826
two,97.05267,50.908406,80.905425,40.341066


In [23]:
# .loc按照标签索引
df = pd.DataFrame(np.random.rand(12).reshape(3,4) *100,
                 index = ['one','two','three'],
                  columns = ['a','b','c','d']
                 )
print(df)
print(df.loc[['one','three']])
print(df.loc['one':'three'])   # 默认索引行哦
print(df.loc[['one','three'], ['a','b']])   # 索引行和列

               a          b          c          d
one     4.263024  15.002120  95.587692  84.068806
two    40.438848   4.007341  90.635682  97.116234
three  87.901398  53.158255  36.128731  57.784200
               a          b          c          d
one     4.263024  15.002120  95.587692  84.068806
three  87.901398  53.158255  36.128731  57.784200
               a          b          c          d
one     4.263024  15.002120  95.587692  84.068806
two    40.438848   4.007341  90.635682  97.116234
three  87.901398  53.158255  36.128731  57.784200
               a          b
one     4.263024  15.002120
three  87.901398  53.158255


In [24]:
# 按照整数位置索引 .iloc
df = pd.DataFrame(np.random.rand(12).reshape(3,4) *100,
                 index = ['one','two','three'],
                  columns = ['a','b','c','d']
                 )
print(df)
print(df.iloc[0])  # 第一行
print(df.iloc[-1])  # 最后一行
print(df.iloc[0, 1:3])   # 行列都加索引
df.iloc[[0]]

# 所以，在对dataframe进行索引的时候，最好只用loc和iloc方法

               a          b          c          d
one    82.030846  19.799993  68.627759  77.544548
two    64.303016  52.401617   4.164134  43.068672
three  58.652272   2.170909  18.576406  86.979753
a    82.030846
b    19.799993
c    68.627759
d    77.544548
Name: one, dtype: float64
a    58.652272
b     2.170909
c    18.576406
d    86.979753
Name: three, dtype: float64
b    19.799993
c    68.627759
Name: one, dtype: float64


Unnamed: 0,a,b,c,d
one,82.030846,19.799993,68.627759,77.544548


In [25]:
# 布尔型索引
df <20
print(df[df<20])
print(df[df['a']>20])
print(df[(df['a']>20) & (df['c']>50)])

        a          b          c   d
one   NaN  19.799993        NaN NaN
two   NaN        NaN   4.164134 NaN
three NaN   2.170909  18.576406 NaN
               a          b          c          d
one    82.030846  19.799993  68.627759  77.544548
two    64.303016  52.401617   4.164134  43.068672
three  58.652272   2.170909  18.576406  86.979753
             a          b          c          d
one  82.030846  19.799993  68.627759  77.544548


# dataframe基本技巧

In [26]:
# 数据查看、转置
df = pd.DataFrame(np.random.rand(16).reshape(8,2),
                 columns= ['a','b']
                 )
print(df.head(2))
print(df.T)


          a         b
0  0.135706  0.124121
1  0.368092  0.101623
          0         1         2         3         4         5         6  \
a  0.135706  0.368092  0.261347  0.391935  0.912677  0.932882  0.220915   
b  0.124121  0.101623  0.239317  0.908529  0.462720  0.889670  0.242060   

          7  
a  0.271133  
b  0.774895  


In [27]:
# 删除
df = pd.DataFrame(np.random.rand(16).reshape(8,2),
                 columns= ['a','b']
                 )
df
del df['a']   # 删除列
df
df = pd.DataFrame(np.random.rand(16).reshape(8,2),
                 columns= ['a','b']
                 )
df.drop(0)    # 删除行

Unnamed: 0,a,b
1,0.162641,0.674371
2,0.924912,0.066776
3,0.694119,0.535694
4,0.085521,0.031549
5,0.751804,0.529314
6,0.700514,0.013702
7,0.445913,0.745637


In [28]:
# 排序
# 按值排序
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                 columns = list('abcd')
                 )
print(df)
print(df.sort_values(['a'], ascending= False))
print(df.sort_values(['a','c'], ascending= [False,True]))


           a          b          c          d
0  21.178339  20.258686  75.441898   2.843136
1  66.777176  19.350500  73.532552  23.248276
2  83.219395  31.264318  38.987160  64.172002
3  58.611945  38.456600  79.236644  49.978197
           a          b          c          d
2  83.219395  31.264318  38.987160  64.172002
1  66.777176  19.350500  73.532552  23.248276
3  58.611945  38.456600  79.236644  49.978197
0  21.178339  20.258686  75.441898   2.843136
           a          b          c          d
2  83.219395  31.264318  38.987160  64.172002
1  66.777176  19.350500  73.532552  23.248276
3  58.611945  38.456600  79.236644  49.978197
0  21.178339  20.258686  75.441898   2.843136


In [29]:
# 按索引排序
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                 columns = list('abcd')
                 )
print(df)
print(df.sort_index(ascending=False))

           a          b          c          d
0  50.274661  26.123795  83.006790  81.791103
1  14.856253  99.667816  55.078776  65.592128
2  17.309437  42.512735  94.828529  54.414493
3  40.281286  77.680270  72.639499  62.642820
           a          b          c          d
3  40.281286  77.680270  72.639499  62.642820
2  17.309437  42.512735  94.828529  54.414493
1  14.856253  99.667816  55.078776  65.592128
0  50.274661  26.123795  83.006790  81.791103


# 时间模块

In [30]:
# 主要用到的函数 datetime.date(),  datetime.datetimes(), datetime.timedelta)_
import datetime
import numpy as np
import pandas as pd

In [31]:
today = datetime.date.today()   # 返回今日日期
print(today, type(today))

2021-12-02 <class 'datetime.date'>


In [32]:
# datetime.datetime()
now = datetime.datetime.now()
print(now, type(now))

t1 = datetime.datetime(2021, 1,1)
t2 = datetime.datetime(2021,1,21)
print(t1-t2, type(t1-t2))

2021-12-02 21:47:04.390059 <class 'datetime.datetime'>
-20 days, 0:00:00 <class 'datetime.timedelta'>


In [33]:
# datetime.timedelta 时间差
t1= datetime.datetime(2000,10,1)
tx = datetime.timedelta(100, 3600)   # 100是天，3600是秒
print(t1 + tx)
print(t1-tx)

2001-01-09 01:00:00
2000-06-22 23:00:00


In [34]:
# 日期与文本转换 parser.parse
from dateutil.parser import parse
date1 = '12/21/2017'
date2 = '21/12/2017'
print(parse(date1), type(parse(date1)))

2017-12-21 00:00:00 <class 'datetime.datetime'>


In [35]:
# 日期格式转换
datetime.datetime.now().strftime('%Y-%m-%d')

'2021-12-02'

# 时刻数据

In [36]:
import numpy as np
import pandas as pd
import datetime

In [37]:
# pd.Timestamp
date1 = '20170101'
date2 = datetime.datetime(2016, 10, 1,15)
t1 = pd.Timestamp(date1)
t2 = pd.Timestamp(date2)
print(t1, type(t1))
print(t2, type(t2))

2017-01-01 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2016-10-01 15:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [38]:
# pd.to_datetime
date1 = '20170101'
date2 = datetime.datetime(2016, 10, 1,15)
t1 = pd.to_datetime(date1)
t2 = pd.to_datetime(date2)
print(t1, type(t1))
print(t2, type(t2))

2017-01-01 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2016-10-01 15:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [39]:
# 当传入一个时间列表时，将会变成一个时间序列
date1 = ['2021-10-01', '2021-10-02','2021-10-03', 'hhh']
t1 = pd.to_datetime(date1, errors= 'ignore')   # 忽略错误，直接返回index
t1_1 = pd.to_datetime(date1, errors= 'coerce')   # 把错误值变成缺失值，返回 datetimeindex
print(t1)
print(t1_1)


date2 = ['2021-10-01', '2021-10-02','2021-10-03']
t2 = pd.to_datetime(date2)
print(t2)

Index(['2021-10-01', '2021-10-02', '2021-10-03', 'hhh'], dtype='object')
DatetimeIndex(['2021-10-01', '2021-10-02', '2021-10-03', 'NaT'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2021-10-01', '2021-10-02', '2021-10-03'], dtype='datetime64[ns]', freq=None)


In [40]:
lst = []

for i in range(1,32):
    tt = '2021-10-' + str(i)
    lst.append(tt)
lst
t1 = pd.to_datetime(lst)
t1[15]

Timestamp('2021-10-16 00:00:00')

# 时间戳索引

In [41]:
import numpy as np
import pandas as pd
import datetime
# 核心 pd.date_range()

In [42]:
# datetimeIndex,  timeseries
rng = pd.DatetimeIndex(['2021-10-01', '2021-10-02','2021-10-03', '2021-10-04','2021-10-05'])
st = pd.Series(np.random.rand(5), 
              index = rng
              )
type(st)

pandas.core.series.Series

In [43]:
# 日期生成  pd.date_range
rng1 = pd.date_range('2021/01/01','2021/01/10', normalize= True)  # normalize把时间归到凌晨零点
rng2 = pd.date_range('2021/1/1', periods= 10)
rng3 = pd.date_range(end = '2021/1/10', periods= 10)  # freq参数默认是‘D’天
print(rng1, rng2, rng3)

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D') DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D') DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10'],
              dtype='datetime64[ns]', freq='D')


In [44]:
pd.bdate_range('2021-01-01','2021-01-10')   # 只显示工作日

DatetimeIndex(['2021-01-01', '2021-01-04', '2021-01-05', '2021-01-06',
               '2021-01-07', '2021-01-08'],
              dtype='datetime64[ns]', freq='B')

In [45]:
# Freq参数的含义
# D  天
# B 每工作日
# H 每小时
# T 每分钟
# S 每秒
# L 每毫秒
# U 每微秒
# W-MON  每周的星期一
# WOM-2MON 每个月的第二个星期一
# …… 还有还多，需要手动探索

In [46]:
# asfreq时间频率转换
ts = pd.Series(np.random.rand(5),
              index =pd.date_range('2021/1/1', periods=5))
print(ts)
print(ts.asfreq('4H', method='ffill'))   # 转换成4个小时一次

2021-01-01    0.144589
2021-01-02    0.348657
2021-01-03    0.282218
2021-01-04    0.702608
2021-01-05    0.481673
Freq: D, dtype: float64
2021-01-01 00:00:00    0.144589
2021-01-01 04:00:00    0.144589
2021-01-01 08:00:00    0.144589
2021-01-01 12:00:00    0.144589
2021-01-01 16:00:00    0.144589
2021-01-01 20:00:00    0.144589
2021-01-02 00:00:00    0.348657
2021-01-02 04:00:00    0.348657
2021-01-02 08:00:00    0.348657
2021-01-02 12:00:00    0.348657
2021-01-02 16:00:00    0.348657
2021-01-02 20:00:00    0.348657
2021-01-03 00:00:00    0.282218
2021-01-03 04:00:00    0.282218
2021-01-03 08:00:00    0.282218
2021-01-03 12:00:00    0.282218
2021-01-03 16:00:00    0.282218
2021-01-03 20:00:00    0.282218
2021-01-04 00:00:00    0.702608
2021-01-04 04:00:00    0.702608
2021-01-04 08:00:00    0.702608
2021-01-04 12:00:00    0.702608
2021-01-04 16:00:00    0.702608
2021-01-04 20:00:00    0.702608
2021-01-05 00:00:00    0.481673
Freq: 4H, dtype: float64


In [47]:
# 日期数据平移 .shfit， 是个计算环比，同比的神器
ts = pd.Series(np.random.rand(5),
              index =pd.date_range('2021/1/1', periods=5))
print(ts)
print(ts.shift(1))
print(ts.shift(-1))
print(ts.shift(1, freq= "D"))   # 按日期平移

2021-01-01    0.375299
2021-01-02    0.472630
2021-01-03    0.867585
2021-01-04    0.737084
2021-01-05    0.008125
Freq: D, dtype: float64
2021-01-01         NaN
2021-01-02    0.375299
2021-01-03    0.472630
2021-01-04    0.867585
2021-01-05    0.737084
Freq: D, dtype: float64
2021-01-01    0.472630
2021-01-02    0.867585
2021-01-03    0.737084
2021-01-04    0.008125
2021-01-05         NaN
Freq: D, dtype: float64
2021-01-02    0.375299
2021-01-03    0.472630
2021-01-04    0.867585
2021-01-05    0.737084
2021-01-06    0.008125
Freq: D, dtype: float64


# 时期

In [48]:
# pd.Period
# pd.period_range
# 用法与pd.date_range()

# 时间序列的索引与切片

In [49]:
# 索引
from datetime import datetime
rng = pd.date_range('2017/1','2017/3')
ts = pd.Series(np.random.rand(len(rng)), index = rng)
print(ts.head())
print('------------------------------------------------------')
print(ts[0])
print(ts[:2])
print("------------------------------------------------------------")
print(ts['20170101'])   # 时间序列的标签索引
print(ts.loc['20170105'])   # 时间序列的索引

2017-01-01    0.742815
2017-01-02    0.405468
2017-01-03    0.979043
2017-01-04    0.579695
2017-01-05    0.056764
Freq: D, dtype: float64
------------------------------------------------------
0.7428147649019661
2017-01-01    0.742815
2017-01-02    0.405468
Freq: D, dtype: float64
------------------------------------------------------------
0.7428147649019661
0.05676393127829027


In [50]:
# 切片
print(ts.head())
print(ts.loc['20170101':'20170109'])
print(ts.loc['2017-1'])   # 索引整个一月份数据

2017-01-01    0.742815
2017-01-02    0.405468
2017-01-03    0.979043
2017-01-04    0.579695
2017-01-05    0.056764
Freq: D, dtype: float64
2017-01-01    0.742815
2017-01-02    0.405468
2017-01-03    0.979043
2017-01-04    0.579695
2017-01-05    0.056764
2017-01-06    0.069348
2017-01-07    0.236302
2017-01-08    0.867765
2017-01-09    0.399654
Freq: D, dtype: float64
2017-01-01    0.742815
2017-01-02    0.405468
2017-01-03    0.979043
2017-01-04    0.579695
2017-01-05    0.056764
2017-01-06    0.069348
2017-01-07    0.236302
2017-01-08    0.867765
2017-01-09    0.399654
2017-01-10    0.020087
2017-01-11    0.761033
2017-01-12    0.287959
2017-01-13    0.863372
2017-01-14    0.577327
2017-01-15    0.974139
2017-01-16    0.653929
2017-01-17    0.786010
2017-01-18    0.956817
2017-01-19    0.182328
2017-01-20    0.589650
2017-01-21    0.566517
2017-01-22    0.315131
2017-01-23    0.783096
2017-01-24    0.266576
2017-01-25    0.112392
2017-01-26    0.399262
2017-01-27    0.498735
2017-01-2

# 时间序列重采样

In [51]:
# 降采样
# 升采样

In [52]:
rng = pd.date_range('20170101', periods=12)
ts = pd.Series(np.random.rand(len(rng)), index = rng)
print(ts)
print("----------------------------------")
ts_re = ts.resample('5D')   # 5天重采样
print(ts_re)
print(ts_re.sum())

2017-01-01    0.928059
2017-01-02    0.459823
2017-01-03    0.484885
2017-01-04    0.312629
2017-01-05    0.686358
2017-01-06    0.951114
2017-01-07    0.780496
2017-01-08    0.656226
2017-01-09    0.442526
2017-01-10    0.022492
2017-01-11    0.720819
2017-01-12    0.049470
Freq: D, dtype: float64
----------------------------------
DatetimeIndexResampler [freq=<5 * Days>, axis=0, closed=left, label=left, convention=start, origin=start_day]
2017-01-01    2.871753
2017-01-06    2.852854
2017-01-11    0.770289
Freq: 5D, dtype: float64


In [53]:
# 升采样，降采样

# 数值计算和统计基础

In [54]:
# 常用数学、统计方法

In [55]:
import numpy as np
import pandas as pd

In [56]:
dic = {
    'key1':[4,5,3,np.nan,2],
    'key2':[1,2,np.nan,4,5],
    'key3':[1,2,3,'j','k']
}
df = pd.DataFrame(dic, index = list('abcde'))
df

Unnamed: 0,key1,key2,key3
a,4.0,1.0,1
b,5.0,2.0,2
c,3.0,,3
d,,4.0,j
e,2.0,5.0,k


In [57]:
#axis 和 skipna
print(df.mean())
print(df.mean(axis = 1))   # axis = 0,按列计算，axis= 按行计算
print(df.mean(axis = 1, skipna= False))   # 默认过滤空值

key1    3.5
key2    3.0
dtype: float64
a    2.5
b    3.5
c    3.0
d    4.0
e    3.5
dtype: float64
a    2.5
b    3.5
c    NaN
d    NaN
e    3.5
dtype: float64


  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [58]:
# 主要的常用计算方法
df = pd.DataFrame({'key1':np.arange(10),
                  'key2':np.random.rand(10) *10
                  })
print(df.head())
print("------------------------------")
print(df.count())   # 计数
print(df.min())  # 最小值
print(df.max()) # 最大值
print(df.quantile(q = 0.75))   # 百分位数
print(df.mean(), df.median())  # 均值和中位数
print(df.std(), df.var())   # 标准差和方差
print(df.skew())   # 偏度
print(df.kurt())   # 峰度

   key1      key2
0     0  1.024219
1     1  1.689004
2     2  4.196590
3     3  6.641237
4     4  0.282550
------------------------------
key1    10
key2    10
dtype: int64
key1    0.00000
key2    0.28255
dtype: float64
key1    9.000000
key2    6.659035
dtype: float64
key1    6.750000
key2    5.839401
Name: 0.75, dtype: float64
key1    4.500000
key2    3.633176
dtype: float64 key1    4.500000
key2    3.670538
dtype: float64
key1    3.02765
key2    2.38590
dtype: float64 key1    9.166667
key2    5.692521
dtype: float64
key1    0.000000
key2    0.071141
dtype: float64
key1   -1.200000
key2   -1.565171
dtype: float64


In [59]:
# 累计求和与累计求积
df['key1s'] = df['key1'].cumsum()  # 累计求和
df['key2s'] = df['key2'].cumprod()   # 累计求积
df['key1ss'] =df['key1s'].cummax()  # 求最大值
df['key2ss'] = df['key2'].cummin()  # 求最小值
df

Unnamed: 0,key1,key2,key1s,key2s,key1ss,key2ss
0,0,1.024219,0,1.024219,0,1.024219
1,1,1.689004,1,1.72991,1,1.024219
2,2,4.19659,3,7.259722,3,1.024219
3,3,6.641237,6,48.213537,6,1.024219
4,4,0.28255,10,13.62272,10,0.28255
5,5,6.659035,15,90.714168,15,0.28255
6,6,1.940016,21,175.98697,21,0.28255
7,7,4.453129,28,783.69272,28,0.28255
8,8,3.144486,36,2464.31052,36,0.28255
9,9,6.301492,45,15528.833223,45,0.28255


In [60]:
# 唯一值 unique()
s = pd.Series(list('asjdldghlalkja'))
print(s)
print(s.unique())

0     a
1     s
2     j
3     d
4     l
5     d
6     g
7     h
8     l
9     a
10    l
11    k
12    j
13    a
dtype: object
['a' 's' 'j' 'd' 'l' 'g' 'h' 'k']


In [61]:
# 数值计数
pd.value_counts(s)

a    3
l    3
j    2
d    2
s    1
g    1
h    1
k    1
dtype: int64

In [62]:
# 成员资格
s
s.isin(['a'])

0      True
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9      True
10    False
11    False
12    False
13     True
dtype: bool

# 文本数据

In [63]:
import pandas as pd
import numpy as np

In [64]:
df = pd.DataFrame({
    'key1':list('abcdef'),
    'key2':['hee','fv','w','hija','123', np.nan]
})
s = pd.Series(['A','b','C','bbhello','123', np.nan,'hj'])
print(s.str.count('b'))   # 字符串计数
print(df['key2'].str.upper())   # 变成大写
df.columns.str.upper()  # 大写

0    0.0
1    1.0
2    0.0
3    2.0
4    0.0
5    NaN
6    0.0
dtype: float64
0     HEE
1      FV
2       W
3    HIJA
4     123
5     NaN
Name: key2, dtype: object


Index(['KEY1', 'KEY2'], dtype='object')

In [65]:
print(s.str.lower())   # 小写
print(s.str.upper())  # d大写
print(s.str.len())    # 长度
print(s.str.startswith('b'))   # 判断是否以xx开始
print(s.str.endswith('3'))  # 判断是否以 xx结束
print(s.str.strip())  # 删除空格
print(s.str.lstrip())  # 删除左侧的空格
print(s.str.rstrip())  # 删除右侧的空格

0          a
1          b
2          c
3    bbhello
4        123
5        NaN
6         hj
dtype: object
0          A
1          B
2          C
3    BBHELLO
4        123
5        NaN
6         HJ
dtype: object
0    1.0
1    1.0
2    1.0
3    7.0
4    3.0
5    NaN
6    2.0
dtype: float64
0    False
1     True
2    False
3     True
4    False
5      NaN
6    False
dtype: object
0    False
1    False
2    False
3    False
4     True
5      NaN
6    False
dtype: object
0          A
1          b
2          C
3    bbhello
4        123
5        NaN
6         hj
dtype: object
0          A
1          b
2          C
3    bbhello
4        123
5        NaN
6         hj
dtype: object
0          A
1          b
2          C
3    bbhello
4        123
5        NaN
6         hj
dtype: object


In [66]:
# 替换 replace
df = pd.DataFrame(np.random.randn(3,2), 
                 columns = [' column A', 'column B']
                 )
df.columns = df.columns.str.replace(' ','-')
df.columns.str.replace('-', '[]', n = 1)   # 替换第1个

Index(['[]column-A', 'column[]B'], dtype='object')

In [67]:
# 分割 split, rsplit
s = pd.Series(['a,b,c', '1,2,3', ['a,,,c'], np.nan])
print(s)
print(s.str.split(","))
print(s.str.split(",")[0])
print(s.str.split(",", expand= True))

0      a,b,c
1      1,2,3
2    [a,,,c]
3        NaN
dtype: object
0    [a, b, c]
1    [1, 2, 3]
2          NaN
3          NaN
dtype: object
['a', 'b', 'c']
     0    1    2
0    a    b    c
1    1    2    3
2  NaN  NaN  NaN
3  NaN  NaN  NaN


# 合并

In [68]:
# merge合并
import pandas as pd
import numpy as np
df1 = pd.DataFrame({'key':['k0','k1','k2','k3'],
                   'A':['a0','a1','a2','a3'],
                    'B':['b0','b1','b2','b3']
                   })
df2 = pd.DataFrame({'key':['k0','k1','k2','k3'],
                   'C':['a0','a1','a2','a3'],
                    'D':['b0','b1','b2','b3']
                   })
df3 = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                    'key2':['k0','k1','k2','k3'],
                   'C':['a0','a1','a2','a3'],
                    'D':['b0','b1','b2','b3']
                   })
df4 = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                    'key2':['k0','k1','k2','k3'],
                   'C':['a0','a1','a2','a3'],
                    'D':['b0','b1','b2','b3']
                   })
pd.merge(df1, df2, 
         on = 'key',   # 键值
         how = 'inner'
        )
pd.merge(df3, df4, 
        on = ['key1','key2'])   # how = inner, outer, left, right

Unnamed: 0,key1,key2,C_x,D_x,C_y,D_y
0,k0,k0,a0,b0,a0,b0
1,k1,k1,a1,b1,a1,b1
2,k2,k2,a2,b2,a2,b2
3,k3,k3,a3,b3,a3,b3


In [69]:
# 还有left_on, right_on， left_index, right_index，这里不细说了
# 也有df1.join(df2)的方式，但为了统一，我建议还是用pd.merge的方式

# 连接与修补

In [70]:
# concat
s1 =pd.Series([1,2,3])
s2 = pd.Series([4,5,6])
print(pd.concat([s1, s2]))
print(pd.concat([s1, s2], axis = 1))   

0    1
1    2
2    3
0    4
1    5
2    6
dtype: int64
   0  1
0  1  4
1  2  5
2  3  6


In [71]:
# 连接方式
s5 = pd.Series([1,2,3], index =['a','b','c'])
s6 = pd.Series([2,3,4], index = ['b','c','d'])
print(pd.concat([s5, s6], axis = 1))
print(pd.concat([s5, s6], axis = 1, join = 'inner'))
print(pd.concat([s5,s6], keys = ['one','two'], axis= 1))

     0    1
a  1.0  NaN
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0
   0  1
b  2  2
c  3  3
   one  two
a  1.0  NaN
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0


In [72]:
# 修复修补 combine_first
df1 = pd.DataFrame([[np.nan, 3, 5], [-4, 6, np.nan, np.nan], [np.nan, 7, np.nan]])
df2 = pd.DataFrame([[42.6, np.nan, -8.2], [-5, 1.6, 4]], index = ['a',1])
print(df1)
print(df2)
df1.combine_first(df2)   # 若df1是空的，则以df2的值填充

     0  1    2   3
0  NaN  3  5.0 NaN
1 -4.0  6  NaN NaN
2  NaN  7  NaN NaN
      0    1    2
a  42.6  NaN -8.2
1  -5.0  1.6  4.0


Unnamed: 0,0,1,2,3
0,,3.0,5.0,
1,-4.0,6.0,4.0,
2,,7.0,,
a,42.6,,-8.2,


In [73]:
print(df1.update(df2))

None


# 去重及替换

In [74]:
import pandas as pd
import numpy as np

In [75]:
# duplicated
s = pd.Series([1,1,1,1,2,2,2,8,4,5])
print(s.duplicated())
print(s[s.duplicated() == False])   # 去重后的结果
print("-----------")
print(s.unique())
print(s.drop_duplicates())

0    False
1     True
2     True
3     True
4    False
5     True
6     True
7    False
8    False
9    False
dtype: bool
0    1
4    2
7    8
8    4
9    5
dtype: int64
-----------
[1 2 8 4 5]
0    1
4    2
7    8
8    4
9    5
dtype: int64


In [76]:
# replace 替换
s = pd.Series(list('abcdlajdalj'))
print(s.replace('a','|'))   # 替换一个值
print(s.replace({'a':123, 'j':'hhah'}))

0     |
1     b
2     c
3     d
4     l
5     |
6     j
7     d
8     |
9     l
10    j
dtype: object
0      123
1        b
2        c
3        d
4        l
5      123
6     hhah
7        d
8      123
9        l
10    hhah
dtype: object


# 数据分组

In [77]:
import pandas as pd
import numpy as np
# df.groupby()

In [78]:
df = pd.DataFrame({
    'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
    'B':['one','one','two','three','two','two','one','three'],
    'C':np.random.randn(8),
    'D':np.random.randn(8)
})
print(df)
print('======================================================')
print(df.groupby(['A']).sum())
print(df.groupby(['A','B']).mean())
# 还有get_group和 groups的方法，不做详细研究了
print("=========================================================")
print(df.groupby(['A']).get_group('foo'))

     A      B         C         D
0  foo    one -0.637521  0.810651
1  bar    one  0.925183  0.532303
2  foo    two -0.271743  0.561164
3  bar  three  0.293901  0.683245
4  foo    two  0.762219  0.211454
5  bar    two -1.099593 -0.767441
6  foo    one  0.591458  1.769207
7  foo  three -1.464069  0.270520
            C         D
A                      
bar  0.119491  0.448108
foo -1.019656  3.622997
                  C         D
A   B                        
bar one    0.925183  0.532303
    three  0.293901  0.683245
    two   -1.099593 -0.767441
foo one   -0.023032  1.289929
    three -1.464069  0.270520
    two    0.245238  0.386309
     A      B         C         D
0  foo    one -0.637521  0.810651
2  foo    two -0.271743  0.561164
4  foo    two  0.762219  0.211454
6  foo    one  0.591458  1.769207
7  foo  three -1.464069  0.270520


In [79]:
# 通过字典或者series分组
df = pd.DataFrame(np.arange(16).reshape(4,4),
                 columns = ['a','b','c','d'])
mapping = {'a':'one', 'b':'one', 'c':'two', 'd':'two'}
print(df)
print(df.groupby(mapping, axis =1).sum())

print("==========================================")
s = pd.Series(mapping)
print(s)
print(s.groupby(s).count())

    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
   one  two
0    1    5
1    9   13
2   17   21
3   25   29
a    one
b    one
c    two
d    two
dtype: object
one    2
two    2
dtype: int64


In [80]:
# 通过函数分组
# df.groupby(len).sum()


In [81]:
df = pd.DataFrame({
    'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
    'B':['one','one','two','three','two','two','one','three'],
    'C':np.random.randn(8),
    'D':np.random.randn(8)
})


In [82]:
# 多函数计算
df = pd.DataFrame({
    'a':[1,1,2,2],
    'b':np.random.rand(4),
    'c':np.random.rand(4)
})
df


print(df)
print(df.groupby('a').agg(['mean',np.sum]))
print(df.groupby('a')['b'].agg(['mean',np.sum]))
# # 课程的示例是可以的，自己写就报错了，，，
# print(df.groupby('a')['b'].agg({'result1':'mean',
#                                'result2':np.sum}))

   a         b         c
0  1  0.224984  0.623259
1  1  0.359628  0.059854
2  2  0.553607  0.446950
3  2  0.137425  0.007519
          b                   c          
       mean       sum      mean       sum
a                                        
1  0.292306  0.584612  0.341556  0.683113
2  0.345516  0.691032  0.227234  0.454469
       mean       sum
a                    
1  0.292306  0.584612
2  0.345516  0.691032


# 分组转换及一般性拆分、应用与合并

In [83]:
import pandas as pd
import numpy as np

In [84]:
df = pd.DataFrame({
    'key1':list('aabba'),
    'key2':['one','two','one','two','one'],
    'data1':np.random.rand(5),
    'data2':np.random.rand(5)
})
print(df)
right_on = df.groupby('key1').mean()
print(right_on)   # 求均值，接下来要把均值附在对应的key1上
print(pd.merge(df,right_on,left_on='key1', right_index= True ).add_prefix('mean_'))   # add_prefix，添加前缀，这里只是演示，没有意义

  key1 key2     data1     data2
0    a  one  0.813571  0.993472
1    a  two  0.627499  0.432748
2    b  one  0.933498  0.390462
3    b  two  0.500433  0.294020
4    a  one  0.599511  0.440704
         data1     data2
key1                    
a     0.680194  0.622308
b     0.716966  0.342241
  mean_key1 mean_key2  mean_data1_x  mean_data2_x  mean_data1_y  mean_data2_y
0         a       one      0.813571      0.993472      0.680194      0.622308
1         a       two      0.627499      0.432748      0.680194      0.622308
4         a       one      0.599511      0.440704      0.680194      0.622308
2         b       one      0.933498      0.390462      0.716966      0.342241
3         b       two      0.500433      0.294020      0.716966      0.342241


In [85]:
# transform实现把groupby后的结果与原表结构一致
df.groupby('key1').transform(np.mean)

  


Unnamed: 0,data1,data2
0,0.680194,0.622308
1,0.680194,0.622308
2,0.716966,0.342241
3,0.716966,0.342241
4,0.680194,0.622308


In [86]:
# apply,一般化的groupby方法

In [87]:
df = pd.DataFrame({
    'key1':list('aabba'),
    'key2':['one','two','one','two','one'],
    'data1':np.random.rand(5),
    'data2':np.random.rand(5)
})
print(df)
print(df.groupby('key1').apply(lambda x: x.describe()))

  key1 key2     data1     data2
0    a  one  0.292745  0.833577
1    a  two  0.631586  0.977020
2    b  one  0.980718  0.816113
3    b  two  0.409638  0.244003
4    a  one  0.743737  0.411584
               data1     data2
key1                          
a    count  3.000000  3.000000
     mean   0.556023  0.740727
     std    0.234799  0.293930
     min    0.292745  0.411584
     25%    0.462166  0.622581
     50%    0.631586  0.833577
     75%    0.687661  0.905298
     max    0.743737  0.977020
b    count  2.000000  2.000000
     mean   0.695178  0.530058
     std    0.403815  0.404543
     min    0.409638  0.244003
     25%    0.552408  0.387030
     50%    0.695178  0.530058
     75%    0.837948  0.673086
     max    0.980718  0.816113


In [88]:
# 自定义函数，应用到df里
# 按照index排序后返回前n行
def f1(d,n):
    return(d.sort_index()[:n])

# 返回k1列
def f2(d,k1):
    return(d[k1])

In [89]:
print(df.groupby('key1').apply(f1, 2))
print('===============================')
print(df.groupby('key1').apply(f2, 'data1'))

       key1 key2     data1     data2
key1                                
a    0    a  one  0.292745  0.833577
     1    a  two  0.631586  0.977020
b    2    b  one  0.980718  0.816113
     3    b  two  0.409638  0.244003
key1   
a     0    0.292745
      1    0.631586
      4    0.743737
b     2    0.980718
      3    0.409638
Name: data1, dtype: float64


# 透视表及交叉表

In [90]:
# 感觉透视表和交叉表很像
# pivot_table/ crosstab
df = pd.DataFrame({'date':pd.date_range('2021-01-01', periods= 7),
                  'key':list('abcdabc'),
                   'values':np.random.rand(7)
                  })
print(df)
print("==================================")
print(pd.pivot_table(df, values = 'values', index = 'date', columns=  'key', aggfunc = np.sum))

print("==================================")
print(pd.pivot_table(df, values = 'values', index = ['date','key'], aggfunc = np.sum))

        date key    values
0 2021-01-01   a  0.300069
1 2021-01-02   b  0.331118
2 2021-01-03   c  0.188535
3 2021-01-04   d  0.537738
4 2021-01-05   a  0.915550
5 2021-01-06   b  0.046333
6 2021-01-07   c  0.074196
key                a         b         c         d
date                                              
2021-01-01  0.300069       NaN       NaN       NaN
2021-01-02       NaN  0.331118       NaN       NaN
2021-01-03       NaN       NaN  0.188535       NaN
2021-01-04       NaN       NaN       NaN  0.537738
2021-01-05  0.915550       NaN       NaN       NaN
2021-01-06       NaN  0.046333       NaN       NaN
2021-01-07       NaN       NaN  0.074196       NaN
                  values
date       key          
2021-01-01 a    0.300069
2021-01-02 b    0.331118
2021-01-03 c    0.188535
2021-01-04 d    0.537738
2021-01-05 a    0.915550
2021-01-06 b    0.046333
2021-01-07 c    0.074196


In [91]:
# 交叉表，统计频率的时候用 crosstab
df = pd.DataFrame({'A':[1,2,2,2,2],
                  'B':[3,3,4,4,4],
                   'C':[1,1,np.nan, 2,1]
                  })
print(df)
print("------------")
print(pd.crosstab(df['A'],df['B']))

print("------------")
print(pd.crosstab(df['A'],df['B'], normalize= True))  # normalize，百分比化


print("------------")
print(pd.crosstab(df['A'],df['B'], values = df['C'], aggfunc= np.sum, margins=True))  # 求和,也可以使用pivot_table实现
# pd.pivot_table(df, index = 'A',columns='B', values = 'C', aggfunc= np.sum, margins=True)


   A  B    C
0  1  3  1.0
1  2  3  1.0
2  2  4  NaN
3  2  4  2.0
4  2  4  1.0
------------
B  3  4
A      
1  1  0
2  1  3
------------
B    3    4
A          
1  0.2  0.0
2  0.2  0.6
------------
B      3    4  All
A                 
1    1.0  NaN  1.0
2    1.0  3.0  4.0
All  2.0  3.0  5.0


# 数据读取

In [92]:
import pandas as pd
import os
# os.getcwd()
# os.chdir("C:\\Users\\kwai\\python微专业")

In [93]:
# pd.read_table
# pd.read_csv(engine = 'python', encoding = 'utf-8')
# pd.read_excel

# 表格样式创建

In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [95]:
# 样式
df = pd.DataFrame(np.random.randn(10,4), columns= list('abcd'))
sty = df.style
print(df.head())
print(sty, type(sty))

          a         b         c         d
0 -0.866131 -1.786248 -0.165153  1.067828
1 -1.434982  0.525166 -1.436409 -0.486236
2  1.160070 -0.029489  0.631900  0.575730
3 -0.221873 -2.100628  0.170238  0.913629
4  0.035609  1.816656 -0.757583 -0.303061
<pandas.io.formats.style.Styler object at 0x000001F45E3DEC08> <class 'pandas.io.formats.style.Styler'>


In [96]:
# 按元素处理，大于0是黑色，小于0是红色, style.applymap()
def color_neg_red(val):
    if val <0:
        color ='red'
    else:
        color = 'black'
    return('color:%s'%color)
df.style.applymap(color_neg_red)

Unnamed: 0,a,b,c,d
0,-0.866131,-1.786248,-0.165153,1.067828
1,-1.434982,0.525166,-1.436409,-0.486236
2,1.16007,-0.029489,0.6319,0.57573
3,-0.221873,-2.100628,0.170238,0.913629
4,0.035609,1.816656,-0.757583,-0.303061
5,2.090387,-0.867173,-0.154298,-0.535484
6,-0.022712,0.973569,-0.371145,1.397919
7,0.147529,-0.876926,0.075051,-1.464677
8,-0.481979,-0.152315,-0.914538,0.387101
9,0.386835,0.4205,-0.499488,-0.835586


In [97]:
# 按行或者列处理， style.apply(),将每列的最大值高亮
def highlight_max(s):
    is_max = s == s.max()   # s == s.max()判断是否最大值，然后赋值给is_max
#     print(is_max)
    lst =[]
    for v in is_max:
        if v:
            lst.append("background-color:green")
        else:
            lst.append("")
#     print(lst)
    return(lst)
df.style.apply(highlight_max,axis = 0,subset = ['b','c'])
# axis, 0是按列，1是按行
# subset指定索引列

Unnamed: 0,a,b,c,d
0,-0.866131,-1.786248,-0.165153,1.067828
1,-1.434982,0.525166,-1.436409,-0.486236
2,1.16007,-0.029489,0.6319,0.57573
3,-0.221873,-2.100628,0.170238,0.913629
4,0.035609,1.816656,-0.757583,-0.303061
5,2.090387,-0.867173,-0.154298,-0.535484
6,-0.022712,0.973569,-0.371145,1.397919
7,0.147529,-0.876926,0.075051,-1.464677
8,-0.481979,-0.152315,-0.914538,0.387101
9,0.386835,0.4205,-0.499488,-0.835586


In [98]:
# 样式索引，切片，如果只选择列，按照subset = ['b','c']写就行，如果还要考虑行，需要这么写
df.style.apply(highlight_max,axis = 0,subset = pd.IndexSlice[0:3, ['b','c']])


Unnamed: 0,a,b,c,d
0,-0.866131,-1.786248,-0.165153,1.067828
1,-1.434982,0.525166,-1.436409,-0.486236
2,1.16007,-0.029489,0.6319,0.57573
3,-0.221873,-2.100628,0.170238,0.913629
4,0.035609,1.816656,-0.757583,-0.303061
5,2.090387,-0.867173,-0.154298,-0.535484
6,-0.022712,0.973569,-0.371145,1.397919
7,0.147529,-0.876926,0.075051,-1.464677
8,-0.481979,-0.152315,-0.914538,0.387101
9,0.386835,0.4205,-0.499488,-0.835586


In [99]:
# 按照上面那样的话，依然后展示全部的表格，只是计算时只考虑指定的行，也可以先切片再修改样式
df[0:3].style.apply(highlight_max,axis = 0,subset = pd.IndexSlice[:, ['b','c']])

Unnamed: 0,a,b,c,d
0,-0.866131,-1.786248,-0.165153,1.067828
1,-1.434982,0.525166,-1.436409,-0.486236
2,1.16007,-0.029489,0.6319,0.57573


# 表格显示控制

In [100]:
df = pd.DataFrame(np.random.randn(10,4), columns= list('abcd'))
print(df.head())

          a         b         c         d
0  0.169578  0.982422  1.437154 -0.426751
1  0.391591  1.533879 -1.321112  0.496504
2  0.391439 -0.948906  1.863978 -0.626695
3  1.379763 -0.109205  0.326243 -0.834177
4  0.234890  0.939610 -0.102158  0.427410


In [101]:
# 设置数值格式,每一列单独设置
df.style.format({"a":"{:+.2%}",
                "b":"{:,.2F}",
                 "c":"{:,.4}"
                })

Unnamed: 0,a,b,c,d
0,+16.96%,0.98,1.437,-0.426751
1,+39.16%,1.53,-1.321,0.496504
2,+39.14%,-0.95,1.864,-0.626695
3,+137.98%,-0.11,0.3262,-0.834177
4,+23.49%,0.94,-0.1022,0.42741
5,+90.88%,0.16,0.9454,-1.009794
6,+188.93%,1.32,-1.234,-1.048575
7,+95.74%,-0.94,-0.5315,-0.039059
8,-77.80%,-1.56,-1.148,0.205003
9,+39.04%,1.23,0.4525,-0.493237


In [102]:
# 设置数值格式,统一设置
df.style.format("{:,.2f}")

Unnamed: 0,a,b,c,d
0,0.17,0.98,1.44,-0.43
1,0.39,1.53,-1.32,0.5
2,0.39,-0.95,1.86,-0.63
3,1.38,-0.11,0.33,-0.83
4,0.23,0.94,-0.1,0.43
5,0.91,0.16,0.95,-1.01
6,1.89,1.32,-1.23,-1.05
7,0.96,-0.94,-0.53,-0.04
8,-0.78,-1.56,-1.15,0.21
9,0.39,1.23,0.45,-0.49


# 表格样式调用

In [103]:
import pandas as pd
import numpy as np

In [104]:
#  定位空值
df = pd.DataFrame(np.random.rand(5,4), columns= list('ABCD'))
df['A'][2] = np.nan
df.style.highlight_null(null_color = 'red')

Unnamed: 0,A,B,C,D
0,0.198439,0.949658,0.837903,0.201434
1,0.70935,0.610154,0.722327,0.799442
2,,0.655435,0.751909,0.066147
3,0.566917,0.569205,0.146852,0.156414
4,0.484352,0.390742,0.00115,0.668495


In [105]:
# 色彩映射，条件格式，热力图
df = pd.DataFrame(np.random.rand(5,4), columns= list('ABCD'))
df.style.background_gradient(cmap = 'Greens', axis = 0, low= 0 ,high = 1)

Unnamed: 0,A,B,C,D
0,0.442421,0.46802,0.502689,0.316616
1,0.550134,0.516862,0.106755,0.61187
2,0.639855,0.230808,0.665946,0.204034
3,0.322482,0.054214,0.315975,0.169877
4,0.433253,0.351634,0.558934,0.356999


In [106]:
# 条件格式，条形图
df = pd.DataFrame(np.random.rand(5,4), columns= list('ABCD'))
df.style.bar(subset =['A','B'],
            color = 'lightgreen',
             width = 90
            )

Unnamed: 0,A,B,C,D
0,0.543452,0.926254,0.172859,0.86675
1,0.942659,0.674926,0.917356,0.810474
2,0.807904,0.903484,0.312557,0.336813
3,0.984166,0.084286,0.250099,0.27589
4,0.320015,0.343834,0.814205,0.897364


In [107]:
# 分段式写法，链式写法，同时应用多个格式
df = pd.DataFrame(np.random.rand(5,4), columns= list('ABCD'))
df['A'][2] = np.nan
df.style.bar(subset =['A','B'], color = 'lightgreen',  width = 100).\
    highlight_null(null_color = 'yellow').\
    highlight_max().\
    format({"A":"{:+.2%}",
                "B":"{:,.2F}",
                 "C":"{:,.4}"
                })

Unnamed: 0,A,B,C,D
0,+64.47%,0.48,0.8638,0.991156
1,+72.85%,0.57,0.07755,0.361102
2,+nan%,0.03,0.9761,0.908863
3,+51.19%,0.68,0.5903,0.46906
4,+41.30%,0.12,0.3646,0.403229
