# Pandas 基础操作

## Series
* 同时具备列表和字典的功能。
  * 使用默认索引时，具备列表的功能。
  * 使用文字索引时，具备字典的功能。
* 最重要的功能：运算的时候自动对齐。
  * 对齐的依据：索引。

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [2]:
# Build a Series from a plain Python list.
ls1 = ['a', 'b', 'c', 'd', 'a', 'b', 'c']
print(ls1)
# With no explicit index, an integer index 0..N-1 is generated automatically.
sr1 = Series(data=ls1)
# Show the Series itself, then its index, then its underlying values.
print(sr1, sr1.index, sr1.values, sep='\n')

['a', 'b', 'c', 'd', 'a', 'b', 'c']
0    a
1    b
2    c
3    d
4    a
5    b
6    c
dtype: object
RangeIndex(start=0, stop=7, step=1)
['a' 'b' 'c' 'd' 'a' 'b' 'c']


In [3]:
# Create a Series with explicit string labels as its index.
sr2 = Series([1, 4, 9, 16], index=list('abcd'))
print(sr2)

# Look up a single value by its label.
print("sr2['a'] = %d" % sr2['a'])

# Assign one value to several labels at once.
# The labels are passed as a list through .loc: a bare tuple key such as
# sr2['a', 'b'] is treated as ONE (MultiIndex-style) key, so it is not a
# reliable way to address several labels of a flat index.
sr2.loc[['a', 'b']] = 10
print(sr2)

# Select a subset of labels; the result is a new Series.
print(sr2[list('bd')])

# Filter with a boolean condition; the result is a new Series.
print(sr2[sr2 == 10])

a     1
b     4
c     9
d    16
dtype: int64
sr2['a'] = 1
a    10
b    10
c     9
d    16
dtype: int64
b    10
d    16
dtype: int64
a    10
b    10
dtype: int64


In [15]:
# Build a Series from a dict: the keys become the index labels.
sdata = dict(a=12, b=34, c=56)
sd1 = Series(sdata)
print(sd1)

# Slice by label range; with labels the end label ('b') is included.
sd2 = sd1.loc['a':'b']
print(sd2)

# Slice by condition: keep only the entries whose value exceeds 30.
above_30 = sd1 > 30
sd3 = sd1[above_30]
del above_30  # temporary mask only; do not leave an extra name behind
print(sd3)

a    12
b    34
c    56
dtype: int64
a    12
b    34
dtype: int64
b    34
c    56
dtype: int64


In [5]:
# Element-wise computation on a Series (work on a copy so sr2 is untouched).
sr3 = sr2.copy()
print(sr3)

# Basic arithmetic is applied to every element; the index is preserved.
sr4 = 1 + 0.1 * sr3
print(sr4)

# NumPy ufuncs accept a Series and return a Series with the same index.
sr5 = np.exp(sr4)  # exponential
sr6 = np.log(sr4)  # natural logarithm
print(sr5)
print(sr6)

a    10
b    10
c     9
d    16
dtype: int64
a    2.0
b    2.0
c    1.9
d    2.6
dtype: float64
a     7.389056
b     7.389056
c     6.685894
d    13.463738
dtype: float64
a    0.693147
b    0.693147
c    0.641854
d    0.955511
dtype: float64


In [6]:
# Null check: boolean Series, True where the value is missing.
sn1 = sr6.isnull()
print(sn1)

# Not-null check: boolean Series, True where the value is present.
sn2 = sr6.notnull()
print(sn2)

a    False
b    False
c    False
d    False
dtype: bool
a    True
b    True
c    True
d    True
dtype: bool


In [7]:
# Change the name and the index of a Series (on a copy, sr2 stays as is).
sr7 = sr2.copy()
print(sr7)

# Give the Series a name; it is shown in the printed footer.
sr7.name = 'Money'
print(sr7)

# Replace the index labels in place; the values are unchanged.
sr7.index = list('ABCD')
print(sr7)



a    10
b    10
c     9
d    16
dtype: int64
a    10
b    10
c     9
d    16
Name: Money, dtype: int64
A    10
B    10
C     9
D    16
Name: Money, dtype: int64


In [8]:
# Adding two Series whose indexes only partly overlap.
sr81 = Series([1, 2], index=['a', 'b'])
print(sr81)
sr82 = Series([8, 4], index=['b', 'c'])
print(sr82)
# The result index is the union of both indexes; values are added where a
# label exists on both sides, and labels present on one side only give NaN.
sr83 = sr81.add(sr82)
print(sr83)

a    1
b    2
dtype: int64
b    8
c    4
dtype: int64
a     NaN
b    10.0
c     NaN
dtype: float64


## DataFrame
* 表格型数据结构。
  * 同时具有行索引和列索引。
  * 行和列都可以作为一个 Series 取出。

In [9]:
# Build a DataFrame from a dict: each key is a column name and each list
# holds that column's values.
dt1 = dict(
    aa=[1, 2, 3, 4],
    bb=[1.1, 2.2, 3.3, 4.4],
    cc=[1.11, 2.22, 3.33, 4.44],
    dd=[1.111, 2.222, 3.333, 4.444],
)
df1 = DataFrame(data=dt1)
print(df1)

# Replace the default integer row index with string labels.
df1.index = 'one two three four'.split()
print(df1)

   aa   bb    cc     dd
0   1  1.1  1.11  1.111
1   2  2.2  2.22  2.222
2   3  3.3  3.33  3.333
3   4  4.4  4.44  4.444
       aa   bb    cc     dd
one     1  1.1  1.11  1.111
two     2  2.2  2.22  2.222
three   3  3.3  3.33  3.333
four    4  4.4  4.44  4.444


In [23]:
# Vertical slice: pick columns by name; the result is a new DataFrame.
df2 = df1.loc[:, ['aa', 'bb']]
print(df2)
print(type(df2))

# Horizontal slice: pick rows by index label.
df3 = df1.loc[['one', 'two'], :]
print(df3)
print(type(df3))

# Slice by range: the first two rows, selected by position.
df4 = df1.iloc[:2]
print(df4)

# Slice by condition on a column: rows whose 'cc' value exceeds 3.3.
df5 = df1.loc[df1['cc'] > 3.3]
print(df5)


       aa   bb
one     1  1.1
two     2  2.2
three   3  3.3
four    4  4.4
<class 'pandas.core.frame.DataFrame'>
     aa   bb    cc     dd
one   1  1.1  1.11  1.111
two   2  2.2  2.22  2.222
<class 'pandas.core.frame.DataFrame'>
     aa   bb    cc     dd
one   1  1.1  1.11  1.111
two   2  2.2  2.22  2.222
       aa   bb    cc     dd
three   3  3.3  3.33  3.333
four    4  4.4  4.44  4.444


In [11]:
print(df1)
# Take one column; the result is a Series indexed by the row labels.
# Bracket access and attribute access select the same column.
sf1 = df1['aa']
sf3 = df1.aa
print(sf1)
print(type(sf1))
print(sf3)
# Compare by content rather than identity: whether two separate accesses
# hand back the very same object depends on pandas' internal column cache,
# so an `is` check is not a stable way to show that both forms agree.
print(sf1.equals(sf3))

# Take one row by label; the result is a Series indexed by the column names.
sf2 = df1.loc['two']
print(sf2)
print(type(sf2))

       aa   bb    cc     dd
one     1  1.1  1.11  1.111
two     2  2.2  2.22  2.222
three   3  3.3  3.33  3.333
four    4  4.4  4.44  4.444
one      1
two      2
three    3
four     4
Name: aa, dtype: int64
<class 'pandas.core.series.Series'>
one      1
two      2
three    3
four     4
Name: aa, dtype: int64
True
aa    2.000
bb    2.200
cc    2.220
dd    2.222
Name: two, dtype: float64
<class 'pandas.core.series.Series'>
