In [1]:
import numpy as np
import pandas as pd

## ```pandas.Series```

In [2]:
# A Series is a one-dimensional array of values paired with an index.
data = pd.Series(
    data=[0.25, 0.36, 0.44, 1.55],
)
data

0    0.25
1    0.36
2    0.44
3    1.55
dtype: float64

In [3]:
# The values of the Series, returned as a NumPy array
data.values

array([0.25, 0.36, 0.44, 1.55])

In [4]:
# The index object that labels the values (a RangeIndex when none was given)
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Access a single element through its index, as with a NumPy array
data[0]

0.25

In [6]:
# Slicing returns a Series that keeps the index labels of the selected elements
data[1:3]

1    0.36
2    0.44
dtype: float64

相比NumPy数组通过**隐式定义**的整数索引获取数值，Series通过**显式定义**的索引与数值关联。

In [7]:
# Example: the index does not have to be integers; here it is string labels.
data = pd.Series(
    data=[0.25, 0.36, 0.44, 1.55],
    index=['a', 'b', 'c', 'd'],
)
data

a    0.25
b    0.36
c    0.44
d    1.55
dtype: float64

In [8]:
# Look up a value by its string label
data['a']

0.25

In [9]:
# Example: the index may also be non-contiguous integers.
data = pd.Series(
    data=[0.25, 0.36, 0.44, 1.55],
    index=[1, 3, 5, 7],
)
data

1    0.25
3    0.36
5    0.44
7    1.55
dtype: float64

In [10]:
# 7 is matched against the explicit index labels, not used as a position
# (the Series holds only four elements).
data[7]

1.55

In [11]:
# Analogy with a dictionary: a Series can be built straight from a dict,
# in which case the dict keys become the index.
population_dict = dict(Jiangsu=10, Beijing=11, Hubei=12, Nanjin=13)
population = pd.Series(data=population_dict)
population

Jiangsu    10
Beijing    11
Hubei      12
Nanjin     13
dtype: int64

In [12]:
# Ways to create a Series object
# 1. From a list: the index defaults to the integer sequence 0..n-1
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [13]:
# 2. From a scalar: the value is repeated for every label in the given index
pd.Series(5, index=[10, 20, 30])

10    5
20    5
30    5
dtype: int64

In [14]:
# 3. From a dict: the keys become the index, in the dict's own order
pd.Series({2:'a', 4:'b', 3:'c'})

2    a
4    b
3    c
dtype: object

In [15]:
# 4. From a dict plus an explicit index:
# only the key-value pairs whose keys are listed in `index` are kept,
# in the order given by `index`.
pd.Series({2:'a', 4:'b', 3:'c'}, index=[3,2])

3    c
2    a
dtype: object

## ```pandas.DataFrame```

In [16]:
# A DataFrame is a flexible two-dimensional array with an index; it can be
# seen as an ordered collection of Series objects. Build a second Series
# to combine with `population`.
area_dict = dict(Jiangsu=100, Beijing=200, Hubei=300, Wuhan=400)
area = pd.Series(data=area_dict)
area

Jiangsu    100
Beijing    200
Hubei      300
Wuhan      400
dtype: int64

In [17]:
# Combine the two Series into one table. Rows are matched on index labels;
# a label present in only one Series gets NaN in the other column.
df = pd.DataFrame(data=dict(Area=area, Population=population))
df

Unnamed: 0,Area,Population
Beijing,200.0,11.0
Hubei,300.0,12.0
Jiangsu,100.0,10.0
Nanjin,,13.0
Wuhan,400.0,


In [18]:
# Row labels of the DataFrame
df.index

Index(['Beijing', 'Hubei', 'Jiangsu', 'Nanjin', 'Wuhan'], dtype='object')

In [19]:
# Column labels of the DataFrame (also an Index object)
df.columns

Index(['Area', 'Population'], dtype='object')

In [20]:
# Selecting a column by name returns it as a Series
df['Area']

Beijing    200.0
Hubei      300.0
Jiangsu    100.0
Nanjin       NaN
Wuhan      400.0
Name: Area, dtype: float64

In [21]:
# Ways to create a DataFrame object
# 1. From a single Series (becomes a one-column DataFrame)
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
Jiangsu,10
Beijing,11
Hubei,12
Nanjin,13


In [22]:
# 2. From a list of dictionaries: each dict is one row, its keys are the columns
data = [dict(num=n, pow=n ** 2) for n in range(4)]
df = pd.DataFrame(
    data=data,
    index=['zero', 'one', 'two', 'three'],
)
df

Unnamed: 0,num,pow
zero,0,0
one,1,1
two,2,4
three,3,9


In [23]:
# 3. From a dictionary of Series objects (one Series per column)
df = pd.DataFrame(data=dict(Area=area, Population=population))
df

Unnamed: 0,Area,Population
Beijing,200.0,11.0
Hubei,300.0,12.0
Jiangsu,100.0,10.0
Nanjin,,13.0
Wuhan,400.0,


In [24]:
# 4. From a two-dimensional NumPy array
# A seeded generator is used so the random values, and therefore the table
# shown below, are identical on every Restart & Run All.
df = pd.DataFrame(
    np.random.default_rng(seed=0).random((2, 3)),
    index=['foo', 'bar'],
    columns=['a', 'b', 'c'],
)
df

Unnamed: 0,a,b,c
foo,0.784268,0.548543,0.908367
bar,0.648134,0.448845,0.346874


In [25]:
# 5. From a NumPy structured array
# Three zero-filled records with an 8-byte integer field 'A' and an
# 8-byte float field 'B'.
A = np.zeros(
    shape=3,
    dtype=np.dtype([('A', 'i8'), ('B', 'f8')]),
)
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [26]:
# Each named field of the structured array becomes a DataFrame column.
df = pd.DataFrame(data=A)
df

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## ```pandas.Index```

In [27]:
# Build an Index directly from a list of integers.
ind = pd.Index(
    data=[2, 3, 5, 7, 9],
)
ind

Int64Index([2, 3, 5, 7, 9], dtype='int64')

In [28]:
# An Index exposes the same size/shape/ndim/dtype attributes as a NumPy array.
print(f"{ind.size} {ind.shape} {ind.ndim} {ind.dtype}")

5 (5,) 1 int64


In [29]:
# An Index is immutable: item assignment raises TypeError.
# The error is caught and its message displayed so that the demonstration
# does not abort a top-to-bottom run of the notebook.
try:
    ind[0] = 1
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Index does not support mutable operations

In [30]:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 4, 8, 9])
# Set operations on Index objects.
# Intersection: use the named method. The `&` operator on Index objects was
# deprecated as a set operation and works element-wise in pandas >= 2.0,
# which would give a different result here.
indA.intersection(indB)

Int64Index([3, 9], dtype='int64')

In [31]:
# Union: use the named method. The `|` operator on Index objects was
# deprecated as a set operation and works element-wise in pandas >= 2.0.
indA.union(indB)

Int64Index([1, 2, 3, 4, 5, 7, 8, 9], dtype='int64')

In [32]:
# Symmetric difference (labels found in exactly one of the two indexes): use
# the named method. The `^` operator on Index objects was deprecated as a set
# operation and works element-wise in pandas >= 2.0.
indA.symmetric_difference(indB)

Int64Index([1, 2, 4, 5, 7, 8], dtype='int64')