In [49]:
import pandas as pd # 引入 pandas库

一个Series代表一个DataFrame的一列；更一般地说，它可以是任何一维的类数组对象。它同时包含：  
- 相同类型的值的序列。  
- 称为索引的数据标签序列。

In [50]:
# Build a Series from a plain list; with no index given, the rows are labelled 0..n-1.
s = pd.Series(['glm', 'golitter', 'kerwin', '田乐蒙'])
# Print, in order: the whole Series, its index labels, and its underlying values.
for view in (s, s.index, s.values):
    print(view)

0         glm
1    golitter
2      kerwin
3         田乐蒙
dtype: object
RangeIndex(start=0, stop=4, step=1)
['glm' 'golitter' 'kerwin' '田乐蒙']


使用`pd.Series([...], index=[...])`传递自定义索引

In [51]:
# Supply explicit row labels through `index=` instead of the default positions.
values = ['glm', 'golitter', 'kerwin', '田乐蒙']
s = pd.Series(values, index=['a', 'b', 'c', 'd'])
for view in (s, s.index):
    print(view)
# Relabel the rows by assigning a new list (one label per row) to `.index`.
s.index = ['first', 'second', 'third', 'fourth']
print("改变后的Series")
print(s)

a         glm
b    golitter
c      kerwin
d         田乐蒙
dtype: object
Index(['a', 'b', 'c', 'd'], dtype='object')
改变后的Series
first          glm
second    golitter
third       kerwin
fourth         田乐蒙
dtype: object


可以从Series中选择单个值或一组值。要做到这一点，有三种主要方法：
- 一个标签。
- 标签列表。
- 过滤条件。

In [52]:
# Label-based selection on a Series whose index is the labels 'a'..'d'.
s = pd.Series([4, -2, 0, 6], index=['a', 'b', 'c', 'd'])

# A single label returns the scalar stored under that label.
print("一个标签：")
print(s['a'])

# A list of labels (note the double brackets) returns a Series with just those rows.
print("\n多个标签：")
print(s[['a', 'c']])

# A label slice uses single brackets and includes both end labels ('b' through 'd').
print("\n切片：")
print(s['b':'d'])

# Corrected message: `s` is a Series, not a DataFrame. The point being shown is that
# a one-element label list still yields a Series rather than a scalar.
print("如果传给Series的标签列表只有一个值，返回的仍然是一个Series对象（而不是标量）。")
print(s[['b']])

一个标签：
4

多个标签：
a    4
c    0
dtype: int64

切片：
b   -2
c    0
d    6
dtype: int64
如果DataFrame对象内传递的列表只有一个值时，将返回一个Series对象。
b   -2
dtype: int64


In [53]:
print("\n\n 过滤")
# Filtering is a two-step idiom.
# Step 1: a comparison on the Series is applied element-wise and yields a boolean Series.
is_positive = s > 0
print(is_positive)
# Step 2: indexing the original Series with that boolean Series keeps only the True rows.
print("\n过滤后的Series")
print(s[is_positive])

print(s)
# Keep the even values: rows whose remainder modulo 2 is zero.
is_even = s % 2 == 0
even = s[is_even]
print("\n偶数")
print(even)



 过滤
a     True
b    False
c    False
d     True
dtype: bool

过滤后的Series
a    4
d    6
dtype: int64
a    4
b   -2
c    0
d    6
dtype: int64

偶数
a    4
b   -2
c    0
d    6
dtype: int64


通常，我们会把Series看作DataFrame中的一列来使用。我们可以把**DataFrame看作是一组共享同一索引的Series的集合。**   
有许多方法可以创建DataFrame。在这里，我们将介绍最流行的方法：  
- 从CSV文件。
- 使用列表和列名。
- 从字典。
- 从Series。

# 从CSV文件

In [54]:
# Read the CSV file data/elections.csv into a DataFrame.
elections = pd.read_csv('data/elections.csv')
# A bare expression on the last line of a cell makes the notebook display it.
elections

Unnamed: 0,Year,Candidate,Party,Popular vote,Result,%
0,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122
1,1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878
2,1828,Andrew Jackson,Democratic,642806,win,56.203927
3,1828,John Quincy Adams,National Republican,500897,loss,43.796073
4,1832,Andrew Jackson,Democratic,702735,win,54.574789
...,...,...,...,...,...,...
177,2016,Jill Stein,Green,1457226,loss,1.073699
178,2020,Joseph Biden,Democratic,81268924,win,51.311515
179,2020,Donald Trump,Republican,74216154,loss,46.858542
180,2020,Jo Jorgensen,Libertarian,1865724,loss,1.177979


Year 列将被用作索引列（而不是普通列）

In [55]:
# Re-read the same file, this time using the Year column as the row index via index_col.
elections = pd.read_csv('data/elections.csv', index_col='Year')
elections

Unnamed: 0_level_0,Candidate,Party,Popular vote,Result,%
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122
1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878
1828,Andrew Jackson,Democratic,642806,win,56.203927
1828,John Quincy Adams,National Republican,500897,loss,43.796073
1832,Andrew Jackson,Democratic,702735,win,54.574789
...,...,...,...,...,...
2016,Jill Stein,Green,1457226,loss,1.073699
2020,Joseph Biden,Democratic,81268924,win,51.311515
2020,Donald Trump,Republican,74216154,loss,46.858542
2020,Jo Jorgensen,Libertarian,1865724,loss,1.177979


使用`reset_index()`，重新设置索引

In [56]:
# Turn the Year index back into an ordinary column.
# NOTE(review): not idempotent. Running this cell a second time resets the default
# integer index as well, which adds an extra 'index' column.
elections.reset_index(inplace=True) # with inplace=True the existing elections DataFrame is modified directly; no new object is created.
elections

Unnamed: 0,Year,Candidate,Party,Popular vote,Result,%
0,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122
1,1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878
2,1828,Andrew Jackson,Democratic,642806,win,56.203927
3,1828,John Quincy Adams,National Republican,500897,loss,43.796073
4,1832,Andrew Jackson,Democratic,702735,win,54.574789
...,...,...,...,...,...,...
177,2016,Jill Stein,Green,1457226,loss,1.073699
178,2020,Joseph Biden,Democratic,81268924,win,51.311515
179,2020,Donald Trump,Republican,74216154,loss,46.858542
180,2020,Jo Jorgensen,Libertarian,1865724,loss,1.177979


使用`set_index()`设置索引列

In [57]:
# set_index returns a new DataFrame indexed by Party. The result is only displayed here,
# not assigned, so `elections` itself keeps Party as an ordinary column.
elections.set_index('Party')

Unnamed: 0_level_0,Year,Candidate,Popular vote,Result,%
Party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Democratic-Republican,1824,Andrew Jackson,151271,loss,57.210122
Democratic-Republican,1824,John Quincy Adams,113142,win,42.789878
Democratic,1828,Andrew Jackson,642806,win,56.203927
National Republican,1828,John Quincy Adams,500897,loss,43.796073
Democratic,1832,Andrew Jackson,702735,win,54.574789
...,...,...,...,...,...
Green,2016,Jill Stein,1457226,loss,1.073699
Democratic,2020,Joseph Biden,81268924,win,51.311515
Republican,2020,Donald Trump,74216154,loss,46.858542
Libertarian,2020,Jo Jorgensen,1865724,loss,1.177979


# 使用列表和列名

In [58]:
# One column: a flat list of values plus a one-element list of column names.
pd.DataFrame([1,2,3], columns=['column_name'])

Unnamed: 0,column_name
0,1
1,2
2,3


In [59]:
# Several columns: each inner list is one row, and `columns` names the fields in order.
pd.DataFrame([
    [1, 'glm'],
    [2, 'golitter'],
    [3, 'kerwin']
], columns=['id', 'name'])

Unnamed: 0,id,name
0,1,glm
1,2,golitter
2,3,kerwin


# 从字典

In [60]:
# Each dict key becomes a column name and its list supplies that column's values.
pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['glm', 'golitter', 'kerwin']
})

Unnamed: 0,id,name
0,1,glm
1,2,golitter
2,3,kerwin


# 从Series

In [61]:
# Two Series that carry the same row labels.
shared_labels = ['a', 'b', 'c']
s_a = pd.Series([1, 2, 3], index=shared_labels)
s_b = pd.Series([4, 5, 6], index=shared_labels)
# Each dict key becomes a column name; the Series supply the column values.
pd.DataFrame({'A': s_a, 'B': s_b})


Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [62]:
# Convert the Series into a one-column DataFrame; the unnamed Series gives a column labelled 0.
s_a.to_frame() # alternative spelling: pd.DataFrame(s_a)

Unnamed: 0,0
a,1
b,2
c,3


索引不必是数值型，也不必唯一；可以把某一列设置为索引，也可以恢复默认索引。

属性：Index, Columns, and Shape

In [63]:
# Make Party the index of `elections` itself; inplace=True modifies the existing DataFrame.
# NOTE(review): re-running this cell on its own raises KeyError, because Party is
# no longer a column once it has become the index.
elections.set_index("Party", inplace = True)
# .index holds the row labels.
elections.index

Index(['Democratic-Republican', 'Democratic-Republican', 'Democratic',
       'National Republican', 'Democratic', 'National Republican',
       'Anti-Masonic', 'Whig', 'Democratic', 'Whig',
       ...
       'Constitution', 'Republican', 'Independent', 'Libertarian',
       'Democratic', 'Green', 'Democratic', 'Republican', 'Libertarian',
       'Green'],
      dtype='object', name='Party', length=182)

In [64]:
# .columns holds the column labels; Party is absent because it is now the index.
elections.columns

Index(['Year', 'Candidate', 'Popular vote', 'Result', '%'], dtype='object')

要获取DataFrame的形状，可以使用`DataFrame.shape`，它返回一个元组：先是行数，后是列数：

In [65]:
# .shape is a (number of rows, number of columns) tuple.
elections.shape

(182, 5)