# DataFrame

In [2]:
import numpy as np
import pandas as pd

## 创建

* 一维数组创建的是表中的列

In [11]:
# A flat list becomes a single column; row and column labels default to 0, 1, 2, ...
df = pd.DataFrame(data=[1, 2, 3])
df

Unnamed: 0,0
0,1
1,2
2,3


* 二维数组创建的还是二维表

In [4]:
# A nested list stays two-dimensional: each inner list is one row of the table.
df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


* 设置index

In [6]:
# Same 3x3 data, but with explicit row labels 1..3 instead of the default 0..2.
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    index=[1, 2, 3],
)
df

Unnamed: 0,0,1,2
1,1,2,3
2,4,5,6
3,7,8,9


* 设置列名

In [7]:
# Same 3x3 data, but with named columns; row labels keep the default 0..2.
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=['a', 'b', 'c'],
)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


* 字典创建的表，key是列名，value是列。

In [8]:
# Building from a mapping: each key names a column and its list holds that column's values.
df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]))
df

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


* 更改index

In [9]:
# Assigning to .index relabels the rows of the existing frame (0..2 becomes 1..3).
df.index = [1,2,3]
df

Unnamed: 0,a,b,c
1,1,4,7
2,2,5,8
3,3,6,9


## 向量化操作

In [14]:
# Left operand for the arithmetic demos: one column, default labels 0..2.
df1 = pd.DataFrame(data=[1, 2, 3])
df1

Unnamed: 0,0
0,1
1,2
2,3


In [20]:
# Right operand: identical data and labels to df1 at this point.
df2 = pd.DataFrame(data=[1, 2, 3])
df2

Unnamed: 0,0
0,1
1,2
2,3


In [21]:
# Both frames carry the same row and column labels, so every cell is
# subtracted from its counterpart and the result is all zeros.
df1-df2

Unnamed: 0,0
0,0
1,0
2,0


In [25]:
# Relabel df2's rows as 1..3 so they only partly overlap df1's labels 0..2.
df2.index = [1,2,3]
df2

Unnamed: 0,0
1,1
2,2
3,3


In [24]:
# Rows are matched by index label, not by position: labels found in only
# one frame (0 and 3) come out as NaN.
df1-df2

Unnamed: 0,0
0,
1,1.0
2,1.0
3,


In [18]:
# Rename df2's only column from 0 to 1 so it no longer matches df1's column label.
df2.columns = [1]
df2

Unnamed: 0,1
1,1
2,2
3,3


In [19]:
# The frames no longer share any column label, so the result holds both
# columns (0 and 1) and every cell is NaN.
df1-df2

Unnamed: 0,0,1
0,,
1,,
2,,
3,,


**`总结：`向量化操作时，按 index 和 columns 的标签对齐（而不是按位置）；只在一方出现的标签，结果为 NaN。**

## 数据的统计

In [27]:
df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Per-column summary (count, mean, std, min, quartiles, max) for a quick overview of the data.
df.describe()
# Individual statistics are also available as methods:
#   var()     variance
#   std()     standard deviation
#   mean()    mean
#   median()  median
#   min()     minimum
#   max()     maximum

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


* 显示前面几行数据

In [30]:
# Show only the first two rows.
df.head(2)

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


* 显示后面几行数据

In [31]:
# Show only the last two rows.
df.tail(2)

Unnamed: 0,0,1,2
1,4,5,6
2,7,8,9


## 索引

In [33]:
# Fresh 3x3 frame with default 0..2 labels on both axes for the indexing demos.
df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


* 顺序（位置）索引 `iloc`，切片是左闭右开区间 [start, end)，不包含 end

In [35]:
# Positional slicing excludes the end bound: :2 picks the rows at positions 0 and 1,
# and :3 picks the columns at positions 0, 1 and 2.
df.iloc[:2, :3]

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


* 名称（标签）索引 `loc`，切片是闭区间 [start, end]，两端都包含

In [37]:
# Label slicing includes the end bound: :1 keeps row labels 0 and 1,
# and :2 keeps column labels 0, 1 and 2.
df.loc[:1, :2]

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [40]:
# Work on an independent copy: a plain `df1 = df` only binds a second name to
# the same object, so relabeling df1 would silently relabel df as well.
df1 = df.copy()
# Use labels that differ from the positions (rows 4..6, columns 7..9) so the
# label-based lookups below cannot be confused with positional ones.
df1.index = [4, 5, 6]
df1.columns = [7, 8, 9]
df1

Unnamed: 0,7,8,9
4,1,2,3
5,4,5,6
6,7,8,9


In [41]:
# Label slices run up to and including row label 5 and column label 8.
df1.loc[:5, :8]

Unnamed: 0,7,8
4,1,2
5,4,5


In [42]:
# Lists of labels select exactly those rows (5, 6) and columns (8, 9).
df1.loc[[5,6], [8,9]]

Unnamed: 0,8,9
5,5,6
6,8,9


* 索引可重复

In [44]:
# Row labels need not be unique: label 1 is attached to two rows here.
df = pd.DataFrame(data=[1, 2, 3, 4], index=[0, 1, 1, 2])
df

Unnamed: 0,0
0,1
1,2
1,3
2,4


In [45]:
# Selecting a duplicated label returns every row that carries it (two rows here).
df.loc[1, :]

Unnamed: 0,0
1,2
1,3
