In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# 5.1.2 DataFrame

### 通过字典构建一个dataframe

In [2]:
# Column-oriented source data: each key becomes a column and the three
# equal-length lists supply that column's values row by row.
myDict = {
    'city': ['北京', '北京', '北京', '上海', '上海', '上海'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
print(myDict)

# No index is supplied, so pandas assigns the default 0..5 row labels.
df = pd.DataFrame(data=myDict)
df

{'city': ['北京', '北京', '北京', '上海', '上海', '上海'], 'year': [2000, 2001, 2002, 2001, 2002, 2003], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}


Unnamed: 0,city,pop,year
0,北京,1.5,2000
1,北京,1.7,2001
2,北京,3.6,2002
3,上海,2.4,2001
4,上海,2.9,2002
5,上海,3.2,2003


### 用head()方法会返回前5行,也可指定返回前多少行

In [3]:
df.head()  # first 5 rows by default; pass a count for more, e.g. df.head(10)

Unnamed: 0,city,pop,year
0,北京,1.5,2000
1,北京,1.7,2001
2,北京,3.6,2002
3,上海,2.4,2001
4,上海,2.9,2002


In [4]:
# Pass columns= to choose the order in which the columns appear.
pd.DataFrame(myDict, columns=['year', 'city', 'pop'])

Unnamed: 0,year,city,pop
0,2000,北京,1.5
1,2001,北京,1.7
2,2002,北京,3.6
3,2001,上海,2.4
4,2002,上海,2.9
5,2003,上海,3.2


In [5]:
# Listing a single name in columns= keeps only that column.
pd.DataFrame(myDict, columns=['pop'])

Unnamed: 0,pop
0,1.5
1,1.7
2,3.6
3,2.4
4,2.9
5,3.2


In [6]:
# A name that is not a key of myDict does not raise an error; the result
# here has that column but no rows.
pd.DataFrame(myDict, columns=['pop2'])

Unnamed: 0,pop2


#### 如果你导入一个不存在的列名，那么会显示为缺失数据：

In [7]:
# Display the source dict for reference.
myDict

{'city': ['北京', '北京', '北京', '上海', '上海', '上海'],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
 'year': [2000, 2001, 2002, 2001, 2002, 2003]}

In [8]:
# 'debt' and '2debt' are not keys of myDict, so those two columns are created
# and filled with NaN; index= supplies one label per row.
df2 = pd.DataFrame(
    data=myDict,
    index=['one', 'two', 'three', 'four', 'five', '6'],
    columns=['year', 'city', 'pop', 'debt', '2debt'],
)

In [9]:
# Display df2; 'debt' and '2debt' have no source data, so they show as missing.
df2

Unnamed: 0,year,city,pop,debt,2debt
one,2000,北京,1.5,,
two,2001,北京,1.7,,
three,2002,北京,3.6,,
four,2001,上海,2.4,,
five,2002,上海,2.9,,
6,2003,上海,3.2,,


In [10]:
# Column labels (an Index object)
print(type(df2.columns))
df2.columns

<class 'pandas.core.indexes.base.Index'>


Index(['year', 'city', 'pop', 'debt', '2debt'], dtype='object')

In [11]:
# Row labels (also an Index object)
print(type(df2.index))
df2.index

<class 'pandas.core.indexes.base.Index'>


Index(['one', 'two', 'three', 'four', 'five', '6'], dtype='object')

### 从DataFrame里提取一行或一列都会返回Series格式

#### 取一列数据, 可以以属性或是dict一样的形式来提取

In [12]:
df2['city']  # same result as attribute access: df2.city

one      北京
two      北京
three    北京
four     上海
five     上海
6        上海
Name: city, dtype: object

In [13]:
# Attribute-style access; only usable when the column name is a valid Python identifier.
df2.city

one      北京
two      北京
three    北京
four     上海
five     上海
6        上海
Name: city, dtype: object

注意：df2[column]能应对任何列名，但df2.column的情况下，列名必须是有效的python变量名才行。例如列名'2debt'以数字开头，写成df2.2debt会报语法错误，只能像下面这样用方括号来取：

In [14]:
# '2debt' starts with a digit, so df2.2debt would be a SyntaxError;
# use bracket indexing for such column names.
df2['2debt']

one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
6        NaN
Name: 2debt, dtype: object

#### 取一行数据，需要用loc属性按行标签（名字）来取；按整数位置取则用iloc：

In [15]:
# .loc selects a row by its index label.
oneRow = df2.loc['three']
print(type(oneRow))  # a single row comes back as a Series
oneRow

<class 'pandas.core.series.Series'>


year     2002
city       北京
pop       3.6
debt      NaN
2debt     NaN
Name: three, dtype: object

#### 列值也能通过赋值改变。同时改变所有的值

In [16]:
# A scalar assigned to a column is broadcast to every row.
df2['debt'] = 2.713
df2

Unnamed: 0,year,city,pop,debt,2debt
one,2000,北京,1.5,2.713,
two,2001,北京,1.7,2.713,
three,2002,北京,3.6,2.713,
four,2001,上海,2.4,2.713,
five,2002,上海,2.9,2.713,
6,2003,上海,3.2,2.713,


In [17]:
# Assigning an array replaces the column; its length must equal the number of
# rows (6 here) - neither shorter nor longer is accepted.
df2['debt'] = np.arange(6.)
# df2['debt'] = np.arange(4.) # length mismatch (too short)
# df2['debt'] = np.arange(9.) # length mismatch (too long)
df2

Unnamed: 0,year,city,pop,debt,2debt
one,2000,北京,1.5,0.0,
two,2001,北京,1.7,1.0,
three,2002,北京,3.6,2.0,
four,2001,上海,2.4,3.0,
five,2002,上海,2.9,4.0,
6,2003,上海,3.2,5.0,


In [18]:
# Side note: np.arange() accepts float arguments.
# NOTE: NumPy's documentation warns that with a non-integer step the output
# length can be inconsistent, and recommends np.linspace for such cases.
print(np.arange(3.4))
print(np.arange(2.2, 5.6, 1.1))

# myList = range(3.4) # ERROR: the built-in range() only accepts integers

[0. 1. 2. 3.]
[2.2 3.3 4.4 5.5]


如果把list或array赋给column的话，长度必须符合DataFrame的长度：

In [19]:
# Display the current state of df2.
df2

Unnamed: 0,year,city,pop,debt,2debt
one,2000,北京,1.5,0.0,
two,2001,北京,1.7,1.0,
three,2002,北京,3.6,2.0,
four,2001,上海,2.4,3.0,
five,2002,上海,2.9,4.0,
6,2003,上海,3.2,5.0,


In [20]:
# Assigning a Series aligns on index labels rather than on position.
ser = pd.Series([-1.2, -1.5, -1.7, 777], index=['two', 'four', 'five', 'seven'])
df2['debt'] = ser  # df2's index wins: 'seven'/777 is discarded, rows absent from ser become NaN
df2

Unnamed: 0,year,city,pop,debt,2debt
one,2000,北京,1.5,,
two,2001,北京,1.7,-1.2,
three,2002,北京,3.6,,
four,2001,上海,2.4,-1.5,
five,2002,上海,2.9,-1.7,
6,2003,上海,3.2,,


#### 如果列不存在，赋值会创建一个新列。而del也能像删除字典关键字一样，删除列：

In [21]:
# Assigning to a label that is not yet a column appends it as a new column.
# The values are a boolean mask: True where the city is 北京.
df2['newcolumn'] = df2['city'].eq('北京')
df2

Unnamed: 0,year,city,pop,debt,2debt,newcolumn
one,2000,北京,1.5,,,True
two,2001,北京,1.7,-1.2,,True
three,2002,北京,3.6,,,True
four,2001,上海,2.4,-1.5,,False
five,2002,上海,2.9,-1.7,,False
6,2003,上海,3.2,,,False


#### 删除列,  tip:删除不存在的列会报错

In [22]:
# Remove the two helper columns, one statement each. `del` raises KeyError
# when the column is absent, so this cell fails if it is run a second time.
del df2['newcolumn']
del df2['2debt']

In [23]:
# Confirm which columns remain.
df2.columns

Index(['year', 'city', 'pop', 'debt'], dtype='object')

### 另一种创建DataFrame的格式是dict中的dict, pandas会把外层dict的key当做列索引，内层key当做行索引：

In [24]:
# Nested mapping: outer keys are city names, inner keys are years.
# 上海 has no entry for the year 2000.
myDict2 = {
    '上海': {2001: 2.4, 2002: 2.9},
    '北京': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [25]:
# Outer keys become columns and inner keys become row labels;
# a (year, city) pair with no value is filled with NaN.
df3 = pd.DataFrame(myDict2)
df3

Unnamed: 0,上海,北京
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [26]:
# An explicit index fixes the row labels and their order; a label found in
# neither inner dict (2003) yields a row of NaN.
pd.DataFrame(myDict2, index=[2001, 2002, 2003, 2000])

Unnamed: 0,上海,北京
2001,2.4,1.7
2002,2.9,3.6
2003,,
2000,,1.5


#### DataFrame也可以像numpy数组一样做转置：

In [27]:
# Swap rows and columns: cities become the row labels, years the columns.
df3.transpose()

Unnamed: 0,2000,2001,2002
上海,,2.4,2.9
北京,1.5,1.7,3.6


### 用值为Series的字典构建DataFrame

In [28]:
# Two partial columns of df3, taken by position. The index labels are integers
# (years), so a bare [] slice is easy to misread as a label slice; .iloc makes
# it explicit that the bounds are positions.
print(df3['北京'].iloc[:-1])  # every row except the last
df3['上海'].iloc[:2]  # the first two rows

2000    1.5
2001    1.7
Name: 北京, dtype: float64


2000    NaN
2001    2.4
Name: 上海, dtype: float64

In [29]:
# Each value of myDict3 is a Series (a positional slice of a df3 column, taken
# with .iloc because the year labels are themselves integers). The DataFrame
# constructor aligns the Series on their index labels.
myDict3 = {'Ohio': df3['北京'].iloc[:-1],
         'Nevada': df3['上海'].iloc[:2]}
pd.DataFrame(myDict3)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


### DataFrame的index和column有自己的name属性

In [30]:
# The row index and the column index each carry their own name,
# which appears in the header of the rendered frame.
df3.index.name = 'year'
df3.columns.name = 'city'
df3

city,上海,北京
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


### DataFrame的values属性会返回二维数组：

In [31]:
# .values exposes the data as a 2-D NumPy array.
# NOTE: the pandas documentation recommends DataFrame.to_numpy() for new code.
df3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])