In [126]:
import pandas as pd

# 基本数据结构
## DataFrame

<img alt="DataFrame基本结构" src="https://pic.liuzaoqi.com/picgo/202112111201158.png"  style="zoom: 25%;" />

In [127]:
# Row labels: six consecutive calendar days beginning 2013-01-01.
dates = pd.date_range("20130101", periods=6)

# Demo frame with three float columns (A, B, D) and one string column (C),
# indexed by the dates above; the keyword order fixes the column order.
df = pd.DataFrame(
    data=dict(
        A=[0.342275, -0.010251, -0.344072, -0.235446, 3.074955, -0.039975],
        B=[-0.333060, -0.322083, -1.185725, -1.721794, 1.848873, 1.090794],
        C=["male", "male", "female", "male", "female", "female"],
        D=[1.808311, -0.960891, -0.716058, 0.242253, -0.795627, -1.111459],
    ),
    index=dates,
)
df  # bare last expression so the notebook renders the frame

Unnamed: 0,A,B,C,D
2013-01-01,0.342275,-0.33306,male,1.808311
2013-01-02,-0.010251,-0.322083,male,-0.960891
2013-01-03,-0.344072,-1.185725,female,-0.716058
2013-01-04,-0.235446,-1.721794,male,0.242253
2013-01-05,3.074955,1.848873,female,-0.795627
2013-01-06,-0.039975,1.090794,female,-1.111459


## Series

- DataFrame 的每一列是一个 Series。(PS：每一行数据也可以看作是一个 Series。)

- Series是一种类似于一维数组的结构，它可以包含任何数据类型的数据。

- Series可以被索引、切片、合并、重塑等操作。

- Series可以被看做是一维的DataFrame。

- **Series 没有列标签，只有行标签，因为它只是 DataFrame 的一列。**

<img alt="Series基本结构" src="https://pic.liuzaoqi.com/picgo/202112111125570.png"  style="zoom: 75%;" />

In [128]:
# Selecting a single column with [] yields a Series; printing it shows the
# date index together with the column's name and dtype.
print(df["B"])

# Any single column, including the string column C, is a pandas Series.
print(type(df["C"]))

# A single row taken by position is also a Series, labelled by the column names.
first_row_series = df.iloc[0]
print(first_row_series)
print(type(first_row_series))

2013-01-01   -0.333060
2013-01-02   -0.322083
2013-01-03   -1.185725
2013-01-04   -1.721794
2013-01-05    1.848873
2013-01-06    1.090794
Freq: D, Name: B, dtype: float64
<class 'pandas.core.series.Series'>
A    0.342275
B    -0.33306
C        male
D    1.808311
Name: 2013-01-01 00:00:00, dtype: object
<class 'pandas.core.series.Series'>


# 利用 DataFrame 或 Series 进行分析举例

- pandas提供了很多功能，每个功能都是可以应用于DataFrame或Series的方法。如max() 、min() 、mean() 等

- 大部分 pandas 操作都会返回一个DataFrame或一个Series。例如如果对数据表的数值数据的一些基本统计感兴趣，可以使用 describe() 来得到一个新的 dataframe

In [129]:
# Largest value in column B (a Series method returning a scalar).
print(df["B"].max())

# dtype of every column: A, B and D are float64, C is object (strings).
print(df.dtypes)

# Summary statistics as a new DataFrame; only the numeric columns are summarised.
df.describe()

1.848873
A    float64
B    float64
C     object
D    float64
dtype: object


Unnamed: 0,A,B,D
count,6.0,6.0,6.0
mean,0.464581,-0.103832,-0.255578
std,1.300232,1.351197,1.117244
min,-0.344072,-1.721794,-1.111459
25%,-0.186578,-0.972559,-0.919575
50%,-0.025113,-0.327572,-0.755842
75%,0.254143,0.737575,0.002675
max,3.074955,1.848873,1.808311


# 利用 Pandas 进行数据分析的基本流程
1. 导入 Pandas 库
2. 读取或创建 DataFrame
3. 使用不同的 pandas 函数进行分析

# Pandas 数据查看

In [130]:
# Look at the top of the data: head() with no argument returns the first 5 rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.342275,-0.33306,male,1.808311
2013-01-02,-0.010251,-0.322083,male,-0.960891
2013-01-03,-0.344072,-1.185725,female,-0.716058
2013-01-04,-0.235446,-1.721794,male,0.242253
2013-01-05,3.074955,1.848873,female,-0.795627


In [131]:
# Look at the bottom of the data: the last 2 rows.
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,3.074955,1.848873,female,-0.795627
2013-01-06,-0.039975,1.090794,female,-1.111459


In [132]:
# Inspect the row index of the DataFrame (here a DatetimeIndex with daily frequency).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [133]:
# Inspect the column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [134]:
# Inspect the underlying values as a NumPy array; with the string column C
# mixed in with the float columns, the array's dtype is object.
df.values

array([[0.342275, -0.33306, 'male', 1.808311],
       [-0.010251, -0.322083, 'male', -0.960891],
       [-0.344072, -1.185725, 'female', -0.716058],
       [-0.235446, -1.721794, 'male', 0.242253],
       [3.074955, 1.848873, 'female', -0.795627],
       [-0.039975, 1.090794, 'female', -1.111459]], dtype=object)

In [135]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# only the numeric columns A, B and D appear in the result.
df.describe()

Unnamed: 0,A,B,D
count,6.0,6.0,6.0
mean,0.464581,-0.103832,-0.255578
std,1.300232,1.351197,1.117244
min,-0.344072,-1.721794,-1.111459
25%,-0.186578,-0.972559,-0.919575
50%,-0.025113,-0.327572,-0.755842
75%,0.254143,0.737575,0.002675
max,3.074955,1.848873,1.808311


In [136]:
# Transpose the data: the column labels become the row index and the dates become columns.

df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.342275,-0.010251,-0.344072,-0.235446,3.074955,-0.039975
B,-0.33306,-0.322083,-1.185725,-1.721794,1.848873,1.090794
C,male,male,female,male,female,female
D,1.808311,-0.960891,-0.716058,0.242253,-0.795627,-1.111459


In [137]:
# Sort by index labels (not by values).
# axis=0 sorts the rows: the whole DataFrame is ordered by its row index (the dates).
# axis=1 sorts the columns: the whole DataFrame is ordered by its column labels
# (i.e. the column order changes). ascending=False gives descending order: D, C, B, A.
print(df)  # original frame, printed for comparison with the sorted result below
df.sort_index(axis=1, ascending=False)

                   A         B       C         D
2013-01-01  0.342275 -0.333060    male  1.808311
2013-01-02 -0.010251 -0.322083    male -0.960891
2013-01-03 -0.344072 -1.185725  female -0.716058
2013-01-04 -0.235446 -1.721794    male  0.242253
2013-01-05  3.074955  1.848873  female -0.795627
2013-01-06 -0.039975  1.090794  female -1.111459


Unnamed: 0,D,C,B,A
2013-01-01,1.808311,male,-0.33306,0.342275
2013-01-02,-0.960891,male,-0.322083,-0.010251
2013-01-03,-0.716058,female,-1.185725,-0.344072
2013-01-04,0.242253,male,-1.721794,-0.235446
2013-01-05,-0.795627,female,1.848873,3.074955
2013-01-06,-1.111459,female,1.090794,-0.039975


In [138]:
# Sort the rows by the values in column B (ascending, as the output shows);
# `by` names the column whose values drive the ordering, not a sort of column names.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-04,-0.235446,-1.721794,male,0.242253
2013-01-03,-0.344072,-1.185725,female,-0.716058
2013-01-01,0.342275,-0.33306,male,1.808311
2013-01-02,-0.010251,-0.322083,male,-0.960891
2013-01-06,-0.039975,1.090794,female,-1.111459
2013-01-05,3.074955,1.848873,female,-0.795627


# 数据选取

官方建议使用优化的 pandas 数据访问方法 .at，.iat，.loc 和 .iloc

## 使用[]选取数据

使用[]选取数据时，如果索引不存在，会报错。

### 选取单列数据:

In [139]:
df["A"]  # select a single column as a Series; equivalent to df.A

2013-01-01    0.342275
2013-01-02   -0.010251
2013-01-03   -0.344072
2013-01-04   -0.235446
2013-01-05    3.074955
2013-01-06   -0.039975
Freq: D, Name: A, dtype: float64

### 按行选取数据

- 使用[行号:行号] 左闭右开的形式，左边是起始行号，右边是结束行号，不包括右边的行号。
- 按行选取数据,使用index的值进行选取，['index':'index'] 闭区间（左右两端的标签都包含在结果中）


In [140]:
# Positional row slice with []: half-open [start, stop) - the start row is
# included and the stop row is excluded.
# The stop value 799 lies far past the end of this 6-row frame; as the output
# shows, the slice simply runs to the last row instead of raising an error.
df[
    1:799
]

Unnamed: 0,A,B,C,D
2013-01-02,-0.010251,-0.322083,male,-0.960891
2013-01-03,-0.344072,-1.185725,female,-0.716058
2013-01-04,-0.235446,-1.721794,male,0.242253
2013-01-05,3.074955,1.848873,female,-0.795627
2013-01-06,-0.039975,1.090794,female,-1.111459


In [141]:
# Row slice by index label with []: unlike positional slices, label slices
# include both endpoints (the start label 2013-01-04 appears in the output).
# The stop label 2013-01-10 is later than the last index entry, so the result
# runs through the final row, 2013-01-06.
df[
    "20130104":"20130110"
]

Unnamed: 0,A,B,C,D
2013-01-04,-0.235446,-1.721794,male,0.242253
2013-01-05,3.074955,1.848873,female,-0.795627
2013-01-06,-0.039975,1.090794,female,-1.111459


## 通过标签(行标签index,列名column)选取数据

#### 通过行标签即index选取数据

In [146]:
# Select one row by its index label (dates[5] is 2013-01-06); the result is a Series.
df.loc[dates[5]]

A   -0.039975
B    1.090794
C      female
D   -1.111459
Name: 2013-01-06 00:00:00, dtype: object

#### 通过列标签即列名选取数据

In [148]:
# Select every row (:) and only the columns labelled A and B.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
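2013-01-01,0.342275,-0.33306
2013-01-02,-0.010251,-0.322083
2013-01-03,-0.344072,-1.185725
2013-01-04,-0.235446,-1.721794
2013-01-05,3.074955,1.848873
2013-01-06,-0.039975,1.090794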


#### 通过行列标签选取数据

In [149]:
# Label slice on the rows plus a list of column labels; the row slice includes
# both endpoints, so rows 2013-01-02 through 2013-01-04 are returned.
df.loc[dates[1] : dates[3], ["A", "C"]]

Unnamed: 0,A,C
2013-01-02,-0.010251,male
2013-01-03,-0.344072,female
2013-01-04,-0.235446,male


In [156]:
# loc: select a single cell (one row, one column) - the value of column A
# in the row whose index label is dates[3] (2013-01-04).
df.loc[dates[3], "A"]

np.float64(-0.235446)

In [155]:
# at: select a single cell (one row, one column) - the value of column A
# in the row whose index label is dates[3] (2013-01-04).
df.at[dates[3], "A"]

np.float64(-0.235446)

## 通过位置选取数据iloc

In [157]:
df.iloc[3]  # the 4th row (position 3), returned as a Series

A   -0.235446
B   -1.721794
C        male
D    0.242253
Name: 2013-01-04 00:00:00, dtype: object

In [159]:
df.iloc[3:5, 0:2]  # rows 4-5 (positions 3, 4) and columns 1-2 (A, B); stop positions are excluded

Unnamed: 0,A,B
2013-01-04,-0.235446,-1.721794
2013-01-05,3.074955,1.848873


In [163]:
df.iloc[[1, 2, 4], [0, 2]]  # rows 2, 3 and 5 with columns 1 and 3 (A and C)

Unnamed: 0,A,C
2013-01-02,-0.010251,male
2013-01-03,-0.344072,female
2013-01-05,3.074955,female


In [162]:
df.iloc[1:3]    # rows 2-3 (positions 1 and 2); the stop position 3 is excluded

Unnamed: 0,A,B,C,D
2013-01-02,-0.010251,-0.322083,male,-0.960891
2013-01-03,-0.344072,-1.185725,female,-0.716058


In [167]:
df.iloc[1, 1]  # the value in the 2nd row, 2nd column

np.float64(-0.322083)

## 使用布尔索引选取数据

In [168]:
df[df.A>0]  # keep only the rows where column A is greater than 0

Unnamed: 0,A,B,C,D
2013-01-01,0.342275,-0.33306,male,1.808311
2013-01-05,3.074955,1.848873,female,-0.795627


In [171]:
# Boolean-mask the whole frame, keeping only the elements greater than 0.
# Column C holds strings, so the element-wise comparison `df > 0` raises a
# TypeError. That error is the point of this cell, so it is caught and printed
# rather than left to abort a "Restart & Run All".
# To apply the mask successfully, restrict the frame to its numeric columns
# first, e.g. df.select_dtypes(include="number").
try:
    df[df > 0]
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: '>' not supported between instances of 'str' and 'int'