# Pandas 学习笔记之统计篇

* 版本号： 0.1
* 创建时间： 2022年01月13日
* 修改时间： 2022年01月13日
* 数据来源：
    * movies.csv：<http://boxofficemojo.com/daily/>
    * iris.csv：<https://github.com/dsaber/py-viz-blog>
    * titanic.csv：<https://github.com/dsaber/py-viz-blog>
    * ts.csv：<https://github.com/dsaber/py-viz-blog>

## 一些准备工作

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
# Helper functions
def get_movie_df():
    """
    Load the movie data set from ``datas/movies.csv`` as a DataFrame.

    The file is read as tab-separated UTF-8 text with ',' treated as the
    thousands separator. '$' is registered as the escape character --
    presumably so the dollar sign in front of currency amounts is swallowed
    and those columns parse as numbers (TODO confirm against the file).
    """
    read_options = dict(sep='\t', encoding='utf-8', thousands=',', escapechar='$')
    return pd.read_csv('datas/movies.csv', **read_options)

def get_titanic_df():
    """Read ``datas/titanic.csv`` into a DataFrame using pandas' default CSV settings."""
    titanic_path = 'datas/titanic.csv'
    return pd.read_csv(titanic_path)

def get_iris_df():
    """Read ``datas/iris.csv`` into a DataFrame using pandas' default CSV settings."""
    iris_path = 'datas/iris.csv'
    return pd.read_csv(iris_path)

def get_random_df(seed=None):
    """
    Build a 6x4 DataFrame of standard-normal random values.

    Rows are indexed by six consecutive days starting 2020-01-01 and the
    columns are labelled 'A', 'B', 'C', 'D'.

    Parameters
    ----------
    seed : int, optional
        When given, the values are drawn from a dedicated
        ``np.random.default_rng(seed)`` generator, so the same seed always
        yields the same frame. When omitted (the default), the values come
        from NumPy's global ``np.random.randn`` exactly as before, so
        existing calls keep their behaviour.

    Returns
    -------
    pandas.DataFrame
        Frame of shape (6, 4) with a daily DatetimeIndex.
    """
    shape = (6, 4)
    if seed is None:
        # Legacy path: global RNG state, still controllable via np.random.seed().
        values = np.random.randn(*shape)
    else:
        # Reproducible path: an isolated generator that leaves global state untouched.
        values = np.random.default_rng(seed).standard_normal(shape)
    return pd.DataFrame(
        values,
        index=pd.date_range('20200101', periods=6),
        columns=list('ABCD'))

## 总体描述

In [2]:
# Load the Titanic data and summarise it; by default describe() only covers
# the numeric columns.
df = get_titanic_df()
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
# Overall summary covering columns of every dtype, not just the numeric ones.
df.describe(include="all")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889,891,891,891,203,889,891,891
unique,,,2,,,,,3,3,3,2,7,3,2,2
top,,,male,,,,,S,Third,man,True,C,Southampton,no,True
freq,,,577,,,,,644,491,537,537,59,644,549,537
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,,,,,,,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,,,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,,,,,,,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,,,,,,,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,,,,,,,,


In [4]:
# Print a concise summary of the frame: index range, each column's non-null
# count and dtype, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


## 计数

### 分组计数 DataFrame.value_counts

定义：

```python
DataFrame.value_counts(
    self,
    subset: 'Sequence[Hashable] | None' = None,
    normalize: 'bool' = False,
    sort: 'bool' = True,
    ascending: 'bool' = False,
    dropna: 'bool' = True,
)
```

功能：

    返回一个 Series，其内容为 DataFrame 中每种唯一行（即所选各列取值的唯一组合）出现的次数。

#### 针对某两列分组计数

In [5]:
# Select the two columns first, then count how often each (deck, sex) pair occurs.
df.loc[:, ['deck', 'sex']].value_counts()

deck  sex   
C     male      32
B     female    27
C     female    27
B     male      20
D     female    18
E     male      17
D     male      15
E     female    15
A     male      14
F     male       8
      female     5
G     female     4
A     female     1
dtype: int64

In [6]:
# Count each (deck, sex) combination by naming the columns through `subset`.
df.value_counts(subset=['deck', 'sex'])

deck  sex   
C     male      32
B     female    27
C     female    27
B     male      20
D     female    18
E     male      17
D     male      15
E     female    15
A     male      14
F     male       8
      female     5
G     female     4
A     female     1
dtype: int64

#### 分组计数显示比率并从小到大排列

In [7]:
# Report each deck's share of the rows (normalize=True) instead of raw counts,
# ordered from the smallest share to the largest.
df.value_counts(subset=['deck'], normalize=True, ascending=True)

deck
G       0.019704
F       0.064039
A       0.073892
E       0.157635
D       0.162562
B       0.231527
C       0.290640
dtype: float64

#### 分组计数默认不统计空值

In [8]:
# Count rows per age value; rows with a missing age are left out by default
# (dropna defaults to True).
df.value_counts(subset=['age'])

age  
24.00    30
22.00    27
18.00    26
30.00    25
28.00    25
         ..
20.50     1
14.50     1
12.00     1
0.92      1
80.00     1
Length: 88, dtype: int64

In [9]:
# Count rows per age value and keep the missing ages as their own NaN entry.
df.value_counts(subset=['age'], dropna=False)

age  
NaN      177
24.00     30
22.00     27
18.00     26
30.00     25
        ... 
24.50      1
0.67       1
0.92       1
36.50      1
0.42       1
Length: 89, dtype: int64

### 分组计数 DataFrame.groupby

In [10]:
# Number of rows in each deck group.
df.groupby(by='deck').size()

deck
A    15
B    47
C    59
D    33
E    32
F    13
G     4
dtype: int64

In [11]:
# count() reports the non-null values per column within each deck group, so a
# column with missing values (such as age) shows smaller numbers than the
# group sizes.
df.groupby(by='deck').count()

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A,15,15,15,12,15,15,15,15,15,15,15,15,15,15
B,47,47,47,45,47,47,47,45,47,47,47,45,47,47
C,59,59,59,51,59,59,59,59,59,59,59,59,59,59
D,33,33,33,31,33,33,33,33,33,33,33,33,33,33
E,32,32,32,30,32,32,32,32,32,32,32,32,32,32
F,13,13,13,11,13,13,13,13,13,13,13,13,13,13
G,4,4,4,4,4,4,4,4,4,4,4,4,4,4


In [12]:
# Non-null age values per deck; rows whose age is missing are not counted, so
# these numbers can be lower than the group sizes.
df.groupby(by='deck')['age'].count()

deck
A    12
B    45
C    51
D    31
E    30
F    11
G     4
Name: age, dtype: int64

## 合计

In [13]:
# Small hand-written score sheet, listed one record per row and then
# transposed into the column -> values mapping used to build the frame.
score_columns = ['序号', '姓名', '性别', '数学', '语文']
score_rows = [
    ('001', '小明', '男', 100, 97),
    ('002', '小王', '男', 95, 88),
    ('003', '小李', '男', 99, 89),
    ('004', '小明', '女', 66, 76),
]
data = {
    column: list(values)
    for column, values in zip(score_columns, zip(*score_rows))
}
df = DataFrame(data)
df

Unnamed: 0,序号,姓名,性别,数学,语文
0,1,小明,男,100,97
1,2,小王,男,95,88
2,3,小李,男,99,89
3,4,小明,女,66,76


In [14]:
# Compute score totals.
subjects = ['数学', '语文']
# Sum only the student rows: if this cell is re-run, the previously appended
# "科目总成绩" row must not be folded into the totals again.
student_scores = df.drop(index="科目总成绩", errors="ignore")[subjects]
# Per-student total: add up the subject columns of each row. The built-in
# sum(axis=1) replaces apply(lambda x: x.sum(), axis=1) with the same result.
df["个人总成绩"] = student_scores.sum(axis=1)
# Per-subject total: add up each subject column and store the result as a new
# row labelled "科目总成绩"; the columns that are not summed are NaN in that row.
df.loc["科目总成绩"] = student_scores.sum(axis=0)
df

Unnamed: 0,序号,姓名,性别,数学,语文,个人总成绩
0,1.0,小明,男,100.0,97.0,197.0
1,2.0,小王,男,95.0,88.0,183.0
2,3.0,小李,男,99.0,89.0,188.0
3,4.0,小明,女,66.0,76.0,142.0
科目总成绩,,,,360.0,350.0,


## 统计：计数，平均，最大，最小，方差，标准差，同比，环比