# 用 Groupby 做出结论

In [3]:
# Load `winequality_edited.csv`
import pandas as pd
df =pd.read_csv('winequality_edited.csv')

In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


### 是否有一种特定类型的酒具有较高质量？  
白葡萄酒

In [5]:
# 用 groupby 计算每个酒类型（红葡萄酒和白葡萄酒）的平均质量
df['quality'].groupby(df['color'])

<pandas.core.groupby.SeriesGroupBy object at 0x000002A69348FDA0>

#### 写法1：

In [6]:
df['quality'].groupby(df['color']).mean()

color
red      5.636023
white    5.877909
Name: quality, dtype: float64

#### 写法2：

In [7]:
df.groupby('color').mean()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
red,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
white,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909


根据color对每一列数据取均值

In [8]:
df.groupby('color').mean()['quality']

color
red      5.636023
white    5.877909
Name: quality, dtype: float64

### 哪个酸度水平的平均评分最高？

In [9]:
# 用 Pandas 描述功能查看最小、25%、50%、75% 和 最大 pH 值
df['pH'].describe()

count    6497.000000
mean        3.218501
std         0.160787
min         2.720000
25%         3.110000
50%         3.210000
75%         3.320000
max         4.010000
Name: pH, dtype: float64

酸度水平：  

    高: 最低 25% 时的 pH 值
    中等偏高: 25% - 50% 时的 pH 值
    中: 50% - 75% 时的 pH 值
    低: 最高 75% 时的 pH 值


In [10]:
# 对用于把数据“分割”成组的边缘进行分组
bin_edges = [2.72 ,3.11 ,3.21 ,3.32 ,4.01 ] # 用刚才计算的五个值填充这个列表

In [11]:
# 四个酸度水平组的标签
bin_names = [ 'High','Moderately High', 'Medium','Low' ]  # 对每个酸度水平类别进行命名

In [12]:
# 创建 acidity_levels 列
df['acidity_levels'] = pd.cut(df['pH'], bin_edges, labels=bin_names)

# 检查该列是否成功创建
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,acidity_levels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,Moderately High
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,Medium
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,Moderately High
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low


+ 使用**groupby** 对酸度水平进行分组，并计算平均值  
*（以下使用两种方式实现）*


In [15]:
# 用 groupby 计算每个酸度水平的平均质量
df.groupby('acidity_levels')['quality'].mean()

acidity_levels
High               5.783343
Moderately High    5.784540
Medium             5.850832
Low                5.859593
Name: quality, dtype: float64

In [18]:
df.groupby('acidity_levels').mean()['quality']

acidity_levels
High               5.783343
Moderately High    5.784540
Medium             5.850832
Low                5.859593
Name: quality, dtype: float64

### 得出结论：哪个水平的酸度获得最高的平均评级  
Low

In [19]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,acidity_levels
0,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000,5,red,Low
1,7.8,0.880,0.00,2.60,0.098,25.0,67.0,0.99680,3.20,0.68,9.800000,5,red,Moderately High
2,7.8,0.760,0.04,2.30,0.092,15.0,54.0,0.99700,3.26,0.65,9.800000,5,red,Medium
3,11.2,0.280,0.56,1.90,0.075,17.0,60.0,0.99800,3.16,0.58,9.800000,6,red,Moderately High
4,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000,5,red,Low
5,7.4,0.660,0.00,1.80,0.075,13.0,40.0,0.99780,3.51,0.56,9.400000,5,red,Low
6,7.9,0.600,0.06,1.60,0.069,15.0,59.0,0.99640,3.30,0.46,9.400000,5,red,Medium
7,7.3,0.650,0.00,1.20,0.065,15.0,21.0,0.99460,3.39,0.47,10.000000,7,red,Low
8,7.8,0.580,0.02,2.00,0.073,9.0,18.0,0.99680,3.36,0.57,9.500000,7,red,Low
9,7.5,0.500,0.36,6.10,0.071,17.0,102.0,0.99780,3.35,0.80,10.500000,5,red,Low


In [20]:
# 保存更改，供下一段使用
df.to_csv('winequality_edited_addcut.csv', index=False)

参考：https://github.com/evaseemefly/DAND-QA/blob/master/Intro-DataAnalysis/L2-CaseStudy1/conclusions_groupby-zh.ipynb