# 用查询功能做出结论

In [4]:
import pandas as pd


In [5]:
# Load `winequality_edited_addcut.csv` into a DataFrame.

df =pd.read_csv('winequality_edited_addcut.csv')

### 酒精含量高的酒是否评分较高？

In [6]:

# Summarize the frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 14 columns):
fixed acidity           6497 non-null float64
volatile acidity        6497 non-null float64
citric acid             6497 non-null float64
residual sugar          6497 non-null float64
chlorides               6497 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
color                   6497 non-null object
acidity_levels          6496 non-null object
dtypes: float64(11), int64(1), object(2)
memory usage: 710.7+ KB


In [7]:
# Get the median alcohol content from the summary statistics.
# The median (the 50% row) is 10.30.
df['alcohol'].describe()

count    6497.000000
mean       10.491801
std         1.192712
min         8.000000
25%         9.500000
50%        10.300000
75%        11.300000
max        14.900000
Name: alcohol, dtype: float64

In [8]:
# Select the samples whose alcohol content is below the median.
# The threshold is computed from the data instead of hard-coding 10.30, so the
# split stays correct if the dataset changes.
alcohol_median = df['alcohol'].median()
low_alcohol = df.query('alcohol < @alcohol_median')
# Preview only the first rows rather than rendering the whole subset.
low_alcohol.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,acidity_levels
0,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000,5,red,Low
1,7.8,0.880,0.00,2.60,0.098,25.0,67.0,0.99680,3.20,0.68,9.800000,5,red,Moderately High
2,7.8,0.760,0.04,2.30,0.092,15.0,54.0,0.99700,3.26,0.65,9.800000,5,red,Medium
3,11.2,0.280,0.56,1.90,0.075,17.0,60.0,0.99800,3.16,0.58,9.800000,6,red,Moderately High
4,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.400000,5,red,Low
5,7.4,0.660,0.00,1.80,0.075,13.0,40.0,0.99780,3.51,0.56,9.400000,5,red,Low
6,7.9,0.600,0.06,1.60,0.069,15.0,59.0,0.99640,3.30,0.46,9.400000,5,red,Medium
7,7.3,0.650,0.00,1.20,0.065,15.0,21.0,0.99460,3.39,0.47,10.000000,7,red,Low
8,7.8,0.580,0.02,2.00,0.073,9.0,18.0,0.99680,3.36,0.57,9.500000,7,red,Low
10,6.7,0.580,0.08,1.80,0.097,15.0,65.0,0.99590,3.28,0.54,9.200000,5,red,Medium


In [10]:
# Select the samples whose alcohol content is at or above the median.
# The threshold is computed from the data instead of hard-coding 10.30.
alcohol_median = df['alcohol'].median()
high_alcohol = df.query('alcohol >= @alcohol_median')

# Make sure every sample appears in exactly one of the two groups:
# the groups must not share any row, and together they must cover the frame.
num_samples = df.shape[0]
assert low_alcohol.index.intersection(high_alcohol.index).empty
# Row counts (len) are used rather than a column's non-null count, so missing
# values cannot hide a dropped row.
num_samples == len(low_alcohol) + len(high_alcohol)  # should be True

True

In [11]:
# Get the mean quality rating for the low- and high-alcohol groups.
# This cell shows the high-alcohol group.
high_alcohol['quality'].mean()

6.146084337349397

In [12]:
# Mean quality rating of the low-alcohol group.
low_alcohol['quality'].mean()

5.475920679886686

### 口感较甜的酒是否评分较高？

In [18]:
# Get the median residual sugar from the summary statistics.
# The median (the 50% row) is 3.000000.
df['residual sugar'].describe()

count    6497.000000
mean        5.443235
std         4.757804
min         0.600000
25%         1.800000
50%         3.000000
75%         8.100000
max        65.800000
Name: residual sugar, dtype: float64

In [20]:
# List the column names; note that several of them contain spaces.
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'color', 'acidity_levels'],
      dtype='object')

In [24]:
# Rename 'residual sugar' so the column can be written as a plain identifier
# inside query expressions. The result is rebound to `df` instead of using
# inplace=True, which would mutate the frame object earlier cells displayed.
df = df.rename(columns={'residual sugar': 'residual_sugar'})

In [26]:
# Preview the first rows to confirm the column is now named residual_sugar.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual_sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,acidity_levels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,Moderately High
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,Medium
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,Moderately High
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low


* 注意：在 `query` 表达式中不能直接书写包含 **空格** 的列名（会导致查询失败），因此这里先将空格替换为下划线 `_`；pandas 0.25 及以上版本也可以用反引号引用这类列名，例如 ``df.query('`residual sugar` < 3')``。

In [27]:
# Select the samples whose residual sugar is below the median.
# The threshold is computed from the data instead of hard-coding 3.
sugar_median = df['residual_sugar'].median()
low_sugar = df.query('residual_sugar < @sugar_median')

In [28]:
# Select the samples whose residual sugar is at or above the median.
# The threshold is computed from the data instead of hard-coding 3.
sugar_median = df['residual_sugar'].median()
high_sugar = df.query('residual_sugar >= @sugar_median')

# Make sure every sample appears in exactly one of the two groups:
# the groups must not share any row, and together they must cover the frame.
assert low_sugar.index.intersection(high_sugar.index).empty
# The total is read from `df` directly so this cell does not rely on a
# variable set in a much earlier cell.
df.shape[0] == len(low_sugar) + len(high_sugar)  # should be True

True

In [30]:
# Get the mean quality rating for the low- and high-sugar groups.
# This cell shows the low-sugar group.
low_sugar['quality'].mean()

5.808800743724822

In [31]:
# Mean quality rating of the high-sugar group.
high_sugar['quality'].mean()

5.82782874617737

参考网址：https://github.com/evaseemefly/DAND-QA/blob/master/Intro-DataAnalysis/L2-CaseStudy1/conclusions_query-zh.ipynb