# Drawing Conclusions Using Query

In [3]:
# Load 'wine_df_concat.csv,' a file you created in a previous section 
import pandas as pd

df = pd.read_csv(r'C:\Users\itspark\Documents\Analytics\ds code/wine_df_concat.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,pH_Rank,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,369.0,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,577.0,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,249.5,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red


In [4]:
df.shape

(6497, 14)

### Challenge 7 - Remove spaces " " in column names

In [5]:
df = df.rename(columns={"fixed acidity":"fixed_acidity"})
df.head()
# It would be tedious to write all column names

Unnamed: 0,fixed_acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,pH_Rank,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,369.0,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,577.0,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,249.5,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red


In [6]:
# replacing blank spaces with '_'  using for loop
df.columns =[column.replace(" ", "_") for column in df.columns]

In [7]:
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,pH_Rank,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,369.0,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,577.0,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,249.5,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red


### Challenge 8 - Is a certain type of wine associated with higher quality?

In [8]:
# Find the mean quality of each wine type (red and white) with groupby
df.groupby('color').mean().quality 

color
red      5.636023
white    5.877909
Name: quality, dtype: float64

### Challenge 9 - Do wines with higher alcoholic content receive better ratings?

In [11]:
# get the median amount of alcohol content
df.alcohol.median()
#df.alcohol.nunique()

10.3

In [12]:
# select samples with alcohol content less than the median
low_alcohol = df.query('alcohol < 10.3')

# select samples with alcohol content greater than or equal to the median
high_alcohol = df.query('alcohol >= 10.3')

In [13]:
low_alcohol

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,pH_Rank,color
0,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1439.0,red
1,7.8,0.880,0.00,2.60,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,369.0,red
2,7.8,0.760,0.04,2.30,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,577.0,red
3,11.2,0.280,0.56,1.90,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,249.5,red
4,7.4,0.700,0.00,1.90,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1439.0,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6487,6.8,0.220,0.36,1.20,0.052,38.0,127.0,0.99330,3.04,0.54,9.2,5,755.0,white
6488,4.9,0.235,0.27,11.75,0.030,34.0,118.0,0.99540,3.07,0.50,9.4,6,1047.0,white
6491,6.5,0.230,0.38,1.30,0.032,29.0,112.0,0.99298,3.29,0.54,9.7,5,3757.5,white
6493,6.6,0.320,0.36,8.00,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,2066.5,white


In [16]:
df.shape[0]

6497

In [17]:
# create a null dataframe
num_samples = df.shape[0]
#num_samples
# ensure these queries included each sample exactly once
num_samples == low_alcohol['quality'].count() + high_alcohol['quality'].count() # should be True


True

In [41]:
# get median quality rating for the low alcohol and high alcohol groups
low_alcohol.quality.median(), high_alcohol.quality.median()

(5.0, 6.0)

### Challenge 10 - Do sweeter wines receive better ratings?

In [18]:
df.residual_sugar.nunique()

316

In [15]:
# get the median amount of residual sugar
df.residual_sugar.median()

3.0

In [16]:
# select samples with residual sugar less than the median
low_sugar = df.query('residual_sugar < 3')

# select samples with residual sugar greater than or equal to the median
high_sugar = df.query('residual_sugar >= 3')

In [17]:
# ensure these queries included each sample exactly once
num_samples == low_sugar['quality'].count() + high_sugar['quality'].count() # should be True

True

In [18]:
# get median quality rating for the low sugar and high sugar groups
low_sugar.quality.median(), high_sugar.quality.median()

(6.0, 6.0)

### Challenge 11 - What level of acidity receives the highest average rating?

In [None]:
1-100
1.12,1.15,10.23,10.23,10.34
1-10: a
10-20:b
20-30:c

In [19]:
# View the min, 25%, 50%, 75%, max pH values with Pandas describe
df.describe().fixed_acidity

count    6497.000000
mean        7.215307
std         1.296434
min         3.800000
25%         6.400000
50%         7.000000
75%         7.700000
max        15.900000
Name: fixed_acidity, dtype: float64

In [19]:
df.fixed_acidity.nunique()

106

In [20]:
# Bin edges that will be used to "cut" the data into groups
bin_edges = [3.8, 6.4, 7.0, 7.7, 15.9] # Fill in this list with five values you just found

In [21]:
# Labels for the four acidity level groups
bin_names = ['low','medium','med_high','high'] # Name each acidity level category

In [22]:
# Creates acidity_levels column
df['acidity_levels'] = pd.cut(df['fixed_acidity'], bin_edges, labels=bin_names) #Use cut() when you need to segment and sort data values into bins.

In [27]:
# Checks for successful creation of this column
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,pH_Rank,color,acidity_levels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red,med_high
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,369.0,red,high
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,577.0,red,high
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,249.5,red,high
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1439.0,red,med_high


In [28]:
# Find the mean quality of each acidity level with groupby
df.groupby('acidity_levels').mean().quality

acidity_levels
low         5.898256
medium      5.878788
med_high    5.770557
high        5.705727
Name: quality, dtype: float64

In [21]:
# Save changes for the next section
df.to_csv('winequality_edited.csv', index=False)

In [27]:
#shift+t
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]