In [26]:
# Notebook-wide setup.
import warnings
# Install a blanket 'ignore' filter so no warnings are shown from here on.
# It is set before the imports below, so their import-time warnings are hidden too.
warnings.filterwarnings('ignore')

import matplotlib as mpl
# Default figure resolution (dots per inch) for matplotlib figures.
mpl.rcParams['figure.dpi']= 100

In [27]:
# import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
import re

%matplotlib inline

Let's start by reading in the data and checking the first few rows.

In [28]:
# Read the reviews CSV, using its first column as the row index,
# then preview the first two rows.
reviews = pd.read_csv(
    "../input/winemag-data-130k-v2.csv",
    index_col=0,
)
reviews.head(n=2)

## Which country has the most reviewed wines?

In [29]:
# Ten most frequent countries as a two-column frame: country, count.
df = (
    reviews['country']
    .value_counts()
    .head(10)
    .rename_axis('country')
    .reset_index(name='count')
)

In [30]:
# Horizontal bar chart of the per-country review counts held in `df`.
(
    ggplot(df, aes(x='country', y='count'))
    + geom_col(fill='darkslateblue')
    + labs(title="Countries with Most Reviews", x='Country', y='Reviews')
    + coord_flip()
    + theme(figure_size=(8, 6))
)

## Who are the most frequent tasters?

In [31]:
# show the first two rows whose taster_name is missing
taster_is_missing = reviews['taster_name'].isna()
reviews[taster_is_missing].head(2)

The taster_name column has some missing values. Let's label these with the word 'Unknown'.

In [32]:
# fill missing values with "Unknown"
# Assign the filled column back rather than calling fillna(inplace=True) on the
# attribute-accessed column: that chained in-place form emits a warning on
# recent pandas and leaves `reviews` unchanged when Copy-on-Write is enabled.
reviews['taster_name'] = reviews['taster_name'].fillna('Unknown')

In [33]:
# Number of reviews per taster as a two-column frame: Taster, Reviews.
df = (
    reviews['taster_name']
    .value_counts()
    .rename_axis('Taster')
    .reset_index(name='Reviews')
)

# Horizontal bars with each count printed at the end of its bar.
(
    ggplot(df, aes(x='Taster', y='Reviews'))
    + geom_col(fill='gold', width=0.85)
    + labs(title="Reviews by Each Taster", x='Taster', y='Reviews')
    + geom_text(aes(label='Reviews'), size=9.5, ha='left')
    + coord_flip(ylim=(0, 30000))
    + theme(figure_size=(8, 10))
)

## How are the review scores distributed?

In [34]:
# Bar chart counting how many reviews received each `points` value.
(
    ggplot(reviews, aes(x='points'))
    + geom_bar(fill='maroon')
    + labs(title="Review Scores", x='Score', y='Count')
    + theme(figure_size=(8, 6))
)

## Which variety is reviewed most frequently?

In [35]:
# Rows per variety (column 'len'), largest first, limited to the top ten.
variety_sizes = reviews.groupby('variety')['variety'].agg([len])
df = (
    variety_sizes
    .sort_values(by='len', ascending=False)
    .reset_index()
    .head(10)
)

# Horizontal bars coloured by count, with the count printed inside each bar.
(
    ggplot(df, aes(x='variety', y='len', fill='len'))
    + geom_col()
    + labs(title="Most Frequently Reviewed Varieties", x='Variety', y='Count')
    + geom_text(aes(label='len'), ha='right', nudge_y=-100)
    + coord_flip(ylim=(0, 20000))  # rotate axis
    + scale_fill_cmap('Paired')  # set custom colormap
    + guides(fill=False)  # remove legend
    + theme(figure_size=(8, 6))
)

In [36]:
# keep only the reviews whose variety appears in the frequent-variety table `df`
is_common_variety = reviews['variety'].isin(df['variety'].values)
common_wines = reviews.loc[is_common_variety]

## What scores did the frequent varieties receive?

In [37]:
# 2D-binned counts of score (x) against variety (y) for the frequent varieties.
(
    ggplot(common_wines, aes(x='points', y='variety'))
    + geom_bin2d(bins=20)
    + coord_fixed(ratio=1)
    + labs(
        title="Review Scores for Frequently Reviewed Varieties",
        x='Score',
        y='Variety',
    )
    + scale_fill_cmap('RdPu')
    + theme(figure_size=(8, 4))
)

## How are these wines priced?

In [38]:
# Mean price per frequent variety, rounded to two decimals.
mean_price_by_variety = common_wines.groupby('variety')['price'].mean()
df = mean_price_by_variety.reset_index().round(2)

# Horizontal bars with the mean price printed inside each bar.
(
    ggplot(df, aes(x='variety', y='price', fill='variety'))
    + geom_col(fill='skyblue')
    + labs(
        title="Average Prices of Frequently Reviewed Wines",
        x='Variety',
        y='Price',
    )
    + geom_text(aes(label='price'), ha='right', nudge_y=-0.8)
    + coord_flip()
    + guides(fill=False)
    + theme(figure_size=(8, 6))
)

It looks like the frequently reviewed wine varieties can be crudely divided into three price categories: the low-end range (around \$20), the mid-range (around \$35), and the high-end range (around \$45).

## Do expensive wines get higher review scores?

Let's compare the two wine varieties *Cabernet Sauvignon* and *Rosé*, which have the highest and the lowest average price respectively.

In [39]:
# Scatter of score against price for the two selected varieties, coloured by variety.
is_selected_variety = common_wines['variety'].isin(['Rosé', 'Cabernet Sauvignon'])

(
    ggplot(
        common_wines.loc[is_selected_variety],
        aes(x='points', y='price', color='variety'),
    )
    + geom_point()
    + labs(title="Review Scores Vs. Price", x='Score', y='Price')
    + theme(figure_size=(8, 6))
)

It appears that the *Cabernet Sauvignon* wines generally score higher and are more expensive than the *Rosé* wines. 

Some of the *Cabernet Sauvignon* wines are in fact very expensive, priced at $400 and above. 

One  *Rosé* wine was priced at $800, being an exception from the other wines of the same variety. Despite its extraordinarily high price, it received a rather average score of 87.

## Which winery makes the best wines?

In [40]:
# Rank wineries by their mean review score and keep the ten highest.
# The aggregated column holds average *points*, so it is named and labelled
# as a score rather than a price.
# NOTE(review): no minimum number of reviews is required here, so a winery
# with very few reviews can top the ranking; consider a count threshold.
df = (reviews
    .groupby('winery').points.agg(['mean'])
    .sort_values(by='mean', ascending=False)
    .reset_index()
    .rename(columns={'mean': 'avg_points'})
    .round(2)
    .head(10)
)

# Horizontal bars with the average score printed inside each bar; the value
# axis is limited to 90-100 so the differences between wineries are visible.
(ggplot(df)
 + aes(x='winery', y='avg_points')
 + geom_col(fill='#DB6058')
 + ggtitle("Top 10 Wineries")
 + geom_text(aes(label='avg_points'), color='white', ha='right', nudge_y=-0.15)
 + xlab('Winery')
 + ylab('Average Score')
 + coord_flip(ylim=(90, 100))
 + theme(figure_size=(8, 6))
)

## What year were the wines produced?

In [41]:
# extract year from title column
# The pattern is a raw string (so `\d` is not an invalid escape), uses a
# non-capturing group for the century prefix, and the digit lookarounds keep it
# from matching four digits embedded in a longer number. The first match in
# each title is kept as a string; titles without a match get NaN.
reviews['year'] = reviews['title'].str.extract(
    r'(?<!\d)((?:19|20)\d{2})(?!\d)', expand=False
)

# check result
reviews.head(2)

In [53]:
# Keep only reviews with an extracted year and convert that year to float.
# The conversion is computed from the filtered frame itself (via the lambda)
# instead of from the unfiltered `reviews`, so it does not depend on index
# alignment between the two frames.
df = (reviews
    .loc[reviews['year'].notnull()]
    .assign(year=lambda frame: frame['year'].astype('float64'))
)

# Number of reviews per year, with an x tick every five years.
(
    ggplot(df, aes(x='year'))
    + geom_bar(width=0.7)
    + labs(title="Years of Wines", x='Year', y='Count')
    + theme(
        axis_text_x=element_text(rotation=90),
        figure_size=(10, 4),
    )
    + scale_x_continuous(breaks=range(1900, 2020, 5))
)

The histogram is heavily skewed. Most of the reviews were on wines produced after the year 2000. Let's take a look at the distribution for the years prior to 2000.

In [54]:
# Same per-year counts, zoomed to 1900-2000 with the y-axis limited to 0-20.
(
    ggplot(df, aes(x='year'))
    + geom_bar(width=0.7)
    + labs(title="Years of Wines (1900 ~ 2000)", x='Year', y='Count')
    + theme(
        axis_text_x=element_text(rotation=90),
        figure_size=(10, 4),
    )
    + scale_x_continuous(breaks=range(1900, 2020, 5))
    + scale_y_continuous(breaks=range(0, 21, 1))
    + coord_cartesian(xlim=(1900, 2000), ylim=(0, 20))
)

To take a closer look at the wines produced in the previous century, I have capped the y-axis at 20. In fact, the vast majority of the years in this period have fewer than 10 reviewed wines each, except for the years after 1990.

## Are older wines more expensive?

In [59]:
# Scatter of price against year, one small point per review.
(
    ggplot(df, aes(x='year', y='price'))
    + geom_point(size=0.5)
    + labs(title="Price of Wines Vs. Year", x='Year', y='Price')
    + theme(
        axis_text_x=element_text(rotation=90),
        figure_size=(10, 4),
    )
    + scale_x_continuous(breaks=range(1900, 2020, 5))
)

Compared with the majority of the more recent wines, the older wines do appear to be more expensive, and their prices tend to rise with age.

How about the review scores? Do older wines taste better as well? Let's find out.

## Do older wines taste better?

In [58]:
# Scatter of review score against year, one small point per review.
(
    ggplot(df, aes(x='year', y='points'))
    + geom_point(size=0.5)
    + labs(title="Review Scores Vs. Year", x='Year', y='Score')
    + theme(
        axis_text_x=element_text(rotation=90),
        figure_size=(10, 4),
    )
    + scale_x_continuous(breaks=range(1900, 2020, 5))
)

Interestingly, most old wines received high review scores above 90. This might be interpreted in two ways; for one, it may be that older wines actually do taste better. On the other hand, we might be witnessing a sampling bias in which the older wines selected for review were high-quality wines in the first place, thus were preserved as valuable assets for decades. 

Finally, let's plot the years of wines against the prices, and represent each point with the respective review score.

In [64]:
# Price against year, with each point's size mapped to its review score.
(
    ggplot(df, aes(x='year', y='price', size='points'))
    + geom_point(fill='#DB6058', color='lightgray', alpha=0.25)
    + labs(title="Price - Year - Review Score", x='Year', y='Price')
    + theme(
        axis_text_x=element_text(rotation=90),
        figure_size=(10, 4),
    )
    + scale_x_continuous(breaks=range(1900, 2020, 5))
    + scale_size_radius(range=(3, 12))
)