In [29]:
import pandas as pd
import numpy as np
    
# Show every row when a frame is displayed (no truncation); keep displays to
# small slices such as .head() while this option is active.
pd.set_option('display.max_rows', None)

# Load the review export; `raw` is kept untouched as the reference copy.
raw = pd.read_csv('winemag-data_first150k.csv')

# Work on an independent deep copy, without the 'Unnamed: 0' column
# (presumably a row index saved into the CSV - confirm against the file).
review = raw.copy(deep=True)
review = review.drop(columns=['Unnamed: 0'])
review.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


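The data consists of 10 fields (of the fields listed below, *Title*, *Taster Name* and *Taster Twitter Handle* are not present in this file):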

- *Points*: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)
- *Title*: the title of the wine review, which often contains the vintage if you're interested in extracting that feature
- *Variety*: the type of grapes used to make the wine (e.g. Pinot Noir)
- *Description*: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.
- *Country*: the country that the wine is from
- *Province*: the province or state that the wine is from
- *Region 1*: the wine growing area in a province or state (e.g. Napa)
- *Region 2*: sometimes a more specific region is given within a wine growing area (e.g. Rutherford inside the Napa Valley), but this value can be blank
- *Winery*: the winery that made the wine
- *Designation*: the vineyard within the winery where the grapes that made the wine are from
- *Price*: the cost for a bottle of the wine 
- *Taster Name*: name of the person who tasted and reviewed the wine
- *Taster Twitter Handle*: Twitter handle for the person who tasted and reviewed the wine

In [2]:
# Print column dtypes and non-null counts to spot columns with missing data.
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 10 columns):
country        150925 non-null object
description    150930 non-null object
designation    105195 non-null object
points         150930 non-null int64
price          137235 non-null float64
province       150925 non-null object
region_1       125870 non-null object
region_2       60953 non-null object
variety        150930 non-null object
winery         150930 non-null object
dtypes: float64(1), int64(1), object(8)
memory usage: 11.5+ MB


In [3]:
# Summary statistics for the numeric columns (points and price).
review.describe()

Unnamed: 0,points,price
count,150930.0,137235.0
mean,87.888418,33.131482
std,3.222392,36.322536
min,80.0,4.0
25%,86.0,16.0
50%,88.0,24.0
75%,90.0,40.0
max,100.0,2300.0


In [79]:
# Report every column that contains at least one missing value.
for col in review.columns:
    # Index with the label itself (no str() coercion, which would raise a
    # KeyError for non-string labels) and test for nulls directly instead of
    # building a filtered copy of the frame just to measure its size.
    if review[col].isna().any():
        print(col, 'has missing value')

country has missing value
designation has missing value
price has missing value
province has missing value
region_1 has missing value
region_2 has missing value


In [6]:
# Distinct country labels; the result contains nan, so some reviews have no country.
countries = review['country'].unique()
countries

array(['US', 'Spain', 'France', 'Italy', 'New Zealand', 'Bulgaria',
       'Argentina', 'Australia', 'Portugal', 'Israel', 'South Africa',
       'Greece', 'Chile', 'Morocco', 'Romania', 'Germany', 'Canada',
       'Moldova', 'Hungary', 'Austria', 'Croatia', 'Slovenia', nan,
       'India', 'Turkey', 'Macedonia', 'Lebanon', 'Serbia', 'Uruguay',
       'Switzerland', 'Albania', 'Bosnia and Herzegovina', 'Brazil',
       'Cyprus', 'Lithuania', 'Japan', 'China', 'South Korea', 'Ukraine',
       'England', 'Mexico', 'Georgia', 'Montenegro', 'Luxembourg',
       'Slovakia', 'Czech Republic', 'Egypt', 'Tunisia', 'US-France'],
      dtype=object)

In [43]:
# Rows whose country is missing.
review[review['country'].isna()]

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
1133,,Delicate white flowers and a spin of lemon pee...,Askitikos,90,17.0,,,,Assyrtiko,Tsililis
1440,,"A blend of 60% Syrah, 30% Cabernet Sauvignon a...",Shah,90,30.0,,,,Red Blend,Büyülübağ
68226,,"From first sniff to last, the nose never makes...",Piedra Feliz,81,15.0,,,,Pinot Noir,Chilcas
113016,,"From first sniff to last, the nose never makes...",Piedra Feliz,81,15.0,,,,Pinot Noir,Chilcas
135696,,"From first sniff to last, the nose never makes...",Piedra Feliz,81,15.0,,,,Pinot Noir,Chilcas


In [7]:
# Number of distinct designation values (unique() keeps nan as one of them).
des = review['designation'].unique()
len(des)

30622

In [80]:
# Distinct scores present in the data; no nan among them, so no score is missing.
p = review['points'].unique()
p

array([ 96,  95,  94,  90,  91,  86,  89,  88,  87,  93,  92,  85,  84,
        83,  82,  81, 100,  99,  98,  97,  80])

In [9]:
# Center the scores around the dataset mean.
# The centered values go into a new frame instead of overwriting
# `review.points`, so re-running the notebook top to bottom leaves the raw
# scores available to the later cells that display them.
mean = review.points.mean()
review_centered = review.assign(points=review.points - mean)
review_centered.head(10)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,8.111582,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,8.111582,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,8.111582,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,8.111582,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,7.111582,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude
5,Spain,"Deep, dense and pure from the opening bell, th...",Numanthia,7.111582,73.0,Northern Spain,Toro,,Tinta de Toro,Numanthia
6,Spain,Slightly gritty black-fruit aromas include a s...,San Román,7.111582,65.0,Northern Spain,Toro,,Tinta de Toro,Maurodos
7,Spain,Lush cedary black-fruit aromas are luxe and of...,Carodorum Único Crianza,7.111582,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
8,US,This re-named vineyard was formerly bottled as...,Silice,7.111582,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström
9,US,The producer sources from two blocks of the vi...,Gap's Crown Vineyard,7.111582,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm


In [4]:
# How often each price occurs, most frequent first; show the top ten.
price_counts = review['price'].value_counts()
price_counts.iloc[:10]

20.0    7860
15.0    7056
18.0    5988
25.0    5955
30.0    5449
10.0    5255
12.0    4950
13.0    4479
16.0    4273
35.0    4194
Name: price, dtype: int64

In [22]:
# First 30 distinct prices; nan appears among them, so price has missing values.
review['price'].unique()[:30]

array([235., 110.,  90.,  65.,  66.,  73.,  60.,  80.,  48., 185., 325.,
       290.,  75.,  24.,  79., 220.,  45.,  57.,  62., 105.,  15.,  37.,
        nan,  22.,  42., 135.,  29.,  23.,  17.,  26.])

In [21]:
# Keep the distinct prices around and preview the first 40 of them.
price = review['price'].unique()
price[:40]

array([235., 110.,  90.,  65.,  66.,  73.,  60.,  80.,  48., 185., 325.,
       290.,  75.,  24.,  79., 220.,  45.,  57.,  62., 105.,  15.,  37.,
        nan,  22.,  42., 135.,  29.,  23.,  17.,  26.,  55.,  39.,  69.,
        30.,  50.,  40., 100.,  68.,  28.,  18.])

In [42]:
# Number of reviews without a price.
int(review['price'].isna().sum())

13695

In [40]:
# .loc takes the row selector first and the column selector second:
# country and points for the first ten reviews that have no price.
review.loc[review['price'].isna(), ['country', 'points']].head(10)

Unnamed: 0,country,points
32,Italy,90
56,France,90
72,Italy,91
82,Italy,91
116,Spain,86
242,France,88
261,France,88
282,France,92
294,France,91
323,Spain,94


In [39]:
# Full rows for the first five reviews that have no price.
review[review['price'].isna()].head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
32,Italy,"Underbrush, scorched earth, menthol and plum s...",Vigna Piaggia,90,,Tuscany,Brunello di Montalcino,,Sangiovese,Abbadia Ardenga
56,France,"Delicious while also young and textured, this ...",Le Pavé,90,,Loire Valley,Sancerre,,Sauvignon Blanc,Domaine Vacheron
72,Italy,"This offers aromas of red rose, wild berry, da...",Bussia Riserva,91,,Piedmont,Barolo,,Nebbiolo,Silvano Bolmida
82,Italy,"Berry, baking spice, dried iris, mint and a hi...",Palliano Riserva,91,,Piedmont,Roero,,Nebbiolo,Ceste
116,Spain,Aromas of brandied cherry and crème de cassis ...,Dulce Tinto,86,,Levante,Jumilla,,Monastrell,Casa de la Ermita


In [31]:
# Number of distinct province values (unique() keeps nan as one of them).
prov = review['province'].unique()
len(prov)

456

In [38]:
# Preview the first 40 distinct province values.
prov[: 40]

array(['California', 'Northern Spain', 'Oregon', 'Provence',
       'Northeastern Italy', 'Southwest France', 'Kumeu', 'Washington',
       'Bulgaria', 'Tuscany', 'France Other', 'Rhône Valley', 'Galicia',
       'Andalucia', 'Idaho', 'Burgundy', 'Loire Valley', 'New York',
       'Mendoza Province', 'Victoria', 'Alentejano', 'Piedmont',
       'Alentejo', 'Champagne', 'Upper Galilee', 'Beira Atlantico',
       'Veneto', 'Douro', 'Tejo', 'Stellenbosch', 'Levante',
       'Sicily & Sardinia', 'Southern Italy', 'Languedoc-Roussillon',
       'Bordeaux', 'Atalanti Valley', 'Catalonia', 'Santorini', 'Florina',
       'Marchigue'], dtype=object)

In [37]:
# Rows whose province is missing.
review[review['province'].isna()]

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
1133,,Delicate white flowers and a spin of lemon pee...,Askitikos,90,17.0,,,,Assyrtiko,Tsililis
1440,,"A blend of 60% Syrah, 30% Cabernet Sauvignon a...",Shah,90,30.0,,,,Red Blend,Büyülübağ
68226,,"From first sniff to last, the nose never makes...",Piedra Feliz,81,15.0,,,,Pinot Noir,Chilcas
113016,,"From first sniff to last, the nose never makes...",Piedra Feliz,81,15.0,,,,Pinot Noir,Chilcas
135696,,"From first sniff to last, the nose never makes...",Piedra Feliz,81,15.0,,,,Pinot Noir,Chilcas
