In [1]:
# Import packages that are used
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Keep axis text from being cut off in every figure created later.
# A bare plt.tight_layout() here would only act on an (empty) implicit figure
# that exists at this moment; the rcParam applies tight layout to each new figure.
plt.rcParams["figure.autolayout"] = True

In [5]:
# Load the full wine-review dataset from disk.
csv_path = 'Data/winemag-data-130k-v2.csv'
df = pd.read_csv(csv_path, sep=',')

# Keep only rows whose country contains "France"; rows with a missing
# country are treated as non-matching (na=False).
is_french = df['country'].str.contains('France', na=False)
france = df.loc[is_french]
france.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
9,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam
11,11,France,"This is a dry wine, very spicy, with a tight, ...",,87,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Leon Beyer 2012 Gewurztraminer (Alsace),Gewürztraminer,Leon Beyer


In [7]:
# Drop columns that are not used further, rename region_1 to region,
# and renumber the rows from 0.
wines = (
    france
    .drop(['Unnamed: 0', 'country', 'region_2', 'taster_name', 'taster_twitter_handle'], axis=1)
    .rename(columns={'region_1': 'region'})
    .reset_index(drop=True)
)

# Extract year from title: the first run of four digits found in the title.
# The pattern is a raw string so that `\d` is passed to the regex engine as-is
# (a plain '\d' is an invalid string escape and triggers a warning on newer Pythons).
# expand=False returns a Series, which is what a single-column assignment expects.
wines['year'] = wines['title'].str.extract(r'(\d{4})', expand=False)
wines.head(3)

# Save cleaned data
#wines.to_csv('Data/france_cleaned.csv')

Unnamed: 0,description,designation,points,price,province,region,title,variety,winery,year
0,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,2012
1,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,2012
2,"This is a dry wine, very spicy, with a tight, ...",,87,30.0,Alsace,Alsace,Leon Beyer 2012 Gewurztraminer (Alsace),Gewürztraminer,Leon Beyer,2012


In [8]:
# Distinct province values present in the French wines.
wines['province'].unique()

array(['Alsace', 'Beaujolais', 'Bordeaux', 'Champagne', 'Burgundy',
       'France Other', 'Southwest France', 'Rhône Valley',
       'Languedoc-Roussillon', 'Provence', 'Loire Valley'], dtype=object)

In [14]:
# Distinct region values recorded for wines of the 'Southwest France' province.
in_southwest = wines['province'] == 'Southwest France'
wines.loc[in_southwest, 'region'].unique()

array(['Cahors', 'Madiran', 'Vin de Pays des Côtes de Gascogne',
       'Côtes de Gascogne', 'Côtes du Lot', 'Gaillac', 'Montravel',
       'Monbazillac', 'Gers', 'Fronton', 'Bergerac', 'Jurançon',
       'Bergerac Sec', 'Buzet', 'Irrouléguy', 'Côtes du Marmandais',
       'Pécharmant', 'Vin de Pays du Comté Tolosan', nan, 'Côtes du Tarn',
       'Côtes de Bergerac', 'Pacherenc du Vic Bilh', 'Jurançon Sec',
       'Côtes de Duras', 'Vin de Pays du Lot', 'Marcillac', 'Saint-Mont',
       'Saussignac', 'Côtes du Frontonnais', 'Brulhois', 'Landes',
       'Périgord', 'Bergerac Rosé', 'Lot', 'Béarn', 'Gaillac Doux',
       'Côtes de Saint-Mont', 'Côtes de Montravel'], dtype=object)

In [21]:
# Fold variant region labels into a single name each.
# Assignment goes through .loc on `wines` itself: chained indexing such as
# wines[mask].region = ... would write to a temporary copy and leave `wines` unchanged.
region_aliases = {
    'Vin de Pays des Côtes de Gascogne': 'Côtes de Gascogne',
    'Bergerac Sec': 'Bergerac',
}
for alias, canonical in region_aliases.items():
    wines.loc[wines['region'] == alias, 'region'] = canonical

In [22]:
# Re-list the distinct region values for 'Southwest France' to inspect the
# result of the region renaming.
in_southwest = wines['province'] == 'Southwest France'
wines.loc[in_southwest, 'region'].unique()

array(['Cahors', 'Madiran', 'Côtes de Gascogne', 'Côtes du Lot',
       'Gaillac', 'Montravel', 'Monbazillac', 'Gers', 'Fronton',
       'Bergerac', 'Jurançon', 'Buzet', 'Irrouléguy',
       'Côtes du Marmandais', 'Pécharmant',
       'Vin de Pays du Comté Tolosan', nan, 'Côtes du Tarn',
       'Côtes de Bergerac', 'Pacherenc du Vic Bilh', 'Jurançon Sec',
       'Côtes de Duras', 'Vin de Pays du Lot', 'Marcillac', 'Saint-Mont',
       'Saussignac', 'Côtes du Frontonnais', 'Brulhois', 'Landes',
       'Périgord', 'Bergerac Rosé', 'Lot', 'Béarn', 'Gaillac Doux',
       'Côtes de Saint-Mont', 'Côtes de Montravel'], dtype=object)

In [15]:
# Hand-written groupings of wine region names.
# NOTE(review): the variable names suggest grouping by French administrative
# region — confirm the intended meaning and the assignment of each entry.
midi_pyrenees = [
    'Cahors',
    'Madiran',
    'Côtes de Gascogne',
    'Côtes du Lot',
    'Gaillac',
    'Gers',
    'Fronton',
    'Côtes du Tarn',
    '',  # NOTE(review): empty-string entry looks like an unfinished placeholder
]

# NOTE(review): name is presumably a misspelling of "languedoc_roussillon";
# the list currently holds only an empty-string placeholder.
laguedoc_roussillion = ['']

aquitaine = [
    'Montravel',
    'Monbazillac',
    'Bergerac',
    'Jurançon',
    'Buzet',
    'Irrouléguy',
    'Côtes du Marmandais',
    'Pécharmant',
    'Côtes de Bergerac',
]