In [1]:
import re
import pandas as pd
import numpy as np

from fixerio import Fixerio
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from measurement.measures import Volume

In [3]:
va_df = pd.read_csv('../data_source/va_prices.csv', skiprows=1)
va_df.columns = va_df.columns.str.lower()

In [4]:
va_df = va_df[va_df.description.str.contains('WHISKEY') | va_df.description.str.contains('WHISKY')]
va_df.head()

Unnamed: 0,description,code,brand,size,age,proof,price
0,STRAIGHT BOURBON WHISKEY,21228,1792 Bourbon Full Proof,750ml,,125.0,$47.99
1,STRAIGHT BOURBON WHISKEY,21232,1792 Port Finish Bourbon,750ml,,88.9,$42.99
2,STRAIGHT BOURBON WHISKEY,21244,1792 Single Barrel Bourbon,750ml,,98.6,$42.99
3,STRAIGHT BOURBON WHISKEY,21236,1792 Small Batch Bourbon,750ml,,93.7,$33.99
4,STRAIGHT BOURBON WHISKEY,21242,1792 Sweet Wheat Bourbon,750ml,,91.2,$39.99


In [5]:
va_df.describe()

Unnamed: 0,code,proof
count,871.0,871.0
mean,20912.814007,84.514696
std,19615.108155,13.127495
min,137.0,0.0
25%,6897.5,80.0
50%,17920.0,80.0
75%,26114.5,90.0
max,100124.0,136.2


In [6]:
va_df.brand.isnull().any()

False

In [7]:
def my_va_transform(s):
    """Function to transform brand
    * make lower
    * do specific regex replacements
    * remove product types ex: 'bourbon', 'whiskey'
    * sort the words in the brand
    """
    s = s.lower()
    
    replacements = {"^gentleman jack whiskey$": "jack daniel's gentleman jack",
                    "^pritchard": "prichard",
                    "^balcones baby blue corn whiskey$": "balcones baby blue",
                    "^canadian club rye whisky$": "canadian club",
                    "^catoctin creek roundstone rye whisky$": "catoctin creek roundstone rye",
                    "^e h taylor jr. straight rye$": "colonel e.h. taylor straight rye",
                    "^e h taylor seasoned wood$": "colonel e.h. taylor seasoned wood",
                    "^james e. pepper 1776 rye": "james e. pepper 1776 straight rye",
                    "^lock stock & barrel 16 yr straight rye whiskey$": "lock stock and barrel 16 straight rye",
                    "^michter's us 1 single barrel straight rye$": "michter's us*1",
                    "^michter's us-1 barrel strength rye": "michter's barrel strength rye",
                    "^old overholt$": "old overholt rye",
                    "^wild turkey russell's reserve rye$": "russell's reserve rye 6",
                    "\s7\s": " seven ",
                    "^defiant whisky$": "defiant",
                    "^michter's us1 sour mash$": "michter sour mash",
                    "^red stag": "jim beam red stag",
                    "^four roses 2015 limited edition small batch$": "four roses limited edition 2015",
                    "^four roses 2016 limited edition small batch$": "four roses small batch limited edition 2016",
                    "^i w harper bourbon$": "i.w. harper",
                    "^jesse james bourbon whiskey$": "original jesse james",
                    "^the": " ",
                    "scotch$": " ",
                    "-": " ",
                    }
    for k, v in replacements.items():
        s = re.sub(k, v, s)
    
    replace = ['bourbon', 'craft',
               'Year Single Barrel Bourbon',
               '-year Single Barrel Bourbon',
               'whiskey', 'tennessee whiskey', 'tennessee',
               'year', 'yr', '-year single barrel',
               'year single barrel', 'label', "'s",
               'decades', 'whisky', '&', 'single malt', "(", ")",
               "yo", "no.", "irish", "’", "us1",
               ]
    
    for item in replace:
        s = s.replace(item, '')
    
    s = s.split() # remove extra spaces betwen words and sort
    s.sort()
    s = " ".join(s)
    
    return s.strip()

In [8]:
my_va_transform('12 glenlivet malt single')

'12 glenlivet malt single'

In [9]:
va_df['alt_brand'] = va_df['brand'].map(lambda x: my_va_transform(x))
va_df.head()

Unnamed: 0,description,code,brand,size,age,proof,price,alt_brand
0,STRAIGHT BOURBON WHISKEY,21228,1792 Bourbon Full Proof,750ml,,125.0,$47.99,1792 full proof
1,STRAIGHT BOURBON WHISKEY,21232,1792 Port Finish Bourbon,750ml,,88.9,$42.99,1792 finish port
2,STRAIGHT BOURBON WHISKEY,21244,1792 Single Barrel Bourbon,750ml,,98.6,$42.99,1792 barrel single
3,STRAIGHT BOURBON WHISKEY,21236,1792 Small Batch Bourbon,750ml,,93.7,$33.99,1792 batch small
4,STRAIGHT BOURBON WHISKEY,21242,1792 Sweet Wheat Bourbon,750ml,,91.2,$39.99,1792 sweet wheat


In [10]:
def my_size_transform(s):
    if 'ml' in s:
        v = Volume(milliliter=s.replace('ml', ''))
    elif 'L' in s:
        v = Volume(liter=s.replace('L', ''))
    return v.us_oz

In [11]:
va_df['oz'] = va_df['size'].map(lambda x: my_size_transform(x))
va_df['alt_price'] = va_df['price'].str.replace('$', '')
va_df['alt_age'] = va_df['age'].str.replace('YR', '')

In [13]:
w_archive = pd.read_csv('../data_source/reddit_archive.csv',
                        names=['timestamp', 'whisky_name', 'reviewer_username', 'link',
                               'rating', 'style', 'bottle_price', 'review_date'],
                        skiprows=1,
                        parse_dates=['timestamp', 'review_date'])

In [14]:
w_archive['rating'] = pd.to_numeric(w_archive['rating'], errors='coerce')
#w_archive['bottle_price'] = pd.to_numeric(w_archive['bottle_price'], errors='coerce')
w_archive['timestamp'] = pd.to_datetime(w_archive['timestamp'], errors='coerce')
w_archive['review_date'] = pd.to_datetime(w_archive['review_date'], errors='coerce')
w_archive['whisky_name'] = w_archive.whisky_name.str.lower()
w_archive['style'] = w_archive['style'].str.lower()
w_archive.dtypes

timestamp            datetime64[ns]
whisky_name                  object
reviewer_username            object
link                         object
rating                      float64
style                        object
bottle_price                 object
review_date          datetime64[ns]
dtype: object

In [15]:
#w_archive.head(20)

In [16]:
fxrio = Fixerio(base='USD')
conversion = fxrio.latest()
import math

def isnumber(num):
    return all(char.isdigit() for char in num)

def transform_currency(s):
    if 'CAD' in str(s) or 'CDN' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['CAD'])
        return new_price
    elif '£' in str(s) or 'GBP' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['GBP'])
        return new_price
    elif 'AUD' in str(s) or 'AUS' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['AUD'])
        return new_price
    elif 'NZD' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['NZD'])
        return new_price
    elif 'EUR' in str(s) or '€' in str(s) or 'Euro' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['EUR'])
        return new_price
    elif 'SEK' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['SEK'])
        return new_price
    elif 'RMB' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['CNY'])
        return new_price
    elif 'DKR' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['DKK'])
        return new_price
    elif 'HKD' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['HKD'])
        return new_price
    elif 'JPY' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['JPY'])
        return new_price
    elif 'ZAR' in str(s):
        price = re.sub(r"""[^0-9]+""", '', s)
        new_price = float(price) / float(conversion['rates']['ZAR'])
        return new_price
    elif type(s) == str and isnumber(s):
        # USD
        return s
    elif type(s) == float and math.isnan(s):
        # NAN
        return s
    elif re.match(r"""^\$*\d+\.*\d+$""", s):
        s = s.replace('$', '')
        return s
    else:
        # Convert everything else to NAN
        print(s)
        return None

In [17]:
w_archive['alt_brand'] = w_archive['whisky_name'].map(lambda x: my_va_transform(x))
w_archive['alt_bottle_price'] = w_archive['bottle_price'].map(lambda x: transform_currency(x))
#w_archive.head(100)

55 CHF
40 euro
$40 USD
$38 USD
$60 USD
$50 USD
$65 USD
$270 ARS


In [18]:
va_new_brand = va_df['alt_brand'].tolist()
wa_new_whisky = w_archive['alt_brand'].tolist()

In [19]:
exact_matches = set()
for brand in va_new_brand:
    if brand in wa_new_whisky:
        exact_matches.add(brand)
print('Matches: {}'.format(len(exact_matches)))
exact_matches = list(exact_matches)
exact_matches.sort()
print('\n'.join(exact_matches))

Matches: 239
1 edition macallan
10 bulleit
10 cork west
10 eagle rare
10 fine macallan oak
10 glenmorangie original the
10 old rip van winkle
101 turkey wild
114 dad grand old
12 balvenie doublewood
12 beam jim signature
12 bowmore
12 bunnahabhain
12 caol ila
12 cask double macallan
12 castle knappogue
12 chivas regal
12 craig elijah
12 dalmore
12 dewar reserve special
12 dickel george
12 glendronach
12 glenfiddich
12 glenlivet
12 hibiki
12 highland park
12 jameson
12 lagavulin
12 macallan
12 medley old
12 tomatin
12 yamazaki
14 balvenie caribbean cask
15 family pappy reserve van winkle
15 fine macallan oak
16 aberlour
16 and barrel lock rye stock straight
17 balvenie doublewood
17 fine macallan oak
1776 barrel e. james pepper proof rye
1776 e. james pepper rye straight
1783 evan williams
1792 batch small
1792 finish port
1792 full proof
1792 high rye
1792 sweet wheat
18 chivas regal
18 glenlivet
18 jameson
18 laphroaig
18 macallan
18 oban
18 talisker
1843 david nicholson
1870 forester

In [20]:
# for brand in va_new_brand:
#     matches = process.extract(brand, wa_new_whisky)
#     if matches[0][1] != 100:
#         print(brand, matches[0])

In [21]:
#va_df[va_df['new_brand'].str.contains('james')]

In [22]:
#w_archive[w_archive['new_whisky_name'].str.contains('stag')]

In [25]:
#va_df.to_csv('data_transformed/va_prices.csv', index=False)

In [24]:
#w_archive.to_csv('data_transformed/reddit_archive.csv', index=False)

## Whisky Critic Matching

In [26]:
def my_wc_transform(s):
    """Function to transform brand
    * make lower
    * remove product types ex: 'bourbon', 'whiskey'
    """

    
    replacements = {"yo\W": " ",
                    "^prichard’s rye$": " prichard ",
                    "^reserve rye straight woodford$": "reserve rye woodford",
                    "^kentucky rebel yell": "rebel yell",
                    "laphroaig triple wood": "laphroaig triplewood",
                    "barrel four roses single": "four roses",
                    "founder glenlivet reserve": "founders glenlivet reserve",
                    "founder’s irishman reserve": "founders irishman reserve",
                    "carribean": "caribbean",
                    "12 glendronach original": "12 glendronach",
                    "dalwhinnie distillers edition": "dalwhinnie distiller edition",
                    "all distiller edition editions glenkinchie": "distiller edition glenkinchie",
                    "dalwhinnie distillers edition": "dalwhinnie distiller edition",
                    "ardbeg uigeadail": "ardbeg uigeadall",
                    "all daniel distiller jack master series": "daniel distiller jack master",
                    "21 balvenie port wood": "21 balvenie portwood",
                    "21 fine macallan oak": "21 fine macallan oak old",
                    "american auchentoshan oak": "american auchentosahn oak",
                    "basil haydens kentucky": "basil hayden",
                    "2016 cairdeas laphroaig madeira": "cairdeas laphroaig madeira",
                    "cooper croze jameson": "cooper croze",
                    "all distillers edition oban vintages": "distillers edition oban",
                    "16 glenlivet nadurra": "glenlivet nadurra",
                    "all midleton rare very vintages": "midleton rare",
                    "mortlach old rare": "mortlach rare",
                    "and rare rich": "rare rich",
                    "high redemption rye": "redemption rye",
                    "1792 finished port": "1792 finish port",
                    }
    for k, v in replacements.items():
        s = re.sub(k, v, s)
    
    replace = ["nas", ]
    
    for item in replace:
        s = s.replace(item, '')
    
    s = s.split() # remove extra spaces between words
    s.sort()
    s = " ".join(s)
    
    return s.strip()

In [27]:
my_wc_transform('my best 12yo scotch')

'12 best my scotch'

In [28]:
wc_df = pd.read_csv('../data_source/Meta-Critic Whisky Database.csv')
wc_df.columns = wc_df.columns.str.lower().str.replace(' ', '_')

In [29]:
wc_df['alt_brand'] = wc_df['whisky'].map(lambda x: my_va_transform(x))
wc_df['alt_brand'] = wc_df['alt_brand'].map(lambda x: my_wc_transform(x))

In [30]:
wc_new_whisky = wc_df['alt_brand'].tolist()

In [31]:
exact_matches = set()
for brand in va_new_brand:
    if brand in wc_new_whisky:
        exact_matches.add(brand)
print('Matches: {}'.format(len(exact_matches)))
exact_matches = list(exact_matches)
exact_matches.sort()
print('\n'.join(exact_matches))

Matches: 162
1 edition macallan
10 bulleit
10 eagle rare
10 fine macallan oak
10 old rip van winkle
101 turkey wild
114 dad grand old
12 balvenie barrel single
12 balvenie doublewood
12 bowmore
12 bunnahabhain
12 caol ila
12 cask double macallan
12 castle knappogue
12 chivas regal
12 craig elijah
12 dalmore
12 dickel george
12 glendronach
12 glenfiddich
12 glenlivet
12 hibiki
12 tomatin
12 yamazaki
14 balvenie caribbean cask
15 family pappy reserve van winkle
15 fine macallan oak
150th anniversary daniel jack
17 balvenie doublewood
17 fine macallan oak
1792 barrel single
1792 finish port
1792 full proof
1792 sweet wheat
18 chivas regal
18 glenlivet
18 laphroaig
18 oban
18 talisker
2 edition macallan
2 gingers
21 ardbeg
21 balvenie portwood
21 bushmills
21 fine macallan oak old
25 cragganmore
25 highland park
25 laphroaig
25 talisker
30 caol ila
30 fine macallan oak
30 talisker
40 lot
46 maker mark
6 reserve russell rye
75th anniversary crown monarch royal
8 islay mist
8 lagavulin
81 ry

In [671]:
va_set = set(va_new_brand)
va_set = list(va_set)
va_set.sort()
for brand in va_set:
    matches = process.extract(brand, wc_new_whisky)
    if matches[0][1] >= 90 and matches[0][1] < 100:
        print(brand, matches[0])

1 balcones texas ('balcones texas', 95)
10 ardbeg islay ('10 ardbeg', 90)
10 glenmorangie original the ('"original" 10 glenmorangie', 95)
100 comfort proof southern ('proof', 90)
100 dad grand old ('100 bib dad grand old', 95)
110 pikesville proof rye ('proof', 90)
12 aberlour ('18 aberlour', 91)
12 dew tullamore ('10 dew tullamore', 94)
12 dewar reserve special ('12 dewar', 90)
12 highland park ('25 highland park', 94)
12 jameson ('jameson', 95)
12 lagavulin ('16 lagavulin', 92)
12 macallan ('12 macallan oak sherry', 90)
12 reserve special van winkle ('12 b lot reserve special van winkle', 95)
14 glenfiddich ('18 glenfiddich', 93)
15 bowmore ('15 bowmore laimrig', 90)
15 caol ila ('25 caol ila', 91)
15 founders glenlivet reserve w/50ml ('founders glenlivet reserve', 95)
15 french glenlivet oak old reserve ('15 french glenlivet oak', 90)
16 aberlour ('18 aberlour', 91)
1776 barrel e. james pepper proof rye ('proof', 90)
18 jameson ('jameson', 95)
18 macallan ('18 macallan oak sherry', 

In [32]:
#wc_df.to_csv('data_transformed/meta_critic.csv', index=False)

## Proof 66 matching

In [33]:
def my_proof_transform(s):
    """Function to transform brand
    * make lower
    * remove product types ex: 'bourbon', 'whiskey'
    """

    
    replacements = {"101 8 turkey wild": "101 turkey wild",
                    "114 dad grand old proof": "114 dad grand old",
                    "81 turkey wild": "81 rye turkey wild",
                   }
    for k, v in replacements.items():
        s = re.sub(k, v, s)
    
    replace = [ ]
    
    for item in replace:
        s = s.replace(item, '')
    
    s = s.split() # remove extra spaces between words
    s.sort()
    s = " ".join(s)
    
    return s.strip()

In [34]:
proof_df = pd.read_csv('../data_source/proof66.csv')
proof_df.columns = proof_df.columns.str.lower().str.replace(' ', '_')

In [35]:
proof_df['alt_brand'] = proof_df['name'].map(lambda x: my_va_transform(x))
proof_df['alt_brand'] = proof_df['alt_brand'].map(lambda x: my_proof_transform(x))

In [36]:
proof_new_whisky = proof_df['alt_brand'].tolist()

In [37]:
exact_matches = set()
for brand in va_new_brand:
    if brand in proof_new_whisky:
        exact_matches.add(brand)
print('Matches: {}'.format(len(exact_matches)))
exact_matches = list(exact_matches)
exact_matches.sort()
print('\n'.join(exact_matches))

Matches: 24
12 aberlour
12 bunnahabhain
12 caol ila
12 dalmore
12 glenfiddich
12 tomatin
15 balvenie cask sherry
16 aberlour
17 fine macallan oak
18 chivas regal
18 macallan
18 talisker
25 highland park
30 macallan oak sherry
30 talisker
8 islay mist
8 lagavulin
ardbeg corryvreckan
bay little oban
black johnnie walker
bradan orach speyburn
cask laphroaig quarter
green johnnie walker
storm talisker


In [38]:
va_set = set(va_new_brand)
va_set = list(va_set)
va_set.sort()
for brand in va_set:
    matches = process.extract(brand, proof_new_whisky)
    if matches[0][1] >= 80 and matches[0][1] < 100:
        print(brand, matches[0])

"m" macallan ('21 douglas drumlanrig macallan', 86)
' blue johnnie of rooster' the walker ('blended hm king the', 86)
. 10 barrel michter single ('10 aberlour', 86)
1 balcones texas ('1 5 bruichladdich octomore', 86)
1 box experiment set wood ('artists blend box compass great king st', 86)
1 edition macallan ('21 douglas drumlanrig macallan', 86)
10 ardbeg islay ('10 ardbeg', 90)
10 barrel limited michter release rye single ('10 aberlour', 86)
10 bulleit ('10 auchentoshan', 86)
10 cask sherry tamdhu ('10 aberlour', 86)
10 cork west ('10 benriach curiositas', 86)
10 eagle rare ('10 benriach curiositas', 86)
10 fine macallan oak ('17 fine macallan oak', 95)
10 glenmorangie original the ('10 glenmorangie', 90)
10 kinahan ('10 auchentoshan', 86)
10 old rip van winkle ('10 aberlour', 86)
10 reserve russell turkey wild ('10 aberlour', 86)
100 dad grand old ('bark grand', 86)
101 gold moonshine pro shiners sweetwater virginia ('18 blended clan gold', 86)
101 turkey wild ('13 mortlach scotsman

In [711]:
proof_df.to_csv('data_transformed/proof66.csv', index=False)