In [1]:
import pandas as pd

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
# Read the VA price list, skipping the first line of the file, and
# lower-case every column label so later cells can use names such as
# `description` and `brand`.
va_df = pd.read_csv('data/VA-Price-List.csv', skiprows=1).rename(columns=str.lower)

In [3]:
# Keep only rows whose description mentions WHISKEY.
# * na=False treats a missing description as "no match"; without it a NaN
#   in the mask makes the boolean indexing raise.
# * .copy() makes the result an independent frame, so adding columns to it
#   later does not trigger pandas' SettingWithCopyWarning.
va_df = va_df[va_df.description.str.contains('WHISKEY', na=False)].copy()
va_df.head()

Unnamed: 0,description,code,brand,size,age,proof,price
0,STRAIGHT BOURBON WHISKEY,21228,1792 Bourbon Full Proof,750ml,,125.0,$47.99
1,STRAIGHT BOURBON WHISKEY,21232,1792 Port Finish Bourbon,750ml,,88.9,$42.99
2,STRAIGHT BOURBON WHISKEY,21244,1792 Single Barrel Bourbon,750ml,,98.6,$42.99
3,STRAIGHT BOURBON WHISKEY,21236,1792 Small Batch Bourbon,750ml,,93.7,$33.99
4,STRAIGHT BOURBON WHISKEY,21242,1792 Sweet Wheat Bourbon,750ml,,91.2,$39.99


In [4]:
# Summary statistics for the numeric columns of the filtered price list.
va_df.describe()

Unnamed: 0,code,proof
count,584.0,584.0
mean,27547.248288,84.769007
std,20154.190746,13.630156
min,145.0,0.0
25%,17829.25,80.0
50%,21677.0,80.5
75%,27283.25,91.0
max,100124.0,136.2


In [5]:
# True if any row is missing its brand; the transform applied to this
# column later calls string methods on every value.
va_df['brand'].isnull().any()

False

In [6]:
def my_va_transform(s):
    """Normalise a product name so names from different sources compare equal.

    Steps:
    * lower-case the name
    * remove product-type phrases, e.g. 'bourbon', 'whiskey'
    * collapse runs of whitespace to single spaces and trim the ends

    Parameters
    ----------
    s : str
        Raw product / brand name.

    Returns
    -------
    str
        The normalised name (may be empty if the name consisted only of
        removed phrases).
    """
    s = s.lower()

    # The name has just been lower-cased, so every phrase must be lower
    # case too or it can never match.
    replace = ['bourbon',
               'year single barrel bourbon',
               '-year single barrel bourbon',
               'whiskey', 'tennessee whiskey']

    # Remove the longest phrases first. Otherwise 'bourbon' / 'whiskey' are
    # stripped before the longer phrases that contain them get a chance to
    # match (leaving e.g. a dangling 'tennessee').
    for item in sorted(replace, key=len, reverse=True):
        s = s.replace(item, '')

    # split() with no arguments drops leading/trailing whitespace and
    # collapses internal runs, so no extra strip() is needed.
    return " ".join(s.split())

In [7]:
# Normalised brand name used for matching against the review archive.
# The function is passed to map() directly; no wrapper lambda is needed.
va_df['new_brand'] = va_df['brand'].map(my_va_transform)
va_df.head()

Unnamed: 0,description,code,brand,size,age,proof,price,new_brand
0,STRAIGHT BOURBON WHISKEY,21228,1792 Bourbon Full Proof,750ml,,125.0,$47.99,1792 full proof
1,STRAIGHT BOURBON WHISKEY,21232,1792 Port Finish Bourbon,750ml,,88.9,$42.99,1792 port finish
2,STRAIGHT BOURBON WHISKEY,21244,1792 Single Barrel Bourbon,750ml,,98.6,$42.99,1792 single barrel
3,STRAIGHT BOURBON WHISKEY,21236,1792 Small Batch Bourbon,750ml,,93.7,$33.99,1792 small batch
4,STRAIGHT BOURBON WHISKEY,21242,1792 Sweet Wheat Bourbon,750ml,,91.2,$39.99,1792 sweet wheat


In [8]:
# Read the Meta-Critic database and convert its column labels to
# lower-case names with underscores instead of spaces.
wa_df = pd.read_csv('data/Meta-Critic Whisky Database.csv')
wa_df.columns = [label.lower().replace(' ', '_') for label in wa_df.columns]

In [9]:
# Preview the first rows of the Meta-Critic table with its renamed columns.
wa_df.head()

Unnamed: 0,whisky,meta_critic,stdev,#,cost,class,super_cluster,cluster,country,type
0,Glenfarclas 40yo,9.25,0.3,11,$$$$$+,SingleMalt-like,ABC,A,Scotland,Malt
1,Amrut Greedy Angels (8yo and 10yo),9.2,0.2,6,$$$$$+,SingleMalt-like,ABC,C,India,Malt
2,Redbreast 21yo,9.19,0.32,13,$$$$$,SingleMalt-like,ABC,C,Ireland,Malt
3,Amrut Spectrum,9.18,0.25,8,$$$$$,SingleMalt-like,ABC,C,India,Malt
4,Highland Park 25yo,9.17,0.24,13,$$$$$+,SingleMalt-like,ABC,C,Scotland,Malt


In [10]:
# Column names to use for the review archive; the file's own first line is
# skipped (skiprows=1) and replaced by these names.
archive_columns = ['timestamp', 'whisky_name', 'reviewer_username', 'link',
                   'rating', 'style', 'bottle_price', 'review_date']
w_archive = pd.read_csv(
    'data/Reddit Whisky Network Review Archive - Review Archive.csv',
    names=archive_columns,
    skiprows=1,
    parse_dates=['timestamp', 'review_date'])

In [11]:
# Force consistent dtypes. errors='coerce' turns unparseable values into
# NaN / NaT instead of raising.
for col in ('rating', 'bottle_price'):
    w_archive[col] = pd.to_numeric(w_archive[col], errors='coerce')
for col in ('timestamp', 'review_date'):
    w_archive[col] = pd.to_datetime(w_archive[col], errors='coerce')
# Lower-case the free-text columns.
for col in ('whisky_name', 'style'):
    w_archive[col] = w_archive[col].str.lower()
w_archive.dtypes

timestamp            datetime64[ns]
whisky_name                  object
reviewer_username            object
link                         object
rating                      float64
style                        object
bottle_price                float64
review_date          datetime64[ns]
dtype: object

In [12]:
# Preview the review archive after dtype coercion and lower-casing.
w_archive.head()

Unnamed: 0,timestamp,whisky_name,reviewer_username,link,rating,style,bottle_price,review_date
0,2012-12-14 10:03:18,100 pipers,merlinblack,http://www.reddit.com/r/Scotch/comments/14uder...,68.0,blend,,2012-12-14
1,2015-06-22 11:40:00,11 wells minnesota 13 white whiskey,KozureOkami,http://www.reddit.com/r/worldwhisky/comments/3...,75.0,white,30.0,2015-06-22
2,2016-10-31 16:14:05,1792 full proof,dmsn7d,https://www.reddit.com/r/bourbon/comments/5aez...,85.0,bourbon,,2016-10-31
3,2016-10-19 11:20:32,1792 full proof,mentel42,https://www.reddit.com/r/bourbon/comments/56f1...,87.0,bourbon,50.0,2016-10-08
4,NaT,1792 full proof,signde,https://www.reddit.com/r/bourbon/comments/52b8...,80.0,bourbon,45.0,2016-09-11


In [13]:
# Apply the same normalisation used for the VA brands, so both sources are
# compared on the same footing.
w_archive['new_whisky_name'] = w_archive['whisky_name'].map(my_va_transform)
w_archive.head()

Unnamed: 0,timestamp,whisky_name,reviewer_username,link,rating,style,bottle_price,review_date,new_whisky_name
0,2012-12-14 10:03:18,100 pipers,merlinblack,http://www.reddit.com/r/Scotch/comments/14uder...,68.0,blend,,2012-12-14,100 pipers
1,2015-06-22 11:40:00,11 wells minnesota 13 white whiskey,KozureOkami,http://www.reddit.com/r/worldwhisky/comments/3...,75.0,white,30.0,2015-06-22,11 wells minnesota 13 white
2,2016-10-31 16:14:05,1792 full proof,dmsn7d,https://www.reddit.com/r/bourbon/comments/5aez...,85.0,bourbon,,2016-10-31,1792 full proof
3,2016-10-19 11:20:32,1792 full proof,mentel42,https://www.reddit.com/r/bourbon/comments/56f1...,87.0,bourbon,50.0,2016-10-08,1792 full proof
4,NaT,1792 full proof,signde,https://www.reddit.com/r/bourbon/comments/52b8...,80.0,bourbon,45.0,2016-09-11,1792 full proof


In [14]:
# Plain Python lists of the normalised names from each source
# (duplicates are kept, in row order).
va_new_brand = list(va_df['new_brand'])
wa_new_whisky = list(w_archive['new_whisky_name'])

In [15]:
# Names that are identical in both sources after normalisation.
# A set intersection gives the same result as testing each brand with
# `in` against the review list, without rescanning that list (one entry
# per review) for every brand.
exact_matches = sorted(set(va_new_brand) & set(wa_new_whisky))
print('Matches: {}'.format(len(exact_matches)))
print('\n'.join(exact_matches))

Matches: 89
1792 full proof
1792 high rye
1792 port finish
1792 small batch
1792 sweet wheat
ancient age
angel's envy rye
basil hayden's
belle meade
booker's
booker's rye
breckenridge
buffalo trace
bulleit
bulleit 10 year
bulleit rye
copper fox rye
crown royal northern harvest rye
david nicholson 1843
dewar's highlander honey
early times
elijah craig small batch
evan williams 1783
evan williams single barrel
fighting cock
filibuster
four roses single barrel
four roses small batch
george dickel rye
hibiki harmony
high west double rye
hirsch small batch reserve
hudson baby
hudson four grain
hudson manhattan rye
irishman single malt
james e. pepper 1776 barrel proof rye
jameson irish
jefferson's reserve
jefferson's reserve groth cask finish
jefferson's very small batch
jim beam
jim beam black
jim beam bonded
jim beam devil's cut
jim beam double oak
jim beam honey
jim beam rye
jim beam single barrel
knob creek
knob creek rye
knob creek single barrel reserve
knob creek smoked maple
kopper k

In [16]:
# The review list has one entry per review, so the same whisky name repeats
# many times. Score each distinct name only once; drop_duplicates() keeps
# first occurrences in their original order, so ties between equally
# scored names resolve the same way as with the full list.
wa_unique_whisky = pd.Series(wa_new_whisky).drop_duplicates().tolist()

for brand in va_new_brand:
    # extractOne returns the single best (choice, score) pair directly,
    # instead of building a ranked top-N list only to take element 0.
    best_match = process.extractOne(brand, wa_unique_whisky)
    print(brand, best_match)

1792 full proof ('1792 full proof', 100)
1792 port finish ('1792 port finish', 100)
1792 single barrel ('abraham bowman double barrel', 86)
1792 small batch ('1792 small batch', 100)
1792 sweet wheat ('1792 sweet wheat', 100)
abraham bowman gingerbread cocoa finish ('abraham bowman gingerbread beer finish', 88)
abraham bowman wheat limited edition ('1792 sweet wheat', 86)
ancient age ('ancient age', 100)
ancient age ('ancient age', 100)
ancient age ('ancient age', 100)
ancient age ('ancient age', 100)
ancient age traveler ('ancient age', 90)
ancient ancient age ('ancient age 100 bib', 95)
ancient ancient age ('ancient age 100 bib', 95)
angel's envy port barrel ("angel's envy", 90)
baker's ('auchentoshan 13 smws 5.42: "bathed in a baker\'s shop"', 90)
barterhouse ('orphan barrel barterhouse 20', 90)
basil hayden's ("basil hayden's", 100)
belle meade ('belle meade', 100)
belle meade sherry finish ('belle meade 9 sherry finished', 93)
belle meade single barrel ('belle meade', 90)
benchmar