# Whiskey Recommendation System

Importing Libraries

In [1]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Loading the dataset

In [2]:
# Location of the Scotch review CSV. Set the WHISKEY_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset, the
# original local path is used so existing runs keep working.
DATA_PATH = os.environ.get(
    "WHISKEY_DATA_PATH",
    "C:/Users/deval/OneDrive/Desktop/School/Projects/Whiskey Recommendation System/archive/scotch_review2020.csv",
)
df = pd.read_csv(DATA_PATH)

Dataset Information

In [3]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   2247 non-null   int64 
 1   name                 2247 non-null   object
 2   category             2247 non-null   object
 3   review.point         2247 non-null   int64 
 4   price                2247 non-null   object
 5   currency             2247 non-null   object
 6   description.1.2247.  2208 non-null   object
dtypes: int64(2), object(5)
memory usage: 123.0+ KB
None


Summary Statistics

In [4]:
# Descriptive statistics for the numeric columns of the review table.
numeric_summary = df.describe()
print(numeric_summary)

                id  review.point
count  2247.000000   2247.000000
mean   1124.000000     88.477526
std     648.797349      2.949306
min       1.000000     83.000000
25%     562.500000     86.000000
50%    1124.000000     88.000000
75%    1685.500000     91.000000
max    2247.000000     97.000000


Missing Values

In [5]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

id                      0
name                    0
category                0
review.point            0
price                   0
currency                0
description.1.2247.    39
dtype: int64


Data Preprocessing

In [7]:
# Columns whose text is concatenated into one document per whiskey.
features = ['name', 'category', 'review.point', 'description.1.2247.']

# Fill missing values with an empty string BEFORE casting to str. Otherwise
# every missing description becomes the literal token "nan", which ends up in
# the TF-IDF vocabulary and makes whiskeys with no description look alike.
df['features'] = (
    df[features]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)
print(df[['name', 'features']].head())

                                                name  \
0      Black Bowmore 42 year old 1964 vintage, 40.5%   
1        Bowmore 46 year old (distilled 1964), 42.9%   
2                     Johnnie Walker Blue Label, 40%   
3    Glenlivet Cellar Collection 1969 vintage, 50.8%   
4  The Macallan 29 year old 1976 Vintage (Cask #1...   

                                            features  
0  Black Bowmore 42 year old 1964 vintage, 40.5% ...  
1  Bowmore 46 year old (distilled 1964), 42.9% Si...  
2  Johnnie Walker Blue Label, 40% Blended Scotch ...  
3  Glenlivet Cellar Collection 1969 vintage, 50.8...  
4  The Macallan 29 year old 1976 Vintage (Cask #1...  


Creating a Similarity Matrix

In [8]:
# Turn each whiskey's combined text into a TF-IDF vector, ignoring English
# stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['features'])

# With the second argument omitted, linear_kernel compares the matrix with
# itself. TfidfVectorizer L2-normalises rows by default, so these dot products
# are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)
print(cosine_sim.shape)

(2247, 2247)


Recommendation Function

In [13]:
def recommend_whiskeys(name, cosine_sim=cosine_sim, df=df, num_recommendations=5):
    """Return the names of the whiskeys most similar to ``name``.

    Parameters
    ----------
    name : str
        Whiskey to look up. Matching against ``df['name']`` is exact and
        case-sensitive; the first matching row is used.
    cosine_sim : 2-D array-like
        Pairwise similarity scores. Row/column ``i`` must correspond to the
        ``i``-th row of ``df`` by position.
    df : pandas.DataFrame
        Frame with a ``'name'`` column.
    num_recommendations : int
        Maximum number of names to return. Negative values are treated as 0.

    Returns
    -------
    pandas.Series
        Names of the most similar whiskeys, best match first, never including
        the queried row itself. If ``name`` is not found, a fixed-seed random
        sample of names is returned instead (capped at ``len(df)``).

    Notes
    -----
    The ``cosine_sim`` and ``df`` defaults are bound when this cell runs.
    Re-run the cell after rebuilding either object, or pass them explicitly.
    """
    num_recommendations = max(num_recommendations, 0)

    # Work with row positions, not index labels: cosine_sim and .iloc are both
    # positional, so labels would be wrong for a filtered or re-indexed frame.
    match_positions = (df['name'] == name).to_numpy().nonzero()[0]

    if len(match_positions) == 0:
        # Unknown name: best-effort fallback to a reproducible random sample.
        # Cap the size so a small frame cannot make sample() raise.
        sample_size = min(num_recommendations, len(df))
        return df['name'].sample(n=sample_size, random_state=42)

    query_pos = int(match_positions[0])
    scores = cosine_sim[query_pos]

    # Exclude the query row explicitly instead of assuming it sorts first;
    # a duplicate or tied entry could otherwise push the query into the
    # results. sorted() is stable, so ties keep ascending row order.
    ranked_positions = sorted(
        (pos for pos in range(len(scores)) if pos != query_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    return df['name'].iloc[ranked_positions[:num_recommendations]]

Recommending a Whiskey

In [17]:
# Ask for whiskeys similar to a known bottle, using the default settings.
query_name = 'Johnnie Walker Blue Label, 40%'
recommendations = recommend_whiskeys(query_name)
print(recommendations)

12                   Johnnie Walker Blue Anniversary, 60%
86      Johnnie Walker Blue Label Ghost and Rare Port ...
1345                Johnnie Walker Gold, 18 year old, 40%
71                                   The John Walker, 40%
769                      Johnnie Walker Sweet Peat, 40.8%
Name: name, dtype: object
