<a href="https://colab.research.google.com/github/chetannitk/BeerDataScienceAssignment/blob/master/notebooks/02-Beer_review_natural_language_understanding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<div class="alert alert-success">
<h1> Beer Data Science Assignment </h1>
</div>


In [0]:
from IPython.display import HTML
import pandas as pd
import os
import numpy as np
from datetime import datetime
import wordcloud
import warnings
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
warnings.filterwarnings('ignore')

#BASE_PATH = "../"
#INPUT_DATA_PATH = os.path.join(BASE_PATH, "data/raw/BeerDataScienceProject.csv")

  import pandas.util.testing as tm
Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  # Remove the CWD from sys.path while we load stuff.


In [0]:
## Upload BeerDataScienceProject.csv file from local system.
## Google will ask for authentication. Click on the link prompt and get the token.
## Pass the token in the input. It will save the BeerDataScienceProject.csv in /tmp dir.
import zipfile
from google.colab import drive

drive.mount('/content/drive/')

zip_ref = zipfile.ZipFile("/content/drive/My Drive/data/BeerDataScienceProject.zip", 'r')
zip_ref.extractall("/tmp")
zip_ref.close()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
!ls -lrt /tmp

total 405176
drwx------ 2 root root      4096 May 31 13:46 tmpf9jpugwd
srw------- 1 root root         0 May 31 13:48 drivefs_ipc.0
srw------- 1 root root         0 May 31 13:48 drivefs_ipc.0_shell
drwxr-xr-x 3 root root      4096 May 31 13:58 tfhub_modules
-rw-r--r-- 1 root root 414885425 May 31 14:40 BeerDataScienceProject.csv


In [0]:
import io
df_beer_raw = pd.read_csv('/tmp/BeerDataScienceProject.csv', encoding="ISO-8859-1")

In [0]:
df_beer_raw.shape

(528870, 13)

In [0]:
df_beer_raw.dtypes

beer_ABV              float64
beer_beerId             int64
beer_brewerId           int64
beer_name              object
beer_style             object
review_appearance     float64
review_palette        float64
review_overall        float64
review_taste          float64
review_profileName     object
review_aroma          float64
review_text            object
review_time             int64
dtype: object

### Data Cleaning
```In the below cell we can see that there are some columns where data is missing. 
   We need to remove those data points from our data set.
```

In [0]:
df_beer_raw.isna().sum()

beer_ABV              20280
beer_beerId               0
beer_brewerId             0
beer_name                 0
beer_style                0
review_appearance         0
review_palette            0
review_overall            0
review_taste              0
review_profileName      115
review_aroma              0
review_text             119
review_time               0
dtype: int64

In [0]:
df_beer = df_beer_raw.dropna()

In [0]:
df_beer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 508358 entries, 0 to 528796
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   beer_ABV            508358 non-null  float64
 1   beer_beerId         508358 non-null  int64  
 2   beer_brewerId       508358 non-null  int64  
 3   beer_name           508358 non-null  object 
 4   beer_style          508358 non-null  object 
 5   review_appearance   508358 non-null  float64
 6   review_palette      508358 non-null  float64
 7   review_overall      508358 non-null  float64
 8   review_taste        508358 non-null  float64
 9   review_profileName  508358 non-null  object 
 10  review_aroma        508358 non-null  float64
 11  review_text         508358 non-null  object 
 12  review_time         508358 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 54.3+ MB


In [0]:
df_beer = df_beer.reset_index()
df_beer['id'] = df_beer.index

## Data Changes in Raw and Clean dataset

In [0]:
print("Raw data size: ", df_beer_raw.shape[0])
print("Clean data size: ", df_beer.shape[0])
print("Data points difference: ", df_beer_raw.shape[0]-df_beer.shape[0])

Raw data size:  528870
Clean data size:  508358
Data points difference:  20512


### Describe the categorical features

In [0]:
df_beer.describe(include=[np.object])

Unnamed: 0,beer_name,beer_style,review_profileName,review_text
count,508358,508358,508358,508358
unique,14028,104,22563,508007
top,Sierra Nevada Celebration Ale,American IPA,northyorksammy,#NAME?
freq,2998,42092,1732,90


### Describe the numerical features

In [0]:
df_beer.describe(include=[np.number])

Unnamed: 0,index,beer_ABV,beer_beerId,beer_brewerId,review_appearance,review_palette,review_overall,review_taste,review_aroma,review_time,id
count,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0
mean,263669.490377,7.0174,21824.12084,2534.273207,3.872676,3.768993,3.840805,3.775327,3.827646,1226175000.0,254178.5
std,152590.091973,2.204528,22124.969107,5237.843825,0.601764,0.682355,0.706408,0.665586,0.715121,75308430.0,146750.458409
min,0.0,0.01,5.0,1.0,0.0,1.0,0.0,1.0,1.0,884390400.0,0.0
25%,131284.25,5.3,1673.0,132.0,3.5,3.5,3.5,3.5,3.5,1177202000.0,127089.25
50%,263756.5,6.5,13850.0,392.0,4.0,4.0,4.0,4.0,4.0,1241503000.0,254178.5
75%,395689.75,8.5,40418.0,1315.0,4.0,4.0,4.5,4.0,4.5,1289073000.0,381267.75
max,528796.0,57.7,77310.0,27980.0,5.0,5.0,5.0,5.0,5.0,1326277000.0,508357.0


In [0]:
NUMERICAL_ATTRIBUTE = ['beer_ABV', 'review_appearance', 'review_palette', 
                       'review_overall', 'review_taste', 'review_aroma', 'review_time']
CATEGORICAL_ATTRIBUTE = ['beer_name', 'beer_style', 'review_profileName', 'review_text']

In [0]:
df_beer.sample(4)

Unnamed: 0,index,beer_ABV,beer_beerId,beer_brewerId,beer_name,beer_style,review_appearance,review_palette,review_overall,review_taste,review_profileName,review_aroma,review_text,review_time,id
79813,82399,9.2,56761,140,30th Anniversary - Fritz And Ken's Ale,American Double / Imperial Stout,3.5,4.0,4.0,4.0,froghop,4.5,pours a jet black with a medium size brown hea...,1270251733,79813
7686,8141,6.6,3365,395,Hell For Certain,Belgian Dark Ale,4.0,4.0,4.0,3.5,tmitch75,4.5,Clear dark orange (almost red) color. A half i...,1209915737,7686
453894,471948,6.2,36613,392,Muse Farmhouse Ale,Saison / Farmhouse Ale,4.0,3.5,3.0,3.5,weeare138,3.5,Appears a hazy gold with a small white cap tha...,1179328511,453894
379955,394300,7.0,68916,20681,Lunch,American IPA,3.0,3.5,4.0,4.5,kbuzz,4.0,Poured from a 16.9 oz bottle into a snifter. D...,1319762324,379955


<div class="alert alert-info">
Convert the review timestamp to datetime format
</div>

In [0]:
df_beer['d_review_time'] = df_beer.review_time.apply(lambda t: datetime.fromtimestamp(t))

In [0]:
df_beer['d_review_time_year'] = df_beer['d_review_time'].dt.year
df_beer['d_review_time_month'] = df_beer['d_review_time'].dt.month

<div class="alert alert-success">
Q7) How do find similar beer drinkers by using written reviews only?   
</div>

In [0]:
!pip install tensorflow_text



In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
tf.executing_eagerly()

True

In [0]:
## Get the pretrained embedding.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

In [0]:
train = df_beer.review_text.tolist()
list_train = [train[i:i+100] for i in range(0,len(train),100)]

In [0]:
!pip install faiss-gpu 



In [0]:
import faiss

dimension = 512
nlist = 5  # number of clusters
quantiser = faiss.IndexFlatL2(dimension)  
index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)


## Index the embedding vectors in facebook AI similarity server in order to get faster similarity results.

In [0]:

for x in list_train:
    db_vectors = embed(x).numpy()
    print(index.is_trained)   # False
    index.train(db_vectors)  # train on the database vectors
    print(index.ntotal)   # 0
    index.add(db_vectors)   # add the vectors and update the index
    print(index.is_trained)  # True
    print(index.ntotal)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
383400
True
383500
True
383500
True
383600
True
383600
True
383700
True
383700
True
383800
True
383800
True
383900
True
383900
True
384000
True
384000
True
384100
True
384100
True
384200
True
384200
True
384300
True
384300
True
384400
True
384400
True
384500
True
384500
True
384600
True
384600
True
384700
True
384700
True
384800
True
384800
True
384900
True
384900
True
385000
True
385000
True
385100
True
385100
True
385200
True
385200
True
385300
True
385300
True
385400
True
385400
True
385500
True
385500
True
385600
True
385600
True
385700
True
385700
True
385800
True
385800
True
385900
True
385900
True
386000
True
386000
True
386100
True
386100
True
386200
True
386200
True
386300
True
386300
True
386400
True
386400
True
386500
True
386500
True
386600
True
386600
True
386700
True
386700
True
386800
True
386800
True
386900
True
386900
True
387000
True
387000
True
387100
True
387100
True
387200
True
387200
True
387300
True

In [0]:
faiss.write_index(index,"/content/drive/My Drive/data/beer_review_index_v1")


In [0]:
from  more_itertools import unique_everseen

inp_query = """I like a hoppy beer, but I found this a bit unbalanced. Has a great head and smell. Goes great with spaghetti."""
query_vectors = embed([inp_query]).numpy()
answer = []
distances, indices = index.search(query_vectors, 100)
for ind in list(unique_everseen(indices[0])):
    review_text = df_beer[df_beer['id']==ind].review_text.tolist()[0]
    beer_name = df_beer[df_beer['id']==ind].beer_name.tolist()[0]
    beer_style = df_beer[df_beer['id']==ind].beer_style.tolist()[0]
    answer.append((review_text, beer_name, beer_style))

df_ans = pd.DataFrame(answer, columns=['Review_Text', 'Beer_Name', 'Beer_Style'])
df_ans.head(10)

Unnamed: 0,Review_Text,Beer_Name,Beer_Style
0,"I like a hoppy beer, but I found this a bit un...",Dead Guy Ale,Maibock / Helles Bock
1,This beer has everything in my opinion. The ap...,Maudite,Belgian Strong Dark Ale
2,I poured this beer into a half-liter glass. Th...,Aecht Schlenkerla Rauchbier Märzen,Rauchbier
3,This is a very decent beer but nothing about i...,Sierra Nevada Southern Hemisphere Harvest Fres...,American IPA
4,This beer pours well with a good head that is ...,Chipotle Ale,Chile Beer
5,This beer pours well and leaves a solid head. ...,Boddingtons Pub Ale,English Pale Ale
6,"Good beer. Has a sweet hops flavor, mixed with...",Raging Bitch Belgian-Style IPA,Belgian IPA
7,Grabbed this one out of my fridge and poured i...,Sierra Nevada Pale Ale,American Pale Ale (APA)
8,"This is my favorite beer,so far. The hoppy swe...",Racer 5 India Pale Ale,American IPA
9,The beer is a dark copper with a good sized he...,Sierra Nevada Celebration Ale,American IPA



## REALTIME BEER RECOMENDATION BASED ON REVIEW-TEXT SIMILARITY.


In [0]:
from  more_itertools import unique_everseen
while True:
    print("Enter review Query? ... Press 'q' to Quit")
    print("=="*10)
    inp_query = input()
    if inp_query in ['q', 'Q']:
        break
    query_vectors = embed([inp_query]).numpy()
    print("Answer")
    distances, indices = index.search(query_vectors, 2)
    for ind in list(unique_everseen(indices[0])):
        print("BEER NAME: ", df_beer[df_beer['id']==ind].beer_name.tolist()[0])
        print("SIMILAR REVIEW TEXT: ", df_beer[df_beer['id']==ind].review_text.tolist()[0])
        print("=="*10)
    print("=="*10)

Enter review Query? ... Press 'q' to Quit
I like a hoppy beer, but I found this a bit unbalanced. Has a great head and smell. Goes great with spaghetti.
Answer
BEER NAME:  Dead Guy Ale
SIMILAR REVIEW TEXT:  I like a hoppy beer, but I found this a bit unbalanced. Has a great head and smell. Goes great with spaghetti.
BEER NAME:  Maudite
SIMILAR REVIEW TEXT:  This beer has everything in my opinion. The appearance gives the initial feeling that this beer might be the best ever. A nice head appears, about 1.5" and the smell provides a plethora of smells, that isn't too hoppy. The alcoholic content is masked very well and doesn't sit too heavy.
Enter review Query? ... Press 'q' to Quit
q
