In [102]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from scipy.stats.stats import pearsonr
from scipy.spatial.distance import cdist
from itertools import islice
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import nltk
import seaborn as sns
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
# Notebook display limits: show up to 50 columns and up to 500 characters per cell value.
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 500
import os

In [103]:
import googletrans
from googletrans import Translator
import time
import re

In [104]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS so the notebook container takes 80% of the browser width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [105]:
# Folder (relative to the working directory) that holds the raw and the cleaned datasets.
DATA_FOLDER = "data"
# Folder name with a trailing slash, so file names can be appended directly.
data_path = os.path.join(DATA_FOLDER) + "/"

# Create the folder when it is missing. `exist_ok=True` replaces the previous
# "list the directory, then mkdir" check, which was racy and was satisfied by a
# plain *file* called "data"; in that case this call now fails right away.
os.makedirs(DATA_FOLDER, exist_ok=True)

# Read CSV

In [109]:
# Load the raw dataset; the first CSV column is used as the index.
df = pd.read_csv(f"{data_path}Beer_dataset_raw.csv", index_col=0)

In [110]:
# Number of missing values in each column of the raw data.
df.isnull().sum()

Name            0
Style           0
Description    66
dtype: int64

In [111]:
# Preview the first five raw records.
df.head(n=5)

Unnamed: 0,Name,Style,Description
0,La Vella Caravana / Black Bitch Caravan of Bitches,Witbier - 5.2 º - 29 IBU,"Caravan of Bitches\nHoppy Wheat Ale\nEstilo tradicional elaborado con malta pilsner, trigo y avena. Este estilo se caracteriza por las adiciones de cascara de naranja amarga y semillas de cilantro."
1,BlackLab Sour Lady,Berliner Weisse - 5 º,Sour Lady\nBerliner Weisse with fruit\nUna cerveza ácida con frambuesas.\n---------------\nSour beer with raspberries
2,Blacklab Claudia Passion,American IPA,Claudia Passion\nFruit IPA\nNuestra mejor cerveza con Maracuyá\n--------------\nOur best IPA with passion fruit
3,BlackLab More Sparkle,Brut IPA - 7 º,More Sparkle\nBrut IPA\nDry and aromatic. Low bitterness.\n-----------\nMuy seca y aromática. Poco amargor.
4,Greyhound Brewers Give ’em Hops,Imperial IPA - 8 º,Give ’em Hops\nDoble NEIPA


# Style

In [112]:
# "Style" holds up to three ' - '-separated fields; spread them over Style, ABV and IBU.
style_fields = df['Style'].str.split(' - ', n=2, expand=True)
df[['Style', 'ABV', 'IBU']] = style_fields

In [113]:
# Check the result of the split: Style, ABV and IBU are now separate columns.
df.head(n=5)

Unnamed: 0,Name,Style,Description,ABV,IBU
0,La Vella Caravana / Black Bitch Caravan of Bitches,Witbier,"Caravan of Bitches\nHoppy Wheat Ale\nEstilo tradicional elaborado con malta pilsner, trigo y avena. Este estilo se caracteriza por las adiciones de cascara de naranja amarga y semillas de cilantro.",5.2 º,29 IBU
1,BlackLab Sour Lady,Berliner Weisse,Sour Lady\nBerliner Weisse with fruit\nUna cerveza ácida con frambuesas.\n---------------\nSour beer with raspberries,5 º,
2,Blacklab Claudia Passion,American IPA,Claudia Passion\nFruit IPA\nNuestra mejor cerveza con Maracuyá\n--------------\nOur best IPA with passion fruit,,
3,BlackLab More Sparkle,Brut IPA,More Sparkle\nBrut IPA\nDry and aromatic. Low bitterness.\n-----------\nMuy seca y aromática. Poco amargor.,7 º,
4,Greyhound Brewers Give ’em Hops,Imperial IPA,Give ’em Hops\nDoble NEIPA,8 º,


# ABV

In [114]:
# Raw ABV values are still strings carrying unit symbols.
df["ABV"].head(n=5)

0    5.2 º
1      5 º
2     None
3      7 º
4      8 º
Name: ABV, dtype: object

## Replace strange symbols

In [115]:
# Strip unit markers and whitespace so that only the numeric part of the ABV remains.
# - regex=False makes every pattern a literal: "vol." contains a regex metacharacter
#   and the default of `regex` differs between pandas major versions.
# - The non-breaking space (\xa0) is removed instead of being turned into a regular
#   space, which would re-introduce the spaces stripped by the previous pattern.
for unit_marker in (" º", " ", "\xa0", "%", "vol.", "ABV"):
    df.ABV = df.ABV.str.replace(unit_marker, "", regex=False)

In [116]:
# Drop the rows whose ABV field holds one of these non-numeric labels
# (original note: remove "Hidromiel" [mead] and "Sidra" [cider]).
# Rows with a missing ABV are kept, as before.
non_numeric_abv_labels = [
    "Applewine",
    "Hidromiel",
    "OtherSpecialtyCider/Perry",
    "FruitCider",
    "CTZ.Amarillo.Mosaic",
]
df = df[~df["ABV"].isin(non_numeric_abv_labels)]

In [117]:
# Drop the records that carry no ABV value, only an IBU one (13 records according
# to the original note). Missing ABV values do not match and are kept.
abv_holds_ibu = df["ABV"].str.contains("IBU", na=False)
df = df.loc[~abv_holds_ibu]

In [118]:
# Keep only the part before the first "-" (e.g. the first value of an "a-b" entry).
abv_dash_parts = df["ABV"].str.split("-", n=1, expand=True)
df["ABV"] = abv_dash_parts[0]

In [119]:
# Collapse a run of two or more dots (e.g. "5..2") into a single decimal point.
# The pattern is a raw string: the previous "\.\." relied on Python passing an
# invalid escape sequence through unchanged, which newer interpreters warn about.
df.ABV = df.ABV.str.replace(r"\.{2,}", ".", regex=True)
# Remove stray apostrophe / acute-accent characters (literal matches).
df.ABV = df.ABV.str.replace("’", "", regex=False)
df.ABV = df.ABV.str.replace("´", "", regex=False)

In [120]:
# Keep only the part before the first "/" (e.g. the first value of an "a/b" entry).
abv_slash_parts = df["ABV"].str.split("/", n=1, expand=True)
df["ABV"] = abv_slash_parts[0]

In [121]:
# How many records have no ABV at this point.
df["ABV"].isna().sum()

185

In [122]:
# Convert the cleaned ABV strings to floats; raises if any non-numeric text is left.
df["ABV"] = df["ABV"].astype(float)

## Clean outliers

In [123]:
# Values below 30 are kept; everything else is divided by 10
# (presumably entries that lost their decimal separator, e.g. 52 for 5.2 — TODO confirm).
# Missing values stay missing.
df["ABV"] = df["ABV"].where(df["ABV"] < 30, df["ABV"] / 10)

# IBU

In [124]:
# Strip the unit label and all spaces, then turn decimal commas into dots so the
# values can be parsed as floats.
# - The space removal now goes through `.str.replace`; the previous `Series.replace`
#   only matched cells that were exactly " " and left embedded spaces untouched.
# - The non-breaking space (\xa0) is removed rather than converted to a regular space.
# - regex=False keeps every pattern literal regardless of the pandas version default.
df.IBU = df.IBU.str.replace("IBU", "", regex=False)
df.IBU = df.IBU.str.replace(" ", "", regex=False)
df.IBU = df.IBU.str.replace("\xa0", "", regex=False)
df.IBU = df.IBU.str.replace(",", ".", regex=False)

In [125]:
# Keep only the text before the first "-" and then before the first "/",
# i.e. the first value when several are given.
for separator in ("-", "/"):
    ibu_parts = df["IBU"].str.split(separator, n=1, expand=True)
    df["IBU"] = ibu_parts[0]

In [126]:
# Convert the cleaned IBU strings to floats; raises if any non-numeric text is left.
df["IBU"] = df["IBU"].astype(float)

# Name, Brewery and Rating

In [127]:
# Collapse runs of the repeated "star" token inside Name into a single "star",
# trying the longest run first (5, then 4, 3 and 2 repetitions).
for repeat_count in (5, 4, 3, 2):
    df.Name = df.Name.str.replace("star" * repeat_count, "star")

In [128]:
# Replace every run of "star_half" / "star_border" tokens in Name with the "/foo/"
# marker that the next cell splits on. The order is significant: longer runs are
# substituted before the shorter runs they contain.
_half, _border = "star_half", "star_border"
for icon_run in (
    _border * 5,
    _half + _border * 4,
    _half + _border * 3,
    _border * 4,
    _border * 3,
    _half + _border * 2,
    _border * 2,
    _half + _border,
    _border,
    _half,
):
    df.Name = df.Name.str.replace(icon_run, "/foo/")
# A lone "star" surrounded by spaces becomes a marker as well (spaces preserved).
df.Name = df.Name.str.replace(" star ", " /foo/ ")

In [129]:
# Text before the first "/foo/" marker stays in Name; whatever follows becomes Rating.
name_and_rating = df["Name"].str.split('/foo/', n=1, expand=True)
df[['Name', 'Rating']] = name_and_rating

In [130]:
# Ratings use a decimal comma; switch to a dot so they can be parsed as floats.
df["Rating"] = df["Rating"].str.replace(",", ".")

In [131]:
# Convert the rating strings to floats; raises if any non-numeric text is left.
df["Rating"] = df["Rating"].astype(float)

In [132]:
# Preview with numeric ABV / IBU and the new Rating column.
df.head(n=5)

Unnamed: 0,Name,Style,Description,ABV,IBU,Rating
0,La Vella Caravana / Black Bitch Caravan of Bitches,Witbier,"Caravan of Bitches\nHoppy Wheat Ale\nEstilo tradicional elaborado con malta pilsner, trigo y avena. Este estilo se caracteriza por las adiciones de cascara de naranja amarga y semillas de cilantro.",5.2,29.0,
1,BlackLab Sour Lady,Berliner Weisse,Sour Lady\nBerliner Weisse with fruit\nUna cerveza ácida con frambuesas.\n---------------\nSour beer with raspberries,5.0,,
2,Blacklab Claudia Passion,American IPA,Claudia Passion\nFruit IPA\nNuestra mejor cerveza con Maracuyá\n--------------\nOur best IPA with passion fruit,,,
3,BlackLab More Sparkle,Brut IPA,More Sparkle\nBrut IPA\nDry and aromatic. Low bitterness.\n-----------\nMuy seca y aromática. Poco amargor.,7.0,,
4,Greyhound Brewers Give ’em Hops,Imperial IPA,Give ’em Hops\nDoble NEIPA,8.0,,


In [133]:
# df["Test"] = df.Name.str.split('/')

In [134]:
# df["Name"] = df.Test.apply(lambda x: x[-1] if len(x)>1 else x[0])
# df["Brewery"] = df.Test.apply(lambda x: x[0:-1] if len(x)>1 else None)

In [135]:
# Trim leading and trailing whitespace from the beer name.
df["Name"] = df["Name"].str.strip()

# df.Brewery = df.Brewery.str.rstrip()
# df.Brewery = df.Brewery.str.lstrip()

# Reordering

In [136]:
# Final column order. A 'Brewery' column is not produced at the moment (its
# extraction is commented out above), so it is not part of the list.
columns = ['Name', 'Style', 'ABV', 'IBU', 'Rating', 'Description']

df = df.loc[:, columns]

In [137]:
# Preview with the columns in their final order.
df.head(n=5)

Unnamed: 0,Name,Style,ABV,IBU,Rating,Description
0,La Vella Caravana / Black Bitch Caravan of Bitches,Witbier,5.2,29.0,,"Caravan of Bitches\nHoppy Wheat Ale\nEstilo tradicional elaborado con malta pilsner, trigo y avena. Este estilo se caracteriza por las adiciones de cascara de naranja amarga y semillas de cilantro."
1,BlackLab Sour Lady,Berliner Weisse,5.0,,,Sour Lady\nBerliner Weisse with fruit\nUna cerveza ácida con frambuesas.\n---------------\nSour beer with raspberries
2,Blacklab Claudia Passion,American IPA,,,,Claudia Passion\nFruit IPA\nNuestra mejor cerveza con Maracuyá\n--------------\nOur best IPA with passion fruit
3,BlackLab More Sparkle,Brut IPA,7.0,,,More Sparkle\nBrut IPA\nDry and aromatic. Low bitterness.\n-----------\nMuy seca y aromática. Poco amargor.
4,Greyhound Brewers Give ’em Hops,Imperial IPA,8.0,,,Give ’em Hops\nDoble NEIPA


In [138]:
# df.Brewery.notnull().sum()

# Duplicates

In [139]:
# The ten most frequent names; any count above 1 is a duplicated beer name.
df["Name"].value_counts().iloc[:10]

Tacoa Golden Ale                        2
Cata Craft Pumpk Not Died               2
Catalan Brewery Beach Bunny             2
Microbombolla Maguau                    2
Antiga La Pica en Flandes               1
La Pirata / Edge Brewing Bee’s Knees    1
Yakka / El Barbas Wood Works            1
Quer Jai Alai                           1
Hope Fil·loxera                         1
CESC Marion                             1
Name: Name, dtype: int64

In [140]:
# df.Brewery.value_counts().head(10)

In [141]:
# Drop some duplicates after checking which one we want

# df = df.drop(4359)
# df = df.drop(679)
# df = df.drop(4464)
# df = df.drop(3831)

In [142]:
# Renumber the rows 0..n-1 after the filtering above, discarding the old index.
df = df.reset_index(drop=True)

In [143]:
# Final preview before exporting.
df.head(n=5)

Unnamed: 0,Name,Style,ABV,IBU,Rating,Description
0,La Vella Caravana / Black Bitch Caravan of Bitches,Witbier,5.2,29.0,,"Caravan of Bitches\nHoppy Wheat Ale\nEstilo tradicional elaborado con malta pilsner, trigo y avena. Este estilo se caracteriza por las adiciones de cascara de naranja amarga y semillas de cilantro."
1,BlackLab Sour Lady,Berliner Weisse,5.0,,,Sour Lady\nBerliner Weisse with fruit\nUna cerveza ácida con frambuesas.\n---------------\nSour beer with raspberries
2,Blacklab Claudia Passion,American IPA,,,,Claudia Passion\nFruit IPA\nNuestra mejor cerveza con Maracuyá\n--------------\nOur best IPA with passion fruit
3,BlackLab More Sparkle,Brut IPA,7.0,,,More Sparkle\nBrut IPA\nDry and aromatic. Low bitterness.\n-----------\nMuy seca y aromática. Poco amargor.
4,Greyhound Brewers Give ’em Hops,Imperial IPA,8.0,,,Give ’em Hops\nDoble NEIPA


# Export dataset

In [144]:
# Write the cleaned dataset to the data folder (the index is written as the first column).
df.to_csv(f"{data_path}Beer_dataset_v1.csv")