In [1]:
import numpy as np 
import pandas as pd 
import chardet
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

In [2]:
def find_encoding(fname):
    """Guess the character encoding of a file using chardet.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to inspect.

    Returns
    -------
    The value stored under the ``'encoding'`` key of chardet's result
    (an encoding name such as ``'UTF-8-SIG'``; presumably ``None`` when
    chardet cannot decide -- confirm against the chardet documentation).
    """
    # Use a context manager so the handle is closed deterministically,
    # even if read() raises; the bare open(...).read() left it dangling.
    with open(fname, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    return chardet.detect(raw_bytes)['encoding']


In [21]:
# Detect the character encoding of the wines CSV file.
wines_csv_encoding = find_encoding(fname='data/wines.csv')
wines_csv_encoding


'UTF-8-SIG'

In [22]:
# Detect the character encoding of the ratings CSV file.
# (This previously inspected data/wines.csv, so the ratings file was read
# with whatever encoding the wines file happened to have.)
rating_csv_encoding = find_encoding('data/ratings.csv')
rating_csv_encoding

'UTF-8-SIG'

In [23]:
# Load the wine catalogue and the user ratings, then join them into one frame.
wines_col = [
    'wine_id', 'name', 'type', 'country', 'region', 'alcohol_content', 'producer',
    'service', 'volume', 'grape', 'harvest', 'harmonization', 'image',
]
ratings_col = ['user_id', 'wine_id', 'rating']

wines = pd.read_csv(
    'data/wines.csv', sep=';', encoding=wines_csv_encoding,
    names=wines_col, engine='python',
)
ratings = pd.read_csv(
    'data/ratings.csv', sep=';', encoding=rating_csv_encoding,
    names=ratings_col, engine='python',
)

# 'wine_id' is the only column the two frames have in common, so it is the join key.
df = wines.merge(ratings, on='wine_id')
df.head()

Unnamed: 0,wine_id,name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image,user_id,rating
0,1.0,Expedicion Single Vineyard Selection Cabernet ...,Tinto,Chile,Vale Central,13.0,Finca Patagonia,17.0,750.0,Cabernet Sauvignon,2019.0,Carne Assada,Expedicion Single Vineyard Selection Cabernet ...,2,3
1,1.0,Expedicion Single Vineyard Selection Cabernet ...,Tinto,Chile,Vale Central,13.0,Finca Patagonia,17.0,750.0,Cabernet Sauvignon,2019.0,Carne Assada,Expedicion Single Vineyard Selection Cabernet ...,3,3
2,1.0,Expedicion Single Vineyard Selection Cabernet ...,Tinto,Chile,Vale Central,13.0,Finca Patagonia,17.0,750.0,Cabernet Sauvignon,2019.0,Carne Assada,Expedicion Single Vineyard Selection Cabernet ...,5,4
3,1.0,Expedicion Single Vineyard Selection Cabernet ...,Tinto,Chile,Vale Central,13.0,Finca Patagonia,17.0,750.0,Cabernet Sauvignon,2019.0,Carne Assada,Expedicion Single Vineyard Selection Cabernet ...,6,3
4,1.0,Expedicion Single Vineyard Selection Cabernet ...,Tinto,Chile,Vale Central,13.0,Finca Patagonia,17.0,750.0,Cabernet Sauvignon,2019.0,Carne Assada,Expedicion Single Vineyard Selection Cabernet ...,6,1


In [24]:
# Per-wine rating statistics: number of ratings ('size') and mean rating ('mean').
# String aggregator names produce the same 'size'/'mean' columns as
# np.size/np.mean and avoid the FutureWarning recent pandas emits when NumPy
# callables such as np.mean are passed to agg.
wines_sizes_ratings = df.groupby('wine_id').agg({'rating': ['size', 'mean']})
sizes = wines_sizes_ratings['rating']['size']
means = wines_sizes_ratings['rating']['mean']

# Keep one row per wine, then attach the statistics by wine_id.
# Mapping on the key (instead of assigning `.values` positionally) stays correct
# even if the row order differs from the sorted groupby index, and the columns
# are assigned once rather than once per loop iteration.
# NOTE: this cell rebinds `df`; re-running it without re-running the cell that
# builds `df` would recompute the statistics from the already de-duplicated frame.
df = df.drop_duplicates('wine_id')
df = df.assign(
    vote_count=df['wine_id'].map(sizes),
    vote_average=df['wine_id'].map(means),
)

# Displaying the ordered dataframe
df.sort_values(['vote_average', 'vote_count'], ascending=False).head()

Unnamed: 0,wine_id,name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image,user_id,rating,vote_count,vote_average
501,49.0,Mil Raices Selection Cabernet Sauvignon 2018,Tinto,Chile,Valle Central,13.0,7 Colores,16.0,750.0,Cabernet Sauvignon,2018.0,"Carnes Vermelhas, Queijos Amarelos, Massas",Mil Raices Selection Cabernet Sauvignon 2018,1,5,4,4.0
610,60.0,Barolo Selezione Cantine by Massimo Rattalino ...,Tinto,Italia,Piemonte,14.0,Massimo Rattalino,17.0,750.0,Nebbiolo,2014.0,"Carnes de Caça, Massas com Molhos Cremosos, Qu...",Barolo Selezione Cantine by Massimo Rattalino ...,1,5,12,3.583333
322,32.0,Masereto Montepulciano d'Abruzzo 2018,Tinto,Italia,Abruzzo,,Angelo Rocca e Fligi Srl,0.0,750.0,Montepulciano,2018.0,,Masereto Montepulciano d'Abruzzo 2018,1,5,10,3.5
385,38.0,Buenos Aires Cabernet-Malbec 2017,Tinto,Argentina,Mendoza,13.0,Fecovita,17.0,750.0,Varias Uvas,2017.0,"Churrasco, Queijos Maduros",Buenos Aires Cabernet-Malbec 2017,1,3,11,3.454545
423,41.0,Biscardo Neropasso Rosso Veneto 2017,Tinto,Italia,Veneto,13.5,Biscardo,17.0,750.0,Varias Uvas,2017.0,"Pratos Italianos, Aperitivos",Biscardo Neropasso Rosso Veneto 2017,2,4,10,3.4


In [25]:
# c: mean of the per-wine average ratings (the 'vote_average' column).
c = df.loc[:, 'vote_average'].mean()
print(c)

2.460438603194594


In [26]:
# m: minimum number of votes a wine needs in order to be listed,
# taken as the 75th percentile of 'vote_count'.
m = df.loc[:, 'vote_count'].quantile(q=0.75)
print(m)

13.0


In [27]:
# Qualified wines: those with at least m votes.
# Filter first and copy the result, so q_wines is an independent frame.
# Copying the whole of df and then slicing the copy did more work and still
# left q_wines as a selection, which can trigger SettingWithCopyWarning when
# a column is later added to it.
q_wines = df.loc[df['vote_count'] >= m].copy()
q_wines.shape


(24, 17)

In [28]:
def weighted_rating(x, m=m, c=c):
    """Return the IMDB-style weighted rating for one wine.

    Computes ``v / (v + m) * R + m / (v + m) * c`` where ``v`` is
    ``x['vote_count']`` and ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'vote_count'`` and ``'vote_average'``.
    m : number
        Vote-count weight given to the prior ``c``.
    c : number
        Prior rating that low-vote items are pulled towards.

    Note: the defaults are the values the module-level ``m`` and ``c`` had
    when this ``def`` was executed; re-run this cell after recomputing them.
    """
    votes, average = x['vote_count'], x['vote_average']
    total = votes + m
    # Calculation based on the IMDB formula
    return (votes / total * average) + (m / total * c)

In [29]:
# Add a 'score' column computed row by row with `weighted_rating()`,
# then order the qualified wines from highest to lowest score.
scores = q_wines.apply(weighted_rating, axis=1)
q_wines['score'] = scores
q_wines = q_wines.sort_values(by='score', ascending=False)
q_wines.head()





Unnamed: 0,wine_id,name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image,user_id,rating,vote_count,vote_average,score
518,52.0,Puente Sur Malbec 2018,Tinto,Argentina,Mendoza,13.0,Fecovita,17.0,750.0,Malbec,2018.0,"Churrasco, Queijos Maduros",Puente Sur Malbec 2018,1,2,13,3.307692,2.884065
830,80.0,Anciano Reserva 7 Years Old Magnum 2011,Tinto,Espanha,Valdepenãs,13.0,Anciano,0.0,750.0,Tempranillo,2011.0,,Anciano Reserva 7 Years Old Magnum 2011,1,4,18,3.166667,2.870507
396,39.0,Palacio del Burgo Rioja DOCa 2017,Tinto,Espanha,Rioja,13.0,Burgo Viejo,17.0,750.0,Tempranillo,2017.0,"Carnes Vermelhas Assadas, Embutidos",Palacio del Burgo Rioja DOCa 2017,1,2,16,3.0625,2.79261
459,45.0,Anciano Reserva Douro DOC 2016,Tinto,Portugal,Douro,13.5,Casa Santos Lima/Guy Anderson Wines,16.0,750.0,Varias Uvas,2016.0,"Carnes Vermelhas Assadas, Cozido de Grão de Bi...",Anciano Reserva Douro DOC 2016,1,2,15,3.066667,2.785204
237,24.0,Castillo D'Elaro Bobal 2018,Tinto,Espanha,Castilla-La Mancha,13.0,Bodegas Gallegas,17.0,750.0,Bobal,2018.0,,Castillo D'Elaro Bobal 2018,1,5,13,2.923077,2.691758
