In [186]:
import numpy as np 
import pandas as pd 
import chardet
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

In [3]:
def find_encoding(fname):
    """Guess the character encoding of a file using chardet.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to inspect. The whole file is read as raw bytes.

    Returns
    -------
    The value stored under the ``'encoding'`` key of the dictionary
    returned by ``chardet.detect``.
    """
    # Read inside a context manager so the handle is closed deterministically,
    # even if read() raises; a bare open(...).read() leaves it to the GC.
    with open(fname, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    return chardet.detect(raw_bytes)['encoding']


In [4]:
# Detect the encoding of the wines CSV file.
wines_csv_encoding = find_encoding(fname='data/wines.csv')
wines_csv_encoding


'utf-8'

In [5]:
# Detect the encoding of the ratings CSV file.
# This must inspect ratings.csv (not wines.csv): the value is later passed as
# the `encoding` argument when reading data/ratings.csv.
rating_csv_encoding = find_encoding('data/ratings.csv')
rating_csv_encoding

'utf-8'

In [154]:
# Load the wine catalogue and the user ratings, then join them into one frame.
# Neither read uses a header row: column names are supplied explicitly.
wines_col = [
    'wine_id', 'name', 'type', 'country', 'region', 'alcohol_content',
    'producer', 'service', 'volume', 'vintage', 'views',
]
wines = pd.read_csv(
    'data/wines.csv',
    sep=';',
    names=wines_col,
    encoding=wines_csv_encoding,
    engine='python',
)

ratings_col = ['user_id', 'wine_id', 'rating']
ratings = pd.read_csv(
    'data/ratings.csv',
    sep=';',
    names=ratings_col,
    encoding=rating_csv_encoding,
    engine='python',
)

# 'wine_id' is the only column the two frames share, so it is the join key.
df = wines.merge(ratings, on='wine_id')

In [156]:
# Attach each wine's number of ratings (vote_count) and mean rating
# (vote_average), keeping a single row per wine in `df`.
#
# The statistics are computed from the untouched `ratings` frame, restricted
# to wines present in `df`. Aggregating `df` itself would make this cell
# non-idempotent: once `df` is de-duplicated, a re-run would see one rating
# per wine and reset every vote_count to 1.
rated_wines = ratings[ratings['wine_id'].isin(df['wine_id'])]
# String aggregation names replace the np.size / np.mean callables; they
# produce the same ('rating', 'size') and ('rating', 'mean') columns.
wines_sizes_ratings = rated_wines.groupby('wine_id').agg({'rating': ['size', 'mean']})
sizes = wines_sizes_ratings['rating']['size']
means = wines_sizes_ratings['rating']['mean']

# One row per wine (first occurrence wins), without mutating in place.
df = df.drop_duplicates('wine_id')

# Align the statistics by wine_id rather than by position: the grouped result
# is ordered by wine_id (groupby sorts its keys by default) while `df` keeps
# its merge order, so positional assignment is only correct if the two orders
# happen to coincide. A single vectorised assignment also replaces the loop
# that rewrote both columns on every iteration.
df['vote_count'] = df['wine_id'].map(sizes)
df['vote_average'] = df['wine_id'].map(means)

# Displaying the ordered dataframe
df.sort_values(['vote_average', 'vote_count'], ascending=False).head()

In [179]:
# c: mean of the per-wine average ratings across the whole dataframe.
c = df.loc[:, 'vote_average'].mean()
print(c)

2.4764056121090685


In [180]:
# m: minimum number of votes required for a wine to be listed,
# taken as the 75th percentile of vote_count.
m = df.loc[:, 'vote_count'].quantile(q=0.75)
print(m)

13.0


In [181]:
# Qualified wines: those with at least m votes.
# Filter first, then copy, so only the selected rows are duplicated.
has_enough_votes = df['vote_count'] >= m
q_wines = df.loc[has_enough_votes].copy()
q_wines.shape


(25, 15)

In [182]:
def weighted_rating(x, m=m, c=c):
    """Compute the weighted rating of one wine (IMDB formula).

    Parameters
    ----------
    x : mapping-like
        Must provide 'vote_count' and 'vote_average' entries.
    m : number
        Vote-count weight given to the global mean; defaults to the value of
        the global ``m`` at the time this function is defined.
    c : number
        Global mean rating; defaults to the value of the global ``c`` at the
        time this function is defined.

    Returns
    -------
    (v / (v + m)) * R + (m / (m + v)) * c, with v = x['vote_count'] and
    R = x['vote_average'].
    """
    votes, average = x['vote_count'], x['vote_average']
    own_share = votes / (votes + m)
    prior_share = m / (m + votes)
    # Blend the wine's own average with the global mean.
    return (own_share * average) + (prior_share * c)

In [185]:
# Add a 'score' feature computed row by row with `weighted_rating()`,
# then order the qualified wines from best to worst score.
q_wines = (
    q_wines
    .assign(score=q_wines.apply(weighted_rating, axis=1))
    .sort_values('score', ascending=False)
)
q_wines.head()





Unnamed: 0,wine_id,name,type,country,region,alcohol_content,producer,service,volume,vintage,views,user_id,rating,vote_count,vote_average,score
522,52,Anciano Reserva Douro DOC 2016,Tinto,Portugal,Douro,13.5,Casa Santos Lima/Guy Anderson Wines,16.0,750,2016.0,46,7,4,13,3.307692,2.892049
834,80,Anciano 35 Years Old Vines Garnacha Calatayud ...,Tinto,Espanha,Calatayud,15.0,Bodegas San Gregorio - Norrel Robertson\r\n\r\n,,750,2016.0,4,7,2,18,3.166667,2.877202
400,39,Miliasso Primitivo di Manduria DOC 2017,Tinto,Italia,Puglia,14.5,Angelo Rocca e Fligi Srl,17.0,750,2017.0,13,10,5,16,3.0625,2.799768
463,45,El Molino Estate Bottled Merlot 2018,Tinto,Argentina,Mendoza,13.0,Penaflor,17.0,750,2018.0,4,6,5,15,3.066667,2.792617
241,24,Famiglia Castellani From Old Vines Primitivo 2018,Tinto,Italia,Puglia,13.0,Castellani,17.0,750,2018.0,79,1,5,13,2.923077,2.699741
