In [2]:
import numpy as np 
import pandas as pd 
import chardet

In [3]:
def find_encoding(fname):
    r_file = open(fname, 'rb').read()
    result = chardet.detect(r_file)
    charenc = result['encoding']
    return charenc


In [4]:
# Detect the enconde having in wines csv file.
wines_csv_encoding = find_encoding('data/wines.csv')
wines_csv_encoding


'UTF-8-SIG'

In [5]:
# Detect the enconde having in ratings csv file.
ratings_csv_encoding = find_encoding('data/wines.csv')
ratings_csv_encoding

'UTF-8-SIG'

In [6]:
# Detect the enconde having in users csv file.
users_csv_encoding = find_encoding('data/users.csv')
users_csv_encoding

'UTF-8-SIG'

In [7]:
# Build dataframe
wines_col = ['wine_id', 'wine_name', 'type', 'country', 'region', 'alcohol_content', 'producer', 'service', 'volume', 'grape', 'harvest', 'harmonization', 'image']
wines = pd.read_csv('data/wines.csv', sep=';', encoding=wines_csv_encoding, names=wines_col, engine='python')

ratings_col = ['user_id', 'wine_id', 'rating']
ratings = pd.read_csv('data/ratings.csv', sep=';', encoding=ratings_csv_encoding, names=ratings_col, engine='python')

# Build dataframe
users_col = ["user_id", 'user_name', 'gender', 'profession', 'age']
users = pd.read_csv('data/users.csv', sep=';', encoding=users_csv_encoding, names=users_col, engine='python')

df = pd.merge(wines, ratings)
df = pd.merge(df, users)
df.drop(columns=['type', 'country', 'region', 'alcohol_content', 'producer', 'service', 'volume', 'grape', 'harvest', 'harmonization', 'image', 'user_name', 'gender', 'profession', 'age'], axis=1, inplace=True)
df.head()


Unnamed: 0,wine_id,wine_name,user_id,rating
0,1,Abadia del Roble White La Mancha D.O,11,2
1,55,Don Simón Selección Tempranillo,11,0
2,68,Faustino Rivero Ulecia Joven Tempranillo-Garna...,11,0
3,108,Pinhal da Torre Late Harvest Tardio 2010 500 ml,11,2
4,131,Gallo Signature Series Russian River Valley Ch...,11,0


In [8]:
# Set wine's ratings and total ratings.
wines_sizes_ratings = df.groupby('wine_id').agg({'rating': [np.size, np.mean]})
sizes = wines_sizes_ratings['rating']['size']
means = wines_sizes_ratings['rating']['mean']
df.drop_duplicates('wine_id', inplace=True)
for i in range(len(wines_sizes_ratings)):
    df['vote_count'] = sizes.values
    df['vote_average'] = means.values
    
# Displaying the ordered dataframe
df.drop(columns=['user_id', 'rating'], axis=1, inplace=True)
df.sort_values(['vote_average', 'vote_count'], ascending=False).head()

Unnamed: 0,wine_id,wine_name,vote_count,vote_average
189,40,Château Graves du Privera Cru Artisan Médoc AO...,34,3.235294
101,164,El Bautismo El Criollo Rosado 2018,25,3.12
704,82,Lacryma Christi Feudi di San Gregorio Bianco d...,30,3.1
718,127,Solar das Bouças D.O.C. Vinho Verde Loureiro 2018,25,3.08
514,170,Calyptra Vivendo Reserve Rose 2018,28,3.071429


In [9]:
# Mean of average of the dataframe.
c = df['vote_average'].mean()
print(c)

2.486453584571613


In [10]:
# Now we calculate the 'm' to execute algorithm. -> m: minimum number of votes required to be listed
m = df['vote_count'].quantile(0.75)
print(m)

32.0


In [11]:
# Qualified wines, based on vote counts.
q_wines = df.copy().loc[df['vote_count'] >= m]
q_wines.shape


(45, 4)

In [12]:
def weighted_rating(x, m=m, c=c):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * c)

In [13]:
# Define a new feature 'score' and calculate its value with `weighted_rating()
q_wines['score'] = q_wines.apply(weighted_rating, axis=1)
q_wines = q_wines.sort_values('score', ascending=False)
q_wines.head()

Unnamed: 0,wine_id,wine_name,vote_count,vote_average,score
189,40,Château Graves du Privera Cru Artisan Médoc AO...,34,3.235294,2.87222
95,16,Barahonda D.O. Yecla Verdejo 2018,33,3.030303,2.762562
39,111,Portada Reserva 2016,32,2.96875,2.727602
614,80,La Mirada Tannat Tinto 2018,38,2.894737,2.708093
61,169,Tenuta Sant Antonio Scaia I.G.T. Veneto Rosato...,33,2.878788,2.685639
