In [64]:
import pandas as pd 
import chardet
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [65]:
def find_encoding(fname):
    r_file = open(fname, 'rb').read()
    result = chardet.detect(r_file)
    charenc = result['encoding']
    return charenc

In [83]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
        

In [87]:
# Detect the enconde having in wines csv file.
wines_csv_encoding = find_encoding('data/wines.csv')
wines_csv_encoding


'UTF-8-SIG'

In [88]:
# Import wines dataset
wines_col = ['wine_id', 'wine_name', 'type', 'country', 'region', 'alcohol_content', 'producer', 'service', 'volume', 'grape', 'harvest', 'harmonization', 'image']
wines = pd.read_csv('data/wines.csv', sep=';', names=wines_col, encoding=wines_csv_encoding, engine='python')
wines.head()



Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image
0,1,Abadia del Roble White La Mancha D.O,Branco,Espanha,La Mancha,120,Bodegas Ayuso,10,750,Airen,,Carnes brancas,Abadia del Roble White La Mancha D.O
1,2,Alqueve Branco 2017,Branco,Portugal,Tejo,120,Pinhal da torre,9,750,Varias uvas,2017.0,"Camarão grelhado, tomates recheados, moqueca, ...",Alqueve Branco 2017
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,Branco,Argentina,La Rioja,135,Finca Eugenio Bustos,9,750,Torrontes,2018.0,"Salada com frango grelhado, espetinho de camar...",Altivo Vineyard Selection La Rioja Torrontes 2018
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,Tinto,Espanha,Calatayud,150,Bodegas San Gregorio - Norrel Robertson,12,750,Garnacha,2016.0,,Anciano 35 Years Old Vines Garnacha Calatayud ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,Tinto,Espanha,Valdepenãs,125,Anciano,12,750,Tempranillo,2014.0,,Anciano Crianza 3 years Tempranillo Valdepeñas...


In [89]:
wines['harmonization'] = wines['harmonization'].fillna('')
wines.head()

Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image
0,1,Abadia del Roble White La Mancha D.O,Branco,Espanha,La Mancha,120,Bodegas Ayuso,10,750,Airen,,Carnes brancas,Abadia del Roble White La Mancha D.O
1,2,Alqueve Branco 2017,Branco,Portugal,Tejo,120,Pinhal da torre,9,750,Varias uvas,2017.0,"Camarão grelhado, tomates recheados, moqueca, ...",Alqueve Branco 2017
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,Branco,Argentina,La Rioja,135,Finca Eugenio Bustos,9,750,Torrontes,2018.0,"Salada com frango grelhado, espetinho de camar...",Altivo Vineyard Selection La Rioja Torrontes 2018
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,Tinto,Espanha,Calatayud,150,Bodegas San Gregorio - Norrel Robertson,12,750,Garnacha,2016.0,,Anciano 35 Years Old Vines Garnacha Calatayud ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,Tinto,Espanha,Valdepenãs,125,Anciano,12,750,Tempranillo,2014.0,,Anciano Crianza 3 years Tempranillo Valdepeñas...


In [90]:
#Define a TF-IDF Vectorizer Object.
tfidf = TfidfVectorizer(stop_words=['portuguese', 'inglish'])

In [91]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(wines['harmonization'])
tfidf_matrix.shape

(181, 433)

In [92]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)



In [93]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(wines.index, index=wines['wine_name']).drop_duplicates()
wines['harmonization'] = wines['harmonization'].apply(clean_data)

In [94]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(wine_name, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[wine_name]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    wines_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return wines.iloc[wines_indices]

In [95]:
get_recommendations('Don Simón Selección Tempranillo', cosine_sim)



Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image
112,113,Premium Chianti DOCG 2017,Tinto,Italia,Toscana,120,Castellani,17,750,"Sangiovese, Canaiolo",2017.0,"espagueteaomolhosugo,lasanhaabolonhesa",Premium Chianti DOCG 2017
135,136,Trio Vecchio Podere Chianti,Tinto,Italia,Toscana,120,Cantine Cecconi,17,750,Varias uvas,2017.0,"espagueteaomolhosugo,lasanhaabolonhesa",Trio Vecchio Podere Chianti
141,142,Vecchio Podere Cantine Cecconi Chianti DOCG 2017,Tinto,Italia,Toscana,120,Cantine Cecconi,17,750,Varias uvas,2017.0,"espagueteaomolhosugo,lasanhaabolonhesa",Vecchio Podere Cantine Cecconi Chianti DOCG 2017
59,60,El Molino Malbec 2019,Tinto,Argentina,Mendonza,130,Grupo Penaflor,16,750,Malbec,2019.0,"churrasco,lasanhaabolonhesa",El Molino Malbec 2019
63,64,Famiglia Castellani Chianti Riserva DOCG 2015,Tinto,Italia,Toscana,125,Castellani,17,750,"Sangiovese, Canaiolo, Cabernet Sauvignon",2015.0,"lasanhaabolonhesa,carnesdecaça",Famiglia Castellani Chianti Riserva DOCG 2015
104,105,Palacio del Burgo Rioja DOCa 2017,Tinto,Espanha,Rioja,130,Burgo Viejo,17,750,Tempranillo,2017.0,"carnesvermelhasassadas,embutidos",Palacio del Burgo Rioja DOCa 2017
22,23,Bellamico Montepulciano d'Abruzzo DOC 2018,Tinto,Italia,Abruzzo,120,Angelo Rocca e Fligi Srl,16,750,Montepulciano,2018.0,"carnesdeporco,molhoencorpadoseembutidos",Bellamico Montepulciano d'Abruzzo DOC 2018
11,12,Atardecer de Los Andes Red Blend 2019,Tinto,Argentina,Argentina,130,Fecovita,12,750,Varias uvas,2019.0,"pizzas,massascommolhosdetomate,embutidos",Atardecer de Los Andes Red Blend 2019
146,147,Viñapeña Tempranillo,Tinto,Espanha,Multiregional,120,J.Garcia Carrion,17,750,Tempranillo,,"pizzasvariadas,tábuadeembutidos",Viñapeña Tempranillo
7,8,Anciano Reserva Douro DOC 2016,Tinto,Portugal,Douro,135,Casa Santos Lima/Guy Anderson Wines,16,750,Varias uvas,2016.0,"carnesvermelhasassadas,cozidodegrãodebicocomco...",Anciano Reserva Douro DOC 2016


In [76]:
features = ['type', 'country', 'region', 'producer', 'grape']
for feature in features:
    wines[feature] = wines[feature].apply(clean_data)
wines.head()

Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image
0,1,Abadia del Roble White La Mancha D.O,branco,espanha,lamancha,120,bodegasayuso,10,750,airen,,Carnes brancas,Abadia del Roble White La Mancha D.O
1,2,Alqueve Branco 2017,branco,portugal,tejo,120,pinhaldatorre,9,750,variasuvas,2017.0,"Camarão grelhado, tomates recheados, moqueca, ...",Alqueve Branco 2017
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,branco,argentina,larioja,135,fincaeugeniobustos,9,750,torrontes,2018.0,"Salada com frango grelhado, espetinho de camar...",Altivo Vineyard Selection La Rioja Torrontes 2018
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,tinto,espanha,calatayud,150,bodegassangregorio-norrelrobertson,12,750,garnacha,2016.0,,Anciano 35 Years Old Vines Garnacha Calatayud ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,tinto,espanha,valdepenãs,125,anciano,12,750,tempranillo,2014.0,,Anciano Crianza 3 years Tempranillo Valdepeñas...


In [77]:
def create_soup(x):
    return ' '.join(x['type']) + ' ' + ' '.join(x['country']) + ' ' + x['region'] + ' ' + ' '.join(x['producer']) + ' ' + ' '.join(x['grape']) + ' ' + ' '.join(x['harmonization'])



In [78]:
# Create a new soup feature
wines['soup'] = wines.apply(create_soup, axis=1)
wines.head()

Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image,soup
0,1,Abadia del Roble White La Mancha D.O,branco,espanha,lamancha,120,bodegasayuso,10,750,airen,,Carnes brancas,Abadia del Roble White La Mancha D.O,b r a n c o e s p a n h a lamancha b o d e g a...
1,2,Alqueve Branco 2017,branco,portugal,tejo,120,pinhaldatorre,9,750,variasuvas,2017.0,"Camarão grelhado, tomates recheados, moqueca, ...",Alqueve Branco 2017,b r a n c o p o r t u g a l tejo p i n h a l d...
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,branco,argentina,larioja,135,fincaeugeniobustos,9,750,torrontes,2018.0,"Salada com frango grelhado, espetinho de camar...",Altivo Vineyard Selection La Rioja Torrontes 2018,b r a n c o a r g e n t i n a larioja f i n c ...
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,tinto,espanha,calatayud,150,bodegassangregorio-norrelrobertson,12,750,garnacha,2016.0,,Anciano 35 Years Old Vines Garnacha Calatayud ...,t i n t o e s p a n h a calatayud b o d e g a ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,tinto,espanha,valdepenãs,125,anciano,12,750,tempranillo,2014.0,,Anciano Crianza 3 years Tempranillo Valdepeñas...,t i n t o e s p a n h a valdepenãs a n c i a n...


In [79]:
# Import CountVectorizer and create the count matrix
count = CountVectorizer(stop_words=['portuguese', 'english'])
count_matrix = count.fit_transform(wines['soup'])

In [80]:
# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [81]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = wines.reset_index()
indices = pd.Series(metadata.index, index=metadata['wine_name'])

In [82]:
get_recommendations('Don Simón Selección Tempranillo', cosine_sim2)


Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image,soup
34,35,Castillo D'Elaro Bobal 2018,tinto,espanha,castilla-lamancha,130,bodegasgallegas,17,750,bobal,2018.0,,Castillo D'Elaro Bobal 2018,t i n t o e s p a n h a castilla-lamancha b o ...
54,55,Don Simón Selección Tempranillo,tinto,espanha,castilla-lamancha,120,j.garciacarrion,16,750,tempranillo,,"Embutidos, espaguete a bolonhesa",Don Simón Selección Tempranillo,t i n t o e s p a n h a castilla-lamancha j . ...
106,107,Pegaso Verdejo 2018,branco,espanha,castilla-lamancha,125,bodegasluisgurpeguimuga,16,750,verdejo,2018.0,"Peixe grelhado, frutos do mar, pimentoes reche...",Pegaso Verdejo 2018,b r a n c o e s p a n h a castilla-lamancha b ...
154,155,Sovento Chardonnay 2018,branco,espanha,castilla-lamancha,115,bodegasfernandocastro,7,750,chardonnay,2018.0,"Salmao, peixes com molho, aves",Sovento Chardonnay 2018,b r a n c o e s p a n h a castilla-lamancha b ...
155,156,Sovento Sauvignon Blanc 2018,branco,espanha,castilla-lamancha,115,bodegasfernandocastro,7,750,sauvignonblanc,2018.0,"Entradas leves, saladas, peixes",Sovento Sauvignon Blanc 2018,b r a n c o e s p a n h a castilla-lamancha b ...
0,1,Abadia del Roble White La Mancha D.O,branco,espanha,lamancha,120,bodegasayuso,10,750,airen,,Carnes brancas,Abadia del Roble White La Mancha D.O,b r a n c o e s p a n h a lamancha b o d e g a...
35,36,Castillo de Calatrava Tempranillo Crianza La M...,tinto,espanha,lamancha,130,vinicoladecastilla,17,750,tempranillo,2016.0,"Churrasco, queijos amarelos",Castillo de Calatrava Tempranillo Crianza La M...,t i n t o e s p a n h a lamancha v i n i c o l...
101,102,Montefrio Airen La Mancha D.O,branco,espanha,lamancha,110,felixsolis,12,750,airen,,"Frutos do mar, tabua de queijos",Montefrio Airen La Mancha D.O,b r a n c o e s p a n h a lamancha f e l i x s...
1,2,Alqueve Branco 2017,branco,portugal,tejo,120,pinhaldatorre,9,750,variasuvas,2017.0,"Camarão grelhado, tomates recheados, moqueca, ...",Alqueve Branco 2017,b r a n c o p o r t u g a l tejo p i n h a l d...
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,branco,argentina,larioja,135,fincaeugeniobustos,9,750,torrontes,2018.0,"Salada com frango grelhado, espetinho de camar...",Altivo Vineyard Selection La Rioja Torrontes 2018,b r a n c o a r g e n t i n a larioja f i n c ...
