In [25]:
import pandas as pd 
import chardet
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pymysql

In [24]:
def find_encoding(fname):
    r_file = open(fname, 'rb').read()
    result = chardet.detect(r_file)
    charenc = result['encoding']
    return charenc

In [23]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
        

In [26]:
# This method takes the database connection.
conn = pymysql.connect(host='localhost', port=int(3306), user='yabaconsultoria', passwd='yaba2389', db="livedwine",
                           charset='utf8')
wines = pd.read_sql_query("SELECT * FROM wine", conn)
conn.close()
wines.head()


Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image
0,1,Abadia del Roble White La Mancha D.O,Branco,Espanha,La Mancha,12.0,Bodegas Ayuso,10,750,Airen,0,Carnes brancas,Abadia del Roble White La Mancha D.O\r
1,2,Alqueve Branco 2017,Branco,Portugal,Tejo,12.0,Pinhal da torre,9,750,Varias uvas,2017,"Camarão grelhado, tomates recheados, moqueca, ...",Alqueve Branco 2017\r
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,Branco,Argentina,La Rioja,13.5,Finca Eugenio Bustos,9,750,Torrontes,2018,"Salada com frango grelhado, espetinho de camar...",Altivo Vineyard Selection La Rioja Torrontes 2...
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,Tinto,Espanha,Calatayud,15.0,Bodegas San Gregorio - Norrel Robertson,12,750,Garnacha,2016,,Anciano 35 Years Old Vines Garnacha Calatayud ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,Tinto,Espanha,Valdepenãs,12.5,Anciano,12,750,Tempranillo,2014,,Anciano Crianza 3 years Tempranillo Valdepeñas...


In [27]:
wines['harmonization'] = wines['harmonization'].fillna('')
wines.head()

Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image
0,1,Abadia del Roble White La Mancha D.O,Branco,Espanha,La Mancha,12.0,Bodegas Ayuso,10,750,Airen,0,Carnes brancas,Abadia del Roble White La Mancha D.O\r
1,2,Alqueve Branco 2017,Branco,Portugal,Tejo,12.0,Pinhal da torre,9,750,Varias uvas,2017,"Camarão grelhado, tomates recheados, moqueca, ...",Alqueve Branco 2017\r
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,Branco,Argentina,La Rioja,13.5,Finca Eugenio Bustos,9,750,Torrontes,2018,"Salada com frango grelhado, espetinho de camar...",Altivo Vineyard Selection La Rioja Torrontes 2...
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,Tinto,Espanha,Calatayud,15.0,Bodegas San Gregorio - Norrel Robertson,12,750,Garnacha,2016,,Anciano 35 Years Old Vines Garnacha Calatayud ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,Tinto,Espanha,Valdepenãs,12.5,Anciano,12,750,Tempranillo,2014,,Anciano Crianza 3 years Tempranillo Valdepeñas...


In [28]:
#Define a TF-IDF Vectorizer Object.
tfidf = TfidfVectorizer(stop_words=['portuguese', 'inglish'])

In [29]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(wines['harmonization'])
tfidf_matrix.shape

(181, 433)

In [30]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.25928832 ... 0.08518523 0.05132592 0.50185261]
 [0.         0.25928832 1.         ... 0.06224578 0.1353054  0.08961693]
 ...
 [0.         0.08518523 0.06224578 ... 1.         0.04425497 0.17661701]
 [0.         0.05132592 0.1353054  ... 0.04425497 1.         0.05478699]
 [0.         0.50185261 0.08961693 ... 0.17661701 0.05478699 1.        ]]


In [31]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(wines.index, index=wines['wine_name']).drop_duplicates()
wines['harmonization'] = wines['harmonization'].apply(clean_data)

In [41]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(wine_name, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[wine_name]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    wines_indices = [i[0] for i in sim_scores]
    print(wines_indices)

    # Return the top 10 most similar movies
    return wines.iloc[wines_indices]

In [33]:
get_recommendations('Don Simón Selección Tempranillo', cosine_sim)



AttributeError: 'list' object has no attribute 'head'

In [34]:
features = ['type', 'country', 'region', 'producer', 'grape']
for feature in features:
    wines[feature] = wines[feature].apply(clean_data)
wines.head()

Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image
0,1,Abadia del Roble White La Mancha D.O,branco,espanha,lamancha,12.0,bodegasayuso,10,750,airen,0,carnesbrancas,Abadia del Roble White La Mancha D.O\r
1,2,Alqueve Branco 2017,branco,portugal,tejo,12.0,pinhaldatorre,9,750,variasuvas,2017,"camarãogrelhado,tomatesrecheados,moqueca,talha...",Alqueve Branco 2017\r
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,branco,argentina,larioja,13.5,fincaeugeniobustos,9,750,torrontes,2018,"saladacomfrangogrelhado,espetinhodecamarao,tom...",Altivo Vineyard Selection La Rioja Torrontes 2...
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,tinto,espanha,calatayud,15.0,bodegassangregorio-norrelrobertson,12,750,garnacha,2016,none,Anciano 35 Years Old Vines Garnacha Calatayud ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,tinto,espanha,valdepenãs,12.5,anciano,12,750,tempranillo,2014,none,Anciano Crianza 3 years Tempranillo Valdepeñas...


In [35]:
def create_soup(x):
    return ' '.join(x['type']) + ' ' + ' '.join(x['country']) + ' ' + x['region'] + ' ' + ' '.join(x['producer']) + ' ' + ' '.join(x['grape']) + ' ' + ' '.join(x['harmonization'])



In [36]:
# Create a new soup feature
wines['soup'] = wines.apply(create_soup, axis=1)
wines.head()

Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image,soup
0,1,Abadia del Roble White La Mancha D.O,branco,espanha,lamancha,12.0,bodegasayuso,10,750,airen,0,carnesbrancas,Abadia del Roble White La Mancha D.O\r,b r a n c o e s p a n h a lamancha b o d e g a...
1,2,Alqueve Branco 2017,branco,portugal,tejo,12.0,pinhaldatorre,9,750,variasuvas,2017,"camarãogrelhado,tomatesrecheados,moqueca,talha...",Alqueve Branco 2017\r,b r a n c o p o r t u g a l tejo p i n h a l d...
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,branco,argentina,larioja,13.5,fincaeugeniobustos,9,750,torrontes,2018,"saladacomfrangogrelhado,espetinhodecamarao,tom...",Altivo Vineyard Selection La Rioja Torrontes 2...,b r a n c o a r g e n t i n a larioja f i n c ...
3,4,Anciano 35 Years Old Vines Garnacha Calatayud ...,tinto,espanha,calatayud,15.0,bodegassangregorio-norrelrobertson,12,750,garnacha,2016,none,Anciano 35 Years Old Vines Garnacha Calatayud ...,t i n t o e s p a n h a calatayud b o d e g a ...
4,5,Anciano Crianza 3 years Tempranillo Valdepeñas...,tinto,espanha,valdepenãs,12.5,anciano,12,750,tempranillo,2014,none,Anciano Crianza 3 years Tempranillo Valdepeñas...,t i n t o e s p a n h a valdepenãs a n c i a n...


In [37]:
# Import CountVectorizer and create the count matrix
count = CountVectorizer(stop_words=['portuguese', 'english'])
count_matrix = count.fit_transform(wines['soup'])

In [38]:
# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [39]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = wines.reset_index()
indices = pd.Series(metadata.index, index=metadata['wine_name'])

In [42]:
get_recommendations('Don Simón Selección Tempranillo', cosine_sim2)


[34, 54, 106, 154, 155, 0, 35, 101, 1, 2]


Unnamed: 0,wine_id,wine_name,type,country,region,alcohol_content,producer,service,volume,grape,harvest,harmonization,image,soup
34,35,Castillo D'Elaro Bobal 2018,tinto,espanha,castilla-lamancha,13.0,bodegasgallegas,17,750,bobal,2018,none,Castillo D'Elaro Bobal 2018\r,t i n t o e s p a n h a castilla-lamancha b o ...
54,55,Don Simón Selección Tempranillo,tinto,espanha,castilla-lamancha,12.0,j.garciacarrion,16,750,tempranillo,0,"embutidos,espagueteabolonhesa",Don Simón Selección Tempranillo\r,t i n t o e s p a n h a castilla-lamancha j . ...
106,107,Pegaso Verdejo 2018,branco,espanha,castilla-lamancha,12.5,bodegasluisgurpeguimuga,16,750,verdejo,2018,"peixegrelhado,frutosdomar,pimentoesrecheados",Pegaso Verdejo 2018\r,b r a n c o e s p a n h a castilla-lamancha b ...
154,155,Sovento Chardonnay 2018,branco,espanha,castilla-lamancha,11.5,bodegasfernandocastro,7,750,chardonnay,2018,"salmao,peixescommolho,aves",Sovento Chardonnay 2018\r,b r a n c o e s p a n h a castilla-lamancha b ...
155,156,Sovento Sauvignon Blanc 2018,branco,espanha,castilla-lamancha,11.5,bodegasfernandocastro,7,750,sauvignonblanc,2018,"entradasleves,saladas,peixes",Sovento Sauvignon Blanc 2018\r,b r a n c o e s p a n h a castilla-lamancha b ...
0,1,Abadia del Roble White La Mancha D.O,branco,espanha,lamancha,12.0,bodegasayuso,10,750,airen,0,carnesbrancas,Abadia del Roble White La Mancha D.O\r,b r a n c o e s p a n h a lamancha b o d e g a...
35,36,Castillo de Calatrava Tempranillo Crianza La M...,tinto,espanha,lamancha,13.0,vinicoladecastilla,17,750,tempranillo,2016,"churrasco,queijosamarelos",Castillo de Calatrava Tempranillo Crianza La M...,t i n t o e s p a n h a lamancha v i n i c o l...
101,102,Montefrio Airen La Mancha D.O,branco,espanha,lamancha,11.0,felixsolis,12,750,airen,0,"frutosdomar,tabuadequeijos",Montefrio Airen La Mancha D.O\r,b r a n c o e s p a n h a lamancha f e l i x s...
1,2,Alqueve Branco 2017,branco,portugal,tejo,12.0,pinhaldatorre,9,750,variasuvas,2017,"camarãogrelhado,tomatesrecheados,moqueca,talha...",Alqueve Branco 2017\r,b r a n c o p o r t u g a l tejo p i n h a l d...
2,3,Altivo Vineyard Selection La Rioja Torrontes 2018,branco,argentina,larioja,13.5,fincaeugeniobustos,9,750,torrontes,2018,"saladacomfrangogrelhado,espetinhodecamarao,tom...",Altivo Vineyard Selection La Rioja Torrontes 2...,b r a n c o a r g e n t i n a larioja f i n c ...
