In [1]:
import numpy as np 
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
artists = pd.read_csv("../data/artists.csv")

In [3]:
artists.head()

Unnamed: 0,id,name,years,genre,nationality,bio,wikipedia,paintings
0,0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193
1,1,Vasiliy Kandinskiy,1866 - 1944,Expressionism,Russian,Wassily Wassilyevich Kandinsky (Russian: Васи́...,http://en.wikipedia.org/wiki/Wassily_Kandinsky,88
2,2,Diego Rivera,1886 - 1957,Social Realism,Mexican,Diego María de la Concepción Juan Nepomuceno E...,http://en.wikipedia.org/wiki/Diego_Rivera,70
3,3,Claude Monet,1840 - 1926,Impressionism,French,Oscar-Claude Monet (; French: [klod mɔnɛ]; 14 ...,http://en.wikipedia.org/wiki/Claude_Monet,73
4,4,Rene Magritte,1898 - 1967,Impressionism,Belgian,René François Ghislain Magritte (French: [ʁəne...,http://en.wikipedia.org/wiki/René_Magritte,194


In [4]:
artists.shape

(50, 8)

In [5]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           50 non-null     int64 
 1   name         50 non-null     object
 2   years        50 non-null     object
 3   genre        50 non-null     object
 4   nationality  50 non-null     object
 5   bio          50 non-null     object
 6   wikipedia    50 non-null     object
 7   paintings    50 non-null     int64 
dtypes: int64(2), object(6)
memory usage: 3.3+ KB


In [6]:
artists.iloc[0]['bio']

"Amedeo Clemente Modigliani (Italian pronunciation: [ameˈdɛːo modiʎˈʎaːni]; 12 July 1884 – 24 January 1920) was an Italian Jewish painter and sculptor who worked mainly in France. He is known for portraits and nudes in a modern style characterized by elongation of faces, necks, and figures that were not received well during his lifetime but later found acceptance. Modigliani spent his youth in Italy, where he studied the art of antiquity and the Renaissance. In 1906 he moved to Paris, where he came into contact with such artists as Pablo Picasso and Constantin Brâncuși. By 1912 Modigliani was exhibiting highly stylized sculptures with Cubists of the Section d'Or group at the Salon d'Automne."

In [7]:
artists['tags'] = artists['genre'] + artists['nationality'] + artists['bio'] + artists['years']

In [8]:
artists = artists[['name','tags']]

In [9]:
artists['tags'] = artists['tags'].apply(lambda x:x.lower())

In [10]:
artists.head()

Unnamed: 0,name,tags
0,Amedeo Modigliani,expressionismitalianamedeo clemente modigliani...
1,Vasiliy Kandinskiy,expressionismrussianwassily wassilyevich kandi...
2,Diego Rivera,social realismmexicandiego maría de la concepc...
3,Claude Monet,impressionismfrenchoscar-claude monet (; frenc...
4,Rene Magritte,impressionismbelgianrené françois ghislain mag...


In [11]:
artists.iloc[0]['tags']

"expressionismitalianamedeo clemente modigliani (italian pronunciation: [ameˈdɛːo modiʎˈʎaːni]; 12 july 1884 – 24 january 1920) was an italian jewish painter and sculptor who worked mainly in france. he is known for portraits and nudes in a modern style characterized by elongation of faces, necks, and figures that were not received well during his lifetime but later found acceptance. modigliani spent his youth in italy, where he studied the art of antiquity and the renaissance. in 1906 he moved to paris, where he came into contact with such artists as pablo picasso and constantin brâncuși. by 1912 modigliani was exhibiting highly stylized sculptures with cubists of the section d'or group at the salon d'automne.1884 - 1920"

In [12]:
ps = PorterStemmer()

In [13]:
def stems(text):
    T = []
    
    for i in text.split():
        T.append(ps.stem(i))
    
    return " ".join(T)

In [14]:
artists['tags'] = artists['tags'].apply(stems)

In [15]:
artists.iloc[0]['tags']

"expressionismitalianamedeo clement modigliani (italian pronunciation: [ameˈdɛːo modiʎˈʎaːni]; 12 juli 1884 – 24 januari 1920) wa an italian jewish painter and sculptor who work mainli in france. he is known for portrait and nude in a modern style character by elong of faces, necks, and figur that were not receiv well dure hi lifetim but later found acceptance. modigliani spent hi youth in italy, where he studi the art of antiqu and the renaissance. in 1906 he move to paris, where he came into contact with such artist as pablo picasso and constantin brâncuși. by 1912 modigliani wa exhibit highli styliz sculptur with cubist of the section d'or group at the salon d'automne.1884 - 1920"

In [16]:

vectorizer  = CountVectorizer(max_features=3000,stop_words='english')

In [17]:
tag_matrix = vectorizer.fit_transform(artists['tags'])

In [18]:
tag_matrix[0]

<1x1701 sparse matrix of type '<class 'numpy.int64'>'
	with 62 stored elements in Compressed Sparse Row format>

In [19]:
tag_matrix.shape

(50, 1701)

In [20]:
cosine_sim = cosine_similarity(tag_matrix, tag_matrix)

In [21]:
cosine_sim.shape

(50, 50)

In [22]:
artists[artists['name'] == 'Vincent van Gogh'].index[0]

8

In [39]:
def recommend(artist_name, num_recommendations=4):
    artist_idx = artists.index[artists['name'] == artist_name].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[artist_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i[0] for i in sim_scores[1:num_recommendations+1]]  
    for idx in sim_indices:
        print(artists['name'].iloc[idx])

In [40]:
recommend('Claude Monet')

Alfred Sisley
Gustave Courbet
Paul Gauguin
Giotto di Bondone


In [41]:
import pickle

In [42]:
pickle.dump(artists,open('artists_list.pkl','wb'))
pickle.dump(cosine_sim,open('similarity.pkl','wb'))