In [11]:
import pandas as pd

In [12]:
articles = pd.read_csv('articles_with_existing_images.csv')

In [13]:
articles

Unnamed: 0,article_id,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc
0,108775015,Strap top,Vest top,Garment Upper body,Solid,Black,Dark,Black,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,Strap top,Vest top,Garment Upper body,Solid,White,Light,White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,Strap top (1),Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,OP T-shirt (Idro),Bra,Underwear,Solid,Black,Dark,Black,Clean Lingerie,Lingeries/Tights,Ladieswear,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,OP T-shirt (Idro),Bra,Underwear,Solid,White,Light,White,Clean Lingerie,Lingeries/Tights,Ladieswear,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,212766042,Dean Drawstring Trousers.,Trousers,Garment Lower body,Solid,Dark Beige,Medium Dusty,Mole,Trouser,Menswear,Menswear,Contemporary Street,Trousers,Joggers in washed cotton twill with an elastic...
269,212766043,Dean Drawstring Trousers.,Trousers,Garment Lower body,Solid,Black,Dark,Black,Trouser,Menswear,Menswear,Contemporary Street,Trousers,Joggers in washed cotton twill with an elastic...
270,212766045,Dean Drawstring Trousers.,Trousers,Garment Lower body,Solid,Dark Grey,Dark,Grey,Trouser,Menswear,Menswear,Contemporary Street,Trousers,Joggers in washed cotton twill with an elastic...
271,212766046,Dean Drawstring Trousers.,Trousers,Garment Lower body,Solid,Dark Blue,Dark,Blue,Trouser,Menswear,Menswear,Contemporary Street,Trousers,Joggers in washed cotton twill with an elastic...


In [14]:
# Taking only 10000 items :-
articles = articles[:273]

In [15]:
# Combine all info from descriptive columns to a single column separated by space :-
cols = ['prod_name', 'product_type_name', 'product_group_name',
        'graphical_appearance_name', 'colour_group_name',
        'perceived_colour_value_name', 'perceived_colour_master_name',
        'department_name', 'index_name', 'index_group_name', 'section_name',
        'garment_group_name', 'detail_desc']

articles['combined_cols'] = articles[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles['combined_cols'] = articles[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)


In [16]:
articles = articles[['article_id', 'combined_cols']]

In [17]:
articles.shape

(273, 2)

In [18]:
articles.head()

Unnamed: 0,article_id,combined_cols
0,108775015,Strap top Vest top Garment Upper body Solid Bl...
1,108775044,Strap top Vest top Garment Upper body Solid Wh...
2,108775051,Strap top (1) Vest top Garment Upper body Stri...
3,110065001,OP T-shirt (Idro) Bra Underwear Solid Black Da...
4,110065002,OP T-shirt (Idro) Bra Underwear Solid White Li...


In [19]:
# Download the stopwords corpus and Porter stemming algorithm :-

# import nltk

# nltk.download('stopwords')
# nltk.download('punkt')

In [20]:
# Data cleaning :-
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
def text_process(desc):
    articles['combined_cols'].fillna(value='', inplace=True) # Fill the null values with empty string
    # Remove punctuation :-
    noPunc = [c for c in desc if c not in string.punctuation]
    noPunc = ''.join(noPunc)
    noPunc = noPunc.split()
    # Remove stopwords :-
    stopword = stopwords.words('english')
    desc_stopwords = [word.lower() for word in noPunc if word.lower() not in stopword]
    # Replace words with their respective stems :-
    stemmer = PorterStemmer()
    desc_cleaned = [stemmer.stem(word) for word in desc_stopwords]
    return desc_cleaned

In [21]:
# Vectorizing the data :-
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer=text_process)
tfidf_matrix = tfidf.fit_transform(articles['combined_cols'])

In [22]:
# Recommender system :-
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [23]:
# Map the article_id to their indices :-
indices = pd.Series(articles.index, index=articles['article_id']).drop_duplicates()

In [24]:
# Method to predict similar articles :-
def recommendations(article_id):
    i = indices[article_id] # Index of the articles that match the given article
    sim_scores = list(enumerate(cos_sim[i])) # Similarity scores of all articles w.r.t. to the given article
    sim_scores.sort(key=lambda x: x[1], reverse=True) # Sort the similarity scores in descending order
    # Get the scores of the 10 most similar articles
    sim_scores = sim_scores[:10]
    # Get the article indices
    article_indices = [score[0] for score in sim_scores]
    return articles['article_id'].iloc[article_indices].values

In [25]:
# Get the recommendations for a sample product :-
predict_recom = recommendations(118458003)
print(predict_recom)

[118458003 118458029 118458038 118458004 118458028 118458039 118458034
 212766045 212766043 212766046]


In [26]:
# Save the recommender as a file :-
import pickle

# Save the similarity matrix to a file using pickle :-
with open('recommender.pkl', 'wb') as file:
    pickle.dump(cos_sim, file)

In [27]:
# Method for showing search results with a given description of the product :-
def search_result(desc):
    search_tfidf = tfidf.transform([desc])
    cos_sim = cosine_similarity(search_tfidf, tfidf_matrix)
    sim_scores = list(enumerate(cos_sim[0]))
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[:50]
    article_indices = [score[0] for score in sim_scores]
    return articles['article_id'].iloc[article_indices].values

In [28]:
# Sample search :-
search_result("jogger women")

array([118458028, 118458038, 118458003, 118458029, 118458004, 118458034,
       118458039, 212766043, 212766045, 212766046, 212766042, 212766041,
       179123001, 179123040, 129085001, 108775015, 129085026, 129085027,
       108775044, 108775051, 194037001, 182909001, 194037002, 116379047,
       186372011, 156231002, 146730001, 186372042, 186372045, 179393001,
       148033001, 202017055, 179393018, 156227001, 201219001, 156224002,
       189955076, 156231001, 201219013, 201219003, 201219017, 201219016,
       156227002, 201219011, 201219014, 126589006, 114428026, 114428030,
       201219012, 200182001], dtype=int64)

In [29]:
# Save both the TfidfVectorizer and tfidf_matrix as files for searching products :-
with open('tfidf.pkl', 'wb') as file:
    pickle.dump(tfidf, file)
with open('tfidf_matrix.pkl', 'wb') as file:
    pickle.dump(tfidf_matrix, file)