In [193]:
#Import the required libraries
import pandas as pd 

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity
import json

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re
# Fetch the NLTK resources that the text-cleaning code in this notebook relies on:
#   - 'stopwords': the corpus read by stopwords.words('english')
#   - 'punkt'    : the tokenizer models used by word_tokenize
#   - 'wordnet'  : the lexical database used by WordNetLemmatizer
# nltk.download() is a no-op ("already up-to-date") when the package is present locally.
# NOTE(review): newer NLTK releases may additionally require the 'punkt_tab' resource
# for word_tokenize - confirm against the installed NLTK version.
# The repeated `import nltk` statements below are redundant but harmless.
import nltk
nltk.download('stopwords')

import nltk
nltk.download('punkt')

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [194]:
from urllib.parse import quote

## Preprocess the Text Data
### Data Cleaning

In [195]:
def data_analysis(df):
    '''
    INPUT:
    df: (DataFrame) - the scraped data; it must contain a 'Description' column

    OUTPUT:
    df_clean - (DataFrame) a new DataFrame holding only the rows of df that have no null value

    Process:
    The shape and the per-column null counts are printed. Empty strings in 'Description'
    are replaced by NaN so that they are counted (and dropped) as missing values, and the
    rows that contain any null value are then removed.

    Side effect:
    The empty-string -> NaN replacement is written back into the 'Description' column of
    the DataFrame that was passed in. The row removal is NOT applied to the caller's
    DataFrame; use the returned DataFrame to get the cleaned rows.
    '''
    print("The shape of the DataFrame is ",df.shape)
    print("The null values in the DataFrame are:")
    print(df.isnull().sum())
    print("There may be some null strings in the Description which has to be replaced as nan")
    df['Description']=df['Description'].replace("",np.nan)
    print(df['Description'].isnull().sum())

    # dropna() returns a new DataFrame; it has to be returned, otherwise the cleaned
    # data is lost as soon as the function exits.
    df_clean=df.dropna()
    print("The Shape of the DataFrame after null values are removed:",df_clean.shape)
    return df_clean

In [196]:
_STOPWORDS_CACHE={}


def _english_stopwords():
    '''Return the NLTK English stopwords as a frozenset, reading the corpus only once.'''
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english']=frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def tokenize(text):
    '''
    INPUT:
    text: (any) - the text which is to be cleaned; it is converted with str(), so a
          missing value such as NaN is processed as the literal text "nan"
    
    OUTPUT:
    clean_tokens - (String) the cleaned tokens joined by single spaces
    
    Process:
    The text is converted to lower case, the regular expression below is applied, and the
    result is split into tokens with word_tokenize. English stopwords are removed and each
    remaining token is lemmatized with WordNetLemmatizer.
    '''
    sent=str(text).lower()

    # NOTE(review): this pattern only matches a '<', any characters, then the literal
    # sequence '#,-' followed by '>'. It does not strip general HTML tags, digits or
    # punctuation. It is kept unchanged so that the produced tokens stay the same -
    # confirm what was actually meant to be removed.
    sp=re.compile('<.*?(#,-)>')
    sent2=sp.sub('',sent)
    
    # The stopword set is loaded once and reused: calling stopwords.words('english')
    # for every token re-reads the corpus and searches a list each time.
    stop_words=_english_stopwords()
    words=[w for w in word_tokenize(sent2) if w not in stop_words]
    
    lemmatizer=WordNetLemmatizer()
    clean_tokens=[lemmatizer.lemmatize(tok).lower().strip() for tok in words]
    
    return ' '.join(clean_tokens)

# 2. Measure Similarity and retrieve Ranked Results

In [208]:
def similarity_rank(df,inp_text,top_n=10):
    '''
    INPUT:
    df: (DataFrame) - the DataFrame containing the scraped data from websites. The
        'Description' column (already cleaned text) and the 'link' column are used.
    inp_text: (String) - the input string for which similar url's are to be determined
    top_n: (int) - the maximum number of distinct url's to return (default 10)
    
    OUTPUT:
    suggested_urls - (String) a JSON document of the form {"suggestions": [url, ...]}
                     listing the url's whose description is most similar to the input text
    
    Process:
    The input text is cleaned with tokenize, the descriptions and the input are converted
    to word-count vectors, and the rows are ranked by cosine similarity to the input.
    Duplicate url's are reported only once, at their best rank.
    '''
    # The input text goes through the same cleaning as the dataset descriptions
    input_cl=tokenize(inp_text)
    
    # Count Vectorizer converts the text into numerical values. The vocabulary is learnt
    # from the dataset descriptions only, and the input is projected onto it.
    cv=CountVectorizer()
    ds_vect=cv.fit_transform(df['Description'])
    inp_vect=cv.transform([input_cl])

    # The cosine similarity between the input vector and every description vector
    sim1=cosine_similarity(inp_vect,ds_vect)
    
    # Row positions sorted from the most similar to the least similar description
    sort_ind=sim1.argsort()[0][::-1]
    
    # Column j of sim1 belongs to the j-th row of df, so the link is looked up by
    # position. Matching on the 'sno' column instead would only be correct while 'sno'
    # is exactly (row position + 1), which no longer holds once rows are dropped.
    links=df['link'].tolist()
    
    sugg=[]
    seen=set()
    for pos in sort_ind:
        # Stop as soon as enough distinct url's are collected
        if len(sugg)>=top_n:
            break
        u=links[pos]
        if u not in seen:
            seen.add(u)
            sugg.append(u)
        
    # The suggested urls are inserted in a dictionary
    suggested={
        'suggestions':sugg
    }
    
    # The dictionary is converted to a JSON string and returned
    suggested_urls=json.dumps(suggested,indent=1)
    return suggested_urls

In [198]:
def Clothing_similarity_search(input_text,data_path='webscrap_data.csv'):
    '''
    INPUT:
    input_text: (String) - the text describing the clothing item to search for
    data_path: (String) - path of the CSV file holding the scraped data
               (default 'webscrap_data.csv'); it is read again on every call

    OUTPUT:
    urls - (String) the JSON string produced by similarity_rank

    Process:
    The scraped data is loaded, a summary is printed by data_analysis, the 'Description'
    column is cleaned with tokenize and the result is ranked against input_text.
    '''
    # The scraped data is loaded in df_final
    df_final=pd.read_csv(data_path)
   
    # The dataset is analysed using data_analysis function
    # NOTE(review): rows with missing values are not removed from df_final here, so a
    # missing description is tokenized below as the literal text "nan".
    data_analysis(df_final)
    
    # The tokenize function is applied to the Description attribute of the dataset
    df_final['Description']=df_final['Description'].apply(tokenize)

    # The similarity between input_text and the dataset is determined by calling the similarity_rank function
    urls=similarity_rank(df_final,input_text)
    
    return urls

In [213]:
# Example query: the returned JSON string is kept in suggested_URLs; the summary printed
# below this cell comes from data_analysis, which runs as part of the search.
suggested_URLs=Clothing_similarity_search("This is a stylish and comfortable pant")

The shape of the DataFrame is  (1003, 3)
The null values in the DataFrame are:
Description    0
link           0
sno            0
dtype: int64
There may be some null strings in the Description which has to be replaced as nan
0
The Shape of the DataFrame after null values are removed: (1003, 3)


In [214]:
# print() is used so that the JSON string is shown with real line breaks.
print(suggested_URLs)

{
 "suggestions": [
  "https://www.pluss.in/women/black-track-pant-lpj8787-black.html",
  "https://instore.co.in//collections/offers/products/thumbi-thullal-kurti-coat-pant",
  "https://www.pluss.in/women/navy-blue-track-pant-lpj8787-navy.html",
  "https://instore.co.in//collections/offers/products/shireen",
  "https://www.pluss.in/men/men-black-solid-straight-fit-track-pants-mpjs12324-black.html",
  "https://www.pluss.in/men/men-black-solid-straight-fit-track-pants-mpjs11106-black.html",
  "https://www.pluss.in/men/men-navy-blue-solid-straight-fit-cotton-track-pants-mpjs12323-navy.html",
  "https://instore.co.in//collections/offers/products/janet",
  "https://www.pluss.in/women/women-off-white-solid-straight-palazzos-ldr5536-off-white.html",
  "https://www.pluss.in/women/navy-churidar-leggings-llg3199-navy.html"
 ]
}


In [209]:
# Second example query. The result is not printed, so the cell output is the repr of
# the JSON string (line breaks appear as escaped \n).
Clothing_similarity_search("This is a blue dress")

The shape of the DataFrame is  (1003, 3)
The null values in the DataFrame are:
Description    0
link           0
sno            0
dtype: int64
There may be some null strings in the Description which has to be replaced as nan
0
The Shape of the DataFrame after null values are removed: (1003, 3)


'{\n "suggestions": [\n  "https://instore.co.in//collections/offers/products/manohariblue",\n  "https://instore.co.in//collections/offers/products/blue-lily",\n  "https://instore.co.in//collections/offers/products/kashmira-pastel-peach",\n  "https://www.pluss.in/women/navy-blue-track-pant-lpj8787-navy.html",\n  "https://www.pluss.in/men/men-navy-blue-solid-regular-shorts-mbr9483-navy.html",\n  "https://www.pluss.in/women/women-blue-white-printed-straight-kurta-llkt7268-blue-print.html",\n  "https://www.pluss.in/women/women-blue-wide-leg-solid-palazzos-lpzo6732-bata-wash.html",\n  "https://www.pluss.in/women/women-blue-wide-leg-solid-palazzos-lpzo6732-enzyme-wash.html",\n  "https://www.pluss.in/women/women-blue-solid-regular-fit-capris-lcp7812-royal-blue.html",\n  "https://instore.co.in//collections/offers/products/aarathya-blue-kurti-dupatta"\n ]\n}'