### Užduotis 1

Dovydas Martinkus  
3k. 2gr.

Tema:\
Pasirinktų prekių puslapio nagrinėjimas. \
Puslapis ikea.lt.

In [2]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import os

import time
import re
from functools import partial

In [252]:
# getting the 'metapage' - this webpage has the links to all the item types
r_meta = requests.get("https://www.ikea.lt/en/products")
ikea = "https://www.ikea.lt" # this will be used to construct the required links in the future

In [253]:
soup_meta = BeautifulSoup(r_meta.text) # parsing the metapage

# Selecting every element whose href contains 'products' and that does not carry
# the 'dropdown-item' class - these are the links to the item types
links_type = soup_meta.select(":not(.dropdown-item)[href*='products']") # all the links to the item types

# Filtering item types: links whose href matches any of the alternatives below are dropped,
# the remaining relative hrefs are prefixed with the site root to form absolute links
regex_type = re.compile("ikea-verslui|kaledos|dovanu-korteles")
links_type = [ikea + i["href"] for i in links_type if not regex_type.search(i["href"])]

In [254]:
links_type[1:5]

['https://www.ikea.lt/en/products/kitchen/enhet-system',
 'https://www.ikea.lt/en/products/kitchen/knoxhult-system',
 'https://www.ikea.lt/en/products/kitchen/sunnersta-kitchen',
 'https://www.ikea.lt/en/products/kitchen/kitchen-appliances']

The goal is to create a row in a table like this for each item listed in the store:

| title  | description | url | availability | metatype | type | subtype | packages | length | width | height | gross weight| price |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| NEREBY  |  Vacuum hose holder... | http://www.ikea.lt/en/products/kitchen/... | instock| virtuve | system-knoxhult | kitchen-interior-organisers	| 1.0 | 23.0 | 12.0 | 3.0 | 0.19 | 6 |

In [255]:
# These attributes will appear as attributes of HTML tags in the parsed webpage
# (they are compared against the `property` attribute of <meta> tags)
attributes_item_1 = ["og:title", "og:description", "og:url", "product:price:amount", "og:availability"] 

# These attributes will appear as table rows in the parsed webpage
# (they are compared against the exact text of <td> cells, trailing space included)
# NOTE(review): these labels look Lithuanian while the links collected above point at the
# /en/ version of the site - presumably the English pages use different labels; confirm
# before running the scraper
attributes_item_2 =  ["Pakuotės: ","Ilgis: ","Plotis: ","Aukštis: ","Bendrasis svoris: "]

In [39]:
def _fetch_soup(url, timeout=2):
    """
    Download a webpage and parse it.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float
        Number of seconds to wait for the server before giving up.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or None when the request timed out
        ("Timed out" is printed in that case). Any other request
        error is not handled here and propagates to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout:
        print("Timed out")
        return None
    return BeautifulSoup(response.text)


def _count_pages(soup_type):
    """Return the number of pages listed in the page selection of a type webpage."""
    try:
        # largest number that appears in the page selection = number of pages
        return max(int(a["data-page"]) for a in soup_type.select("a.page-link[data-page]:not([aria-label])"))
    except ValueError:
        # website specific : no page selection = only a single page of items of that type exists
        return 1


def _parse_item(soup_item):
    """Collect the attributes of a single item from its parsed webpage into a dictionary."""
    dict_item = dict()
    # getting the attributes for the item from the meta tags:
    for meta in soup_item.find_all("meta", attrs={"property": attributes_item_1}):
        dict_item[str(meta["property"])] = str(meta["content"])
    # Extracting item type/subtype from the url:
    try:
        url_parts = dict_item["og:url"].split("/")[5:8]
    except KeyError:
        print("Key error: this was caused because the url of item doesn't follow the usual structure")
    else:
        for key, part in zip(["metatype", "type", "subtype"], url_parts):
            dict_item[key] = part
    # getting attributes from a table in the webpage
    # (`string=` is the current name of the deprecated `text=` argument):
    for cell in soup_item.find_all("td", string=attributes_item_2):
        value_cell = cell.next_sibling
        # a label cell without a following sibling would otherwise raise AttributeError
        dict_item[str(cell.string)] = str(value_cell.string) if value_cell is not None else None
    return dict_item


def scrap_items(links_type, skip=None, output_dir="just_now"):
    """
    Scrap items from the IKEA store and save them to disk.

    For every link in `links_type` all of its result pages are visited, every
    item linked from those pages is downloaded and parsed, and the items of
    that link are saved as one pickled DataFrame named
    "pickle<position of the link + 1>" inside `output_dir`.
    A page or item whose request times out is skipped; a type whose first
    page times out is skipped entirely and no file is written for it.

    Parameters
    ----------
    links_type : list
        The list with the top-level links to scrap.
    skip : sequence
        Index positions of the links to skip.
        Used to continue scrapping in case an error was thrown.
    output_dir : str
        Directory the pickled DataFrames are written to.
        It is created if it does not exist yet.

    Returns
    -------
    None
    """
    if skip is None:
        skip = []
    # creating the directory up front so a whole type is not scrapped only to fail when saving
    os.makedirs(output_dir, exist_ok=True)

    for num_type, link_type in enumerate(links_type):
        if num_type in skip:
            continue
        print("Currently in item type ",num_type," ",link_type) # for convenience, to know what is currently being scrapped
        soup_type = _fetch_soup(link_type) # getting and parsing the type webpage
        if soup_type is None:
            continue

        list_items = [] # list where the items of the same type will be stored
        for num_page in range(1, _count_pages(soup_type) + 1):
            print("Currently in page",num_page) # again for convenience
            # constructing the link for a specific page of items of the same type
            soup_type_page = _fetch_soup(link_type + "?page=" + str(num_page))
            if soup_type_page is None:
                continue

            for item in soup_type_page.select("a.itemName"):
                time.sleep(2) # to prevent getting thrown out for web scrapping
                soup_item = _fetch_soup(ikea + item["href"]) # downloading the item webpage
                if soup_item is None:
                    continue
                list_items.append(_parse_item(soup_item)) # adding the item with its attributes to the list

        # Saving all the items of a single type to disk
        pd.DataFrame(list_items).to_pickle(os.path.join(output_dir, "pickle" + str(num_type + 1)))

            
# Example: scraping a very small type that only has two items
scrap_items([links_type[3]])
# scrap_items names each file "pickle" + (position of the link in the list + 1),
# so the single link passed above is saved as "pickle1"
pd.read_pickle("just_now/pickle1") # looks mostly as expected but still need to make a few changes

Currently in item type  0   https://www.ikea.lt/lt/products/virtuve/virtuvele-sunnersta
Currently in page 1


Unnamed: 0,og:title,og:description,og:url,product:price:amount,og:availability,metatype,type,subtype,Pakuotės:,Ilgis:,Plotis:,Aukštis:,Bendrasis svoris:
0,SUNNERSTA,Su virtuvėle SUNNERSTA lengva įsirengti prakti...,http://www.ikea.lt/lt/products/virtuve/virtuve...,95,instock,virtuve,virtuvele-sunnersta,atskiros-virtuves,1,115 cm,62 cm,17 cm,"16,35 kg"
1,SUNNERSTA,Su virtuvėle SUNNERSTA lengva įsirengti prakti...,http://www.ikea.lt/lt/products/virtuve/virtuve...,129,instock,virtuve,virtuvele-sunnersta,atskiros-virtuves,1,115 cm,62 cm,17 cm,"16,35 kg"


In [91]:
# creating the names for the saved files to be read in
# os.path.join keeps the path portable (a hard-coded "\\" only works on Windows) and
# sorting makes the row order reproducible, since os.listdir returns names in arbitrary order
names_pickle = [os.path.join("pickles_en", p) for p in sorted(os.listdir("pickles_en"))]

# reading in and concatenating the files into a single DataFrame    
df = pd.concat(list(map(pd.read_pickle,names_pickle)),ignore_index=True)

In [92]:
# tidying the DataFrame

def my_rename(column_name):
    """
    Rename the columns using specific rules for the item table.

    Rules, checked in this order:
    * "product:price:amount" becomes "price";
    * a name starting with "og:" loses that prefix and is lower-cased;
    * any other name is stripped of surrounding whitespace and of
      leading/trailing colons, then lower-cased.
    
    Parameters
    ----------
    column_name : str
        Column name to convert.
        
    Returns
    -------
    str:
        Converted column name.

    """
    prefix = "og:"
    if column_name == "product:price:amount":
        return "price"
    # only a real prefix is removed; a name merely containing "og:" is left intact
    if column_name.startswith(prefix):
        return str(column_name[len(prefix):].lower())
    return str(column_name.strip().strip(":").lower())
    
df = df.rename(columns=my_rename)

# moving the price column to the end for convenience
df.insert(len(df.columns)-1, 'price', df.pop('price'))


def my_numeric(column):
    """
    Turn a column of strings into numbers.

    The characters 'c', 'm' and then 'k', 'g' are stripped from both ends of
    every value, decimal commas are replaced with dots and the result is
    converted to a numeric type. Values that still cannot be parsed become NaN.
    
    Parameters
    ----------
    column : Series
        Column to convert to a numeric type.
        
    Returns
    -------
    Series:
        Column converted to a numeric type.
    """
    without_cm = column.str.strip("cm")
    without_units = without_cm.str.strip("kg")
    with_decimal_point = without_units.str.replace(",",".")
    return pd.to_numeric(with_decimal_point, errors="coerce")

# applying the function to every column that should be numeric
# (the positional slice 7: selects every column from position 7 to the last one)
df.iloc[:,7:] = df.iloc[:,7:].apply(my_numeric)

# keeping only the rows whose description is not an empty string and renumbering the index from 0
df = df[df["description"]!=""].reset_index(drop=True)

In [93]:
df # 20000+ items in the DataFrame

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price
0,FÖRENKLA,Two inner pockets for things you want to keep ...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,80.0,38.0,20.0,3.20,69.99
1,PIVRING,The backpack stays firmly in place since the s...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,37.0,27.0,2.0,0.12,1.99
2,RENSARE,This clothes bag has 2 compartments – one with...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,20.0,15.0,2.0,0.08,2.99
3,STARTTID,There’s a separate compartment in the back tha...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,42.0,37.0,3.0,0.40,19.99
4,STARTTID,Inside the backpack there are 2 large compartm...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,57.0,31.0,16.0,1.50,19.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21581,KNARDRUP,A timeless design with a sheen effect.\nThe so...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,rugs,1.0,202.0,,,10.85,179.00
21582,AVSKILDRA,The beauty is in the uniqueness - this rug is ...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,rugs,1.0,174.0,,,18.30,279.00
21583,LANGSTED,The cut edges makes it easy to join several ru...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,small-rugs-and-runners,1.0,80.0,80.0,15.0,1.40,9.99
21584,SVÄRDBORG,The beauty is in the uniqueness - this rug is ...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,rugs,1.0,138.0,,,11.54,199.00


In [94]:
import tensorflow as tf

import tensorflow_hub as hub

from sklearn.metrics.pairwise import cosine_similarity as cosine

import numpy as np

# Universal sentence encoder will be used to create embeddings of the item descriptions
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)


def create_embeddings(df,model):
    """
    Create embeddings for the second column of a DataFrame.

    Parameters
    ----------
    df : DataFrame
        DataFrame whose column at position 1 holds the texts to embed.
        The values are converted to strings first.
    model : callable
        Called once with a tensor of all the texts.

    Returns
    -------
    Whatever `model` returns for the tensor of texts.
    """
    descriptions = df.iloc[:,1].to_numpy().astype("str")
    return model(tf.convert_to_tensor(descriptions))

embeddings = create_embeddings(df,model).numpy()

In [95]:
# Saving the embeddings to disk
import numpy as np
np.save("embeddings.npy",embeddings)

In [96]:
def most_similar(row,df,embeddings = None,limit=5,unique = True,
                 threshold = 0,categorical_weight = 1,
                 numeric_weight= 1,reverse = False, append = False):
    
    """Find most similar items.
    
    Compares the sentence embeddings for the descriptions of the items
    using cosine similarity and returns the most similar items.
    Items with exactly the same description as the given row are never returned.

    Parameters
    ----------
    row : int or named Series or DataFrame
        The row to find the most similar items for.
        If given a DataFrame the first row will be selected.
        If given an int the df will be indexed using it as a label.
        The name of the row has to be a label of `df`; the description is
        read from position 1 of the row.
    df : DataFrame
        DataFrame to find the most similar items in.
        Needs a "description" column. When `threshold` is not 0 the columns
        at positions 0 and 2-7 are compared as categorical attributes and the
        columns at positions 8-12 as numeric attributes.
    embeddings: array_like, Default None
        Sentence embeddings to compare the description of a given row against using cosine similarity.
        Row i of the embeddings has to belong to row i (by position) of `df`.
    limit: int, Default 5
        The number of items to return.
    unique: boolean, Default True
        Whether to return only one item out of those with the same description.
    threshold: float, Default 0
        If larger than 0, filter the DataFrame to only include items with higher similarity score for other attributes
        than the threshold. If given a negative number will take its absolute value and filter
        to include items with lower similarity score for other attributes.
    categorical_weight, numeric_weight: float, Default 1
        Calculation parameters to give categorical or numeric attributes more weight
        when checking the similarity of attributes other than description.
    reverse: boolean, Default False
        If true returns the least similar items.
    append: boolean, Default False
        Whether to add the cosine similarity as a column of the returned DataFrame

    Returns
    -------
    DataFrame:
        DataFrame with the items most similar to the given row.
    Series:
        Series with the similarity scores for each row of the returned DataFrame.

    Raises
    ------
    ValueError
        If no embeddings are given or their length differs from the length of `df`.
    """
    
    if embeddings is None or len(embeddings) != len(df):
        raise ValueError("Compatible sentence embeddings not found!\n"
                         "You probably deleted rows from the data frame since the last time you ran this\n"
                         "Create the embeddings again")
    
    # numpy integers (e.g. taken from an index) are accepted as labels too
    if isinstance(row,(int,np.integer)):
        row = df.loc[row,:]
    elif isinstance(row,pd.DataFrame):
        row = row.iloc[0,:]

    description = row.iloc[1]

    relevant = pd.Series(True,index=df.index)
    if threshold != 0:
        col_categorical =  [0,*range(2,8)]
        col_numeric = list(range(8,13))

        # number of categorical attributes equal to those of the given row
        df_categorical = df.iloc[:,col_categorical]
        sim_1 = df_categorical.eq(row.iloc[col_categorical],axis="columns").sum(axis=1) * categorical_weight

        # 1 - (distance scaled by the range of the column) for every numeric attribute
        df_numeric = df.iloc[:,col_numeric].astype(float)
        row_numeric = row.iloc[col_numeric].astype(float)
        # a constant column has range 0; all its distances are 0, so dividing by 1 gives similarity 1
        value_range = (df_numeric.max() - df_numeric.min()).replace(0,1)
        comp_numeric = 1 - df_numeric.sub(row_numeric,axis="columns").abs() / value_range
        sim_2 = comp_numeric.sum(axis=1) * numeric_weight

        score = (sim_1 + sim_2) / \
            ((len(col_categorical)*categorical_weight)+(len(col_numeric)*numeric_weight))
        if threshold > 0:
            relevant = score > threshold
        else:
            relevant = score < abs(threshold)

    # embeddings are positional, so the label of the row is translated to its position in df
    position = df.index.get_loc(row.name)
    results=cosine(embeddings[position].reshape(1,-1),embeddings)[0]
    results=pd.Series(results,index=df.index)
    results=results[relevant & (df["description"]!=description)]
    
    if unique:
        # duplicates are looked for among the remaining candidates only, so a description
        # is not lost just because its first occurrence was filtered out above
        results=results[~df.loc[results.index,"description"].duplicated()]
        
    results=results.sort_values(ascending=reverse).head(limit)
    results_df = df.loc[results.index]
                        
    if append:
        results_df = results_df.copy()
        results_df["similarity"] = results
    return results_df, results

In [97]:
# Getting items with the most similar descriptions
# High chance of getting multiple items with the same description
res,_ = most_similar(1,df,embeddings,limit=7,unique=False,append=True)
res

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price,similarity
24,STARTTID,The backpack has a handy outer compartment whe...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,38.0,33.0,3.0,0.29,6.99,0.808151
22,STARTTID,The backpack has a handy outer compartment whe...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,38.0,33.0,3.0,0.29,6.99,0.808151
3,STARTTID,There’s a separate compartment in the back tha...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,42.0,37.0,3.0,0.4,19.99,0.793498
13,STARTTID,The backpack has a handy outer compartment whe...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,38.0,33.0,3.0,0.29,9.99,0.785619
23,DRÖMSÄCK,The fabric is made of recycled polyester. Usin...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,29.0,20.0,5.0,0.14,9.99,0.767641
12,DRÖMSÄCK,The fabric is made of recycled polyester. Usin...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,48.0,41.0,2.0,0.68,29.99,0.743566
5,VÄRLDENS,The bag is comfortable to carry and sits firml...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,25.0,20.0,3.0,0.14,7.99,0.726541


In [274]:
# Getting only one item with the same description
res,_ = most_similar(1,df,embeddings,limit=7,unique=True,append=True)
res

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price,similarity
13056,MYLLRA,The front of the changing table can be removed...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,baby,changing-tables,3.0,96.0,84.0,6.0,25.58,199.0,0.160059
13062,MYLLRA,You can easily reach diapers and clothes since...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,baby,changing-tables,3.0,96.0,84.0,6.0,25.58,199.0,0.160059
13048,MYLLRA,If you want to create a complete children's ro...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,baby,cots,3.0,123.0,57.0,6.0,20.5,249.0,0.160059
13092,MYLLRA,One cot side can be removed when the child is ...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,baby,cots,3.0,123.0,57.0,6.0,20.5,249.0,0.160059
5069,VINTER 2020,Made of a durable material which does not brea...,http://www.ikea.lt/en/products/living-room/dec...,instock,living-room,decoration,holiday-decoration,1.0,24.0,24.0,11.0,0.4,9.99,0.158635
5209,VINTER 2020,"Easy to hang up, as hooks are included. \nMade...",http://www.ikea.lt/en/products/living-room/dec...,instock,living-room,decoration,holiday-decoration,1.0,62.0,16.0,16.0,0.75,12.99,0.158635
5040,VINTER 2020,"Easy to hang up, as ribbons are included. \nWi...",http://www.ikea.lt/en/products/living-room/dec...,instock,living-room,decoration,holiday-decoration,1.0,32.0,26.0,7.0,0.45,12.99,0.158635


In [30]:
# Will only return items with similarity for other attributes greater than 0.5
# (a positive threshold keeps the items above it, a negative one keeps the items below its absolute value)
res,_ = most_similar(1,df,embeddings,limit=7,unique=True,threshold=0.5,append=True)
res

NameError: name 'np' is not defined

In [276]:
# Will only return items with similarity for other attributes less than 0.5
# Will also search for the LEAST similar items
res,_ = most_similar(1,df,embeddings,limit=7,unique=True,threshold=-0.5,reverse=True,append=True)
res

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price,similarity
5938,LYDERSHOLM,The rug is perfect for outdoor use because it’...,http://www.ikea.lt/en/products/living-room/hom...,instock,living-room,home-furnishing-rugs,rugs,1.0,164.0,,,4.46,59.99,-0.141231
6029,LYDERSHOLM,The rug is perfect for outdoor use because it’...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,rugs,1.0,135.0,,,3.1,39.99,-0.141231
5967,LYDERSHOLM,The colour and weave create a natural look.\nT...,http://www.ikea.lt/en/products/living-room/hom...,instock,living-room,home-furnishing-rugs,small-rugs-and-runners,1.0,81.0,,,1.48,19.99,-0.141231
12183,FLODALEN,The towel is made from yarn that has been tigh...,http://www.ikea.lt/en/products/bathroom/towels...,out of stock,bathroom,towels-bathmats,towels,1.0,31.0,16.0,6.0,0.06,2.49,-0.127384
15110,JANINGE/VANGSTA,Extendable dining table with 1 extra leaf seat...,http://www.ikea.lt/en/products/dining-room/din...,instock,dining-room,dining-tables,tables,2.0,77.0,52.0,47.0,4.8,139.97,-0.12136
7485,SONGESAND,"You save space with a mirror door, since you d...",http://www.ikea.lt/en/products/bedroom/solitai...,instock,bedroom,solitaire-wardrobes,solitaire-wardrobes,4.0,186.0,54.0,4.0,21.5,179.0,-0.120354
6051,SONGESAND,Adjustable bed sides allow you to use mattress...,http://www.ikea.lt/en/products/beds-mattresses...,instock,beds-mattresses,beds,bed-frames,2.0,206.0,28.0,9.0,24.2,109.0,-0.120354


In [277]:
def find_duplicates(df, row = None, by_columns = None, drop = False):
    """Find duplicate items 

    Missing values are treated as equal to each other, both when looking for
    duplicates of a specific row and when looking for duplicates of any row.

    Parameters
    ----------
    df : DataFrame
        DataFrame to find duplicates in.
    row : int or named Series or DataFrame, default None
        The row to find duplicates for.
        If None will check for duplicates for any row
        (the first occurrence is not reported as a duplicate).
        If given a DataFrame the first row will be selected.
        If given an int the df will be indexed using it as a label.
    by_columns: list, default None
        The column names or integers specifying the column locations
        required to match to be considered duplicate, None for all columns.
        An integer that is itself a column label is used as a label.
    drop: boolean
        Whether to drop the duplicated rows in the original data.
        The rows are removed from `df` in place.

    Returns
    -------
    DataFrame
        DataFrame with only the duplicates for the specified row.
    """
    
    if by_columns is None:
        by_columns = list(df.columns)
    elif isinstance(by_columns,str):
        by_columns = [by_columns]
    else:
        # translating integer locations to column names, as promised by the docstring
        by_columns = [df.columns[col]
                      if (col not in df.columns and isinstance(col,(int,np.integer)))
                      else col
                      for col in by_columns]

    if row is None:
        duplicate_rows = df[df.loc[:,by_columns].duplicated()]
    else:
        if isinstance(row,(int,np.integer)):
            row = df.loc[row,:]
        elif isinstance(row,pd.DataFrame):
            row = row.iloc[0,:]
        candidates = df.loc[:,by_columns]
        target = row[by_columns]
        # NaN == NaN is False, so values missing on both sides are matched explicitly
        same = candidates.eq(target,axis="columns") | (candidates.isna() & target.isna())
        duplicate = same.all(axis=1)
        # preventing row from being a duplicate of itself
        # (a row that does not come from df has no entry to reset)
        if row.name in duplicate.index:
            duplicate.loc[row.name] = False
        duplicate_rows = df.loc[duplicate] 
    if drop:
        df.drop(duplicate_rows.index, inplace = True)
    return duplicate_rows

In [278]:
find_duplicates(df,22) # There doesn't seem to be any duplicates

find_duplicates(df,22,["title","description"]) # finds duplicates when searching for duplicates using only specified columns

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price
24,STARTTID,The backpack has a handy outer compartment whe...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,38.0,33.0,3.0,0.29,6.99


In [279]:
find_duplicates(df,200,["title","description"],drop=True) # duplicates are found but they are deleted from df

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price
189,VIMUND,This ergonomic junior chair has a flexible bac...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,for-teenagers,home-office-chairs,1.0,57.0,45.0,16.0,10.92,49.99
609,VIMUND,This ergonomic junior chair has a flexible bac...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,for-teenagers,home-office-chairs,1.0,57.0,45.0,16.0,10.92,49.99
627,VIMUND,This ergonomic junior chair has a flexible bac...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,for-teenagers,home-office-chairs,1.0,57.0,45.0,16.0,10.92,49.99
13596,VIMUND,This ergonomic junior chair has a flexible bac...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,for-teenagers,home-office-chairs,1.0,57.0,45.0,16.0,10.92,49.99
13597,VIMUND,This ergonomic junior chair has a flexible bac...,http://www.ikea.lt/en/products/children-s-room...,instock,children-s-room,for-teenagers,home-office-chairs,1.0,57.0,45.0,16.0,10.92,49.99


In [280]:
find_duplicates(df,200,["title","description"]) # No duplicates are found this time

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price


In [281]:
# Returning all items that have exact same measurements as some other item
find_duplicates(df,None,["length","width","height"])

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price
16,VÄRLDENS,"Separate, padded compartment with a zipper tha...",http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,54.0,42.0,3.0,0.70,39.99
21,SKYNKE,Easy to keep on hand since it folds very small...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,11.0,11.0,1.0,0.04,0.99
22,STARTTID,The backpack has a handy outer compartment whe...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,38.0,33.0,3.0,0.29,6.99
24,STARTTID,The backpack has a handy outer compartment whe...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,38.0,33.0,3.0,0.29,6.99
26,KARISMATISK,You can do something good for the environment ...,http://www.ikea.lt/en/products/hallway/bags-an...,instock,hallway,bags-and-travel-accessories,bags-and-travel-accessories,1.0,11.0,11.0,1.0,0.04,0.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21581,KNARDRUP,A timeless design with a sheen effect.\nThe so...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,rugs,1.0,202.0,,,10.85,179.00
21582,AVSKILDRA,The beauty is in the uniqueness - this rug is ...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,rugs,1.0,174.0,,,18.30,279.00
21583,LANGSTED,The cut edges makes it easy to join several ru...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,small-rugs-and-runners,1.0,80.0,80.0,15.0,1.40,9.99
21584,SVÄRDBORG,The beauty is in the uniqueness - this rug is ...,http://www.ikea.lt/en/products/living-room/hom...,out of stock,living-room,home-furnishing-rugs,rugs,1.0,138.0,,,11.54,199.00


In [282]:
def showcase(row,df,embeddings = None,limit=5,unique = True,
                 return_middle = True, append = False):
    
    """Find the most and the least similar items.
    
    Compares the sentence embeddings for the descriptions of the items
    using cosine similarity and returns items that are the most, the least similar 
    and possibly in the middle by similarity.
    The description of the given row is embedded again with the global `model`.
    Items with exactly the same description as the given row are never returned.

    Parameters
    ----------
    row : int or named Series or DataFrame
        The row to find the most similar items for.
        If given a DataFrame the first row will be selected.
        If given an int the df will be indexed using it as a label.
        The description is read from position 1 of the row.
    df : DataFrame
        DataFrame to find the most similar items in.
    embeddings: array_like, Default None
        Sentence embeddings to compare the description of a given row against using cosine similarity.
    limit: int, Default 5
        The number of items to return from each group (most similar, middle, least similar).
        Fewer items are returned when there are not enough candidates;
        an item is never returned twice.
    unique: boolean, Default True
        Whether to return only one item out of those with the same description.
    return_middle: boolean, Default True
        Whether to also return `limit` items from the middle of the similarity ranking.
    append: boolean, Default False
        Whether to add the cosine similarity as a column of the returned DataFrame

    Returns
    -------
    DataFrame:
        DataFrame with the selected items, ordered from the most to the least similar.
    Series:
        Series with the similarity scores for each row of the returned DataFrame.

    Raises
    ------
    ValueError
        If no embeddings are given or their length differs from the length of `df`.
    """
    
    if embeddings is None or len(embeddings) != len(df):
        raise ValueError("Compatible sentence embeddings not found!\n"
                         "You probably deleted rows from the data frame since the last time you ran this\n"
                         "Create the embeddings again")
    
    if isinstance(row,(int,np.integer)):
        row = df.loc[row,:]
    elif isinstance(row,pd.DataFrame):
        row = row.iloc[0,:]

    description = row.iloc[1]

    results=cosine(embeddings,model(np.array(description).reshape(1)))
    results=pd.Series(results[:,0],index=df.index)
    results=results[df["description"]!=description]
    
    if unique:
        results=results[~df.loc[results.index,"description"].duplicated()]
    
    results=results.sort_values(ascending=False)
    count = len(results)
    middle = (count - 1)//2 
    
    positions = [*range(0,limit)]
    if return_middle:
        positions += range(middle,middle+limit)
    # the last `limit` positions, including the very last (least similar) one
    positions += range(count-limit,count)
    # dropping positions that fall outside the ranking or were already selected
    rows = list(dict.fromkeys(p for p in positions if 0 <= p < count))
  
    results = results.iloc[rows]
    results_df = df.loc[results.index]   
    
    if append:
        results_df = results_df.copy()
        results_df["similarity"] = results
        
    return results_df, results

In [297]:
df.iloc[5000,1]

'The included collage template and coordinated motifs make it easy to create your own personal wall collage.\nMotif created by Odilon Redon, Albrecht Dürer, Theodor Severin Kittelsen, Théophile Steinlen and Paul Cézanne.\nThe motifs are in place and the pictures are ready to hang.\nIf you want some variation, you can easily change motif in the frame.\nYou can split the wall collage template into smaller collages and display your art in different ways.\nHolds 8 pictures so you can create your own personal collage.\nFront protection in plastic makes the frame safer to use.\nThe motifs have a common theme so you can easily create a coherent collage.\nYou can personalise your home with artwork that expresses your style.\n'

In [298]:
# Returning the most and the least similar items at the same time
rez,_ = showcase(5000,df,embeddings,5,return_middle=True,append=True)
rez

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price,similarity
9537,SKOGSFRÄKEN,"A high, easy-care pillow with a cotton and lyo...",http://www.ikea.lt/en/products/bedroom/bedding...,instock,bedroom,bedding,pillows,1.0,48.0,,,0.87,9.99,0.189384
9538,SKOGSFRÄKEN,"A low, easy-care pillow with a cotton and lyoc...",http://www.ikea.lt/en/products/bedroom/bedding...,instock,bedroom,bedding,pillows,1.0,48.0,,,0.68,8.99,0.189384
4375,ISJAKT,The uplighter spreads a pleasant light that il...,http://www.ikea.lt/en/products/living-room/liv...,out of stock,living-room,living-room-lighting,floor-lamps,1.0,93.0,25.0,7.0,6.7,59.99,0.18181
18183,JOSEF,The cabinet has two adjustable shelves and the...,http://www.ikea.lt/en/products/outdoor/outdoor...,instock,outdoor,outdoor-organising,outdoor-organising,1.0,87.0,38.0,10.0,11.5,39.99,0.169603
15347,KRUSTAD,"Made of feldspar porcelain, which makes the bo...",http://www.ikea.lt/en/products/kitchen/servewa...,instock,kitchen,serveware,dinnerware-and-serving,1.0,,,7.0,0.33,2.99,0.166961
906,KALLAX,The practical inserts hide things and help mai...,http://www.ikea.lt/en/products/living-room/she...,instock,living-room,shelving-units-systems,open-shelving-units,2.0,73.0,34.0,4.0,4.64,151.0,0.022454
957,KALLAX,Easy to assemble.\nThe insert looks nice in a ...,http://www.ikea.lt/en/products/living-room/she...,instock,living-room,shelving-units-systems,open-shelving-units,1.0,73.0,34.0,4.0,4.65,19.0,0.022454
876,KALLAX,An easy-to-place shelving unit for storing and...,http://www.ikea.lt/en/products/living-room/she...,instock,living-room,shelving-units-systems,open-shelving-units,2.0,46.0,34.0,3.0,2.55,73.99,0.022454
1330,KALLAX,An easy-to-place shelving unit for storing and...,http://www.ikea.lt/en/products/living-room/she...,instock,living-room,shelving-units-systems,open-shelving-units,2.0,45.0,34.0,3.0,2.35,139.99,0.022454
881,KALLAX,The practical inserts hide things and help mai...,http://www.ikea.lt/en/products/living-room/she...,instock,living-room,shelving-units-systems,open-shelving-units,2.0,70.0,34.0,1.0,0.9,66.97,0.022454


In [None]:
# Misc

In [252]:
import csv

# one row per distinct subtype; column 1 holds the subtype with dashes replaced by spaces
x = pd.DataFrame(df["subtype"].unique())
# dropna removes the row of a missing subtype (indexing with x.notna() would keep it as NaN)
x = x.dropna()
x[1]=x[0].str.replace("-"," ")
x.to_csv("entity.csv",index=False,header=False,quoting = csv.QUOTE_ALL)

In [259]:
# everything in a url up to and including the last "/"
regex = re.compile(".+/")
regex.findall(df["url"][~df["subtype"].duplicated()].iloc[99])[0]

# the first item of every subtype, with its url cut after the last "/"
x = df[~df["subtype"].duplicated()].copy()
x["url"] = x["url"].str.extract("(.+/)")
x = x[["url","subtype"]]

# mapping subtype -> url prefix
dictionary = {}
for i in range(len(x)):
    dictionary[x.iloc[i,1]] = x.iloc[i,0]

In [88]:
def search_string(string,df,append = True):
    """Rank the items by the similarity of their descriptions to a search string.

    The string is embedded with the global `model` and compared against the
    global `embeddings` using cosine similarity.

    Parameters
    ----------
    string : str
        Text to search for.
    df : DataFrame
        DataFrame with the items; its rows have to line up with the global `embeddings`.
    append: boolean, Default True
        Whether to add the cosine similarity as a column of the returned DataFrame

    Returns
    -------
    DataFrame:
        All rows of `df`, ordered from the most to the least similar.
    Series:
        Series with the similarity scores for each row of the returned DataFrame.
    """
    string_embedding = model(tf.convert_to_tensor([string]))
    results=cosine(string_embedding,embeddings)[0]
    results=pd.Series(results,index=df.index).sort_values(ascending=False)

    # built unconditionally so the function also works with append=False
    results_df = df.loc[results.index]
    if append:
        results_df = results_df.copy()
        results_df["similarity"] = results
    return results_df, results

In [90]:
search_string("artificial plant",df)[0]

Unnamed: 0,title,description,url,availability,metatype,type,subtype,packages,length,width,height,weight gross,price,similarity
19120,FEJKA,Lifelike artificial plant that remain just as ...,http://www.ikea.lt/en/products/living-room/dec...,instock,living-room,decoration,artificial-plants,1.0,55.0,,,0.33,4.99,0.556706
4916,FEJKA,Lifelike artificial plant that remain just as ...,http://www.ikea.lt/en/products/living-room/dec...,instock,living-room,decoration,artificial-plants,1.0,55.0,,,0.33,4.99,0.556706
10898,FEJKA,Lifelike artificial plant that remain just as ...,http://www.ikea.lt/en/products/living-room/dec...,out of stock,living-room,decoration,artificial-plants,1.0,63.0,,,0.63,19.99,0.556706
12627,FEJKA,Lifelike artificial plant that remain just as ...,http://www.ikea.lt/en/products/living-room/dec...,instock,living-room,decoration,artificial-plants,1.0,21.0,,,0.20,2.99,0.556706
16691,FEJKA,Lifelike artificial plant that remain just as ...,http://www.ikea.lt/en/products/living-room/dec...,out of stock,living-room,decoration,artificial-plants,1.0,63.0,,,0.63,19.99,0.556706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2552,GALLRA,Protects the table top surface and reduces noi...,http://www.ikea.lt/en/products/kitchen/servewa...,instock,kitchen,serveware,table-linen,1.0,45.0,33.0,0.0,0.06,1.49,-0.169614
2550,SLUTEN,Protects the table top surface and reduces noi...,http://www.ikea.lt/en/products/kitchen/servewa...,instock,kitchen,serveware,table-linen,1.0,0.0,,,0.05,1.99,-0.169614
2549,SMAKSINNE,Protects the table top surface and reduces noi...,http://www.ikea.lt/en/products/kitchen/servewa...,instock,kitchen,serveware,table-linen,1.0,37.0,37.0,0.0,0.07,1.49,-0.169614
2545,PANNÅ,Protects the table top surface and reduces noi...,http://www.ikea.lt/en/products/kitchen/servewa...,instock,kitchen,serveware,table-linen,1.0,37.0,37.0,0.0,0.06,1.99,-0.169614
