In [108]:
from url_checker import UrlCheck
from bs4 import BeautifulSoup
import re

import inflect
# Shared inflect engine; used further down (p.plural) to pluralize each keyword token.
p = inflect.engine()

import spacy

In [109]:
import pandas as pd

# Load the "Jewelry" sheet and order the rows from the longest keyword to the shortest.
# - `sheet_name` replaces the `sheetname` argument, which pandas deprecated and later
#   removed (newer releases raise TypeError on the old spelling).
# - `.str.len()` is the vectorized length; unlike `.apply(len)` it yields NaN for a
#   blank cell instead of raising, and NaN lengths sort to the end.
keywords = (
    pd.read_excel("combined.xlsx", sheet_name="Jewelry")
    .assign(keyword_len=lambda frame: frame["Keyword"].str.len())
    .sort_values(by="keyword_len", ascending=False)
    .reset_index(drop=True)
)

In [110]:
def build_url_from_keyword(keyword):
    """Build the Overstock search URL for a keyword phrase.

    The phrase is form-encoded for use as a query-string value: spaces become
    ``+`` (as before) and reserved characters such as ``&``, ``#``, ``+`` or
    ``%`` are percent-escaped so they cannot truncate or alter the query.

    Parameters
    ----------
    keyword : str
        Free-text search phrase, e.g. ``"diamond ring"``.

    Returns
    -------
    str
        ``https://www.overstock.com/search?keywords=<encoded keyword>``.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    base_url = "https://www.overstock.com/search?keywords="
    return base_url + quote_plus(keyword)

In [111]:
def get_soup(url):
    """Request ``url`` through ``UrlCheck.get_request`` and parse the response text.

    Returns a ``BeautifulSoup`` document built with the ``lxml`` parser.
    """
    response = UrlCheck().get_request(url)
    html_text = response.text
    return BeautifulSoup(html_text, 'lxml')

In [112]:
def get_prod_titles(soup):
    """Return the text of every ``<div class="product-title">`` element in ``soup``.

    The texts are returned in document order, exactly as ``get_text()`` yields them
    (no stripping or case folding is applied here).
    """
    title_nodes = soup.findAll("div", { "class" : "product-title"})
    return [node.get_text() for node in title_nodes]

In [113]:
def get_refinement_group(soup):
    """Collect every ``<div class="refinement-group">`` element of ``soup``.

    The elements themselves (not their text) are returned, as a plain list.
    """
    return list(soup.findAll("div",{"class":"refinement-group"}))

In [114]:
def get_refinement(refinement_group):
    """Extract one refinement group's heading and its cleaned option labels.

    Each ``<li class="refinement-item">`` label has its parenthesised result
    count removed -- plain (``(57)``) or comma-grouped with any number of digits
    (``(1,234)``, ``(12,345)``) -- and is then stripped of surrounding whitespace,
    so every label comes back in the same form regardless of how its count was
    written.

    Parameters
    ----------
    refinement_group
        Element exposing ``find``/``findAll`` (a BeautifulSoup tag) that holds an
        ``<h3 class="refinement-heading">`` and the refinement items.

    Returns
    -------
    dict
        Single-entry mapping ``{heading_text: [label, ...]}``.

    Raises
    ------
    AttributeError
        If the group has no ``refinement-heading`` element (``find`` returns None).
    """
    heading = refinement_group.find("h3", {"class":"refinement-heading"}).get_text()
    items = refinement_group.findAll("li",{"class":"refinement-item"})

    # Raw string: avoids invalid-escape warnings. One pattern covers "(57)",
    # "(1,234)" and "(12,345)".
    count_suffix = r"\([\d,]*\)"

    # Strip after removing the count so no trailing space is left behind.
    refs = [re.sub(count_suffix, "", item.get_text()).strip() for item in items]
    return {heading: refs}

In [115]:
# TODO: get rid of these entries: ['Jewelry & Watch Store ', 'Worldstock Fair Trade ', 'Holiday ', 'Pet Supply Store ', 'Main Street Revolution ']

In [116]:
# Run one sample search: build the search URL, fetch and parse the results page,
# then pull the product titles out of it.
url = build_url_from_keyword("diamond engagement ring with sapphire side stones")  # alternate keyword (presumably tried earlier): sterling silver wedding ring sets for him and her
soup = get_soup(url)
prod_titles = get_prod_titles(soup)

In [117]:
# All refinement-group elements found on the results page (elements, not text).
group = get_refinement_group(soup)

In [118]:
# One {heading: [labels]} dict per refinement group, in page order.
refinement_dict_list = [get_refinement(g) for g in group]

### Modify input and output

In [119]:
# Keyword phrase analysed by the cells below (same phrase as the sample search above).
keyword  = "diamond engagement ring with sapphire side stones"

def change_ref_to_lower(refinement_dict_list):
    """Return a lower-cased copy of a list of ``{heading: [labels]}`` dicts.

    Both the headings (keys) and every label (values) are lower-cased.
    The input list and its dicts are left untouched.
    """
    return [
        {
            heading.lower(): [label.lower() for label in labels]
            for heading, labels in ref_dict.items()
        }
        for ref_dict in refinement_dict_list
    ]

In [120]:
# Rebinds the same name to its lower-cased copy; re-running this cell is harmless
# because lower-casing an already lower-cased value changes nothing.
refinement_dict_list = change_ref_to_lower(refinement_dict_list)

In [121]:
# Lower-case the product titles so they compare against the (lower-case) keyword phrases.
prod_titles = [title.lower() for title in prod_titles]

### Occurrence DataFrame

In [122]:
from nltk import ngrams

In [123]:
# Build every n-gram (n = 1 .. token count - 1) over the keyword's words
# followed by the inflect plural of each of those words.
ngrams_list = []
final_list_of_keywords = keyword.split() + [p.plural(token) for token in keyword.split()]
for n in range(1, len(final_list_of_keywords)):
    ngrams_list.extend(ngrams(final_list_of_keywords, n))

In [124]:
# Turn each n-gram tuple into a single space-separated phrase.
new_keyword_combinations = [' '.join(gram) for gram in ngrams_list]

In [125]:
# Keep only the phrases that are no longer (in characters) than the original keyword.
# The comprehension variable is deliberately not called `keyword`, so it cannot be
# confused with the module-level `keyword` string.
original_keyword_length = len(keyword)
new_keyword_combinations = [
    phrase
    for phrase in new_keyword_combinations
    if not len(phrase) > original_keyword_length
]

In [126]:
# For every candidate phrase, count the product titles that contain it.
# NOTE(review): `in` is a substring test, so a short phrase also counts titles where it
# appears inside a longer word (e.g. "ring" inside "earring") -- confirm this is intended.
word_frequency = {
    phrase: sum(1 for product_title in prod_titles if phrase in product_title)
    for phrase in new_keyword_combinations
}

In [127]:
# One row per phrase (the phrase is the index); the single unnamed column 0 holds its count.
df = pd.DataFrame.from_dict(word_frequency, orient="index")

In [128]:
# Give the count column a readable name.
df = df.rename(columns= {0:"count"})

In [129]:
# Show only the phrases that occur in at least one product title.
# NOTE(review): the saved output lists phrases such as "sterling silver" that are not part
# of the current keyword, so it looks stale -- re-run the notebook to refresh it.
df[df["count"] != 0]

Unnamed: 0,count
and,37
ring,57
sterling silver,15
set,13
silver,15
wedding,2
wedding ring,2
sterling,15


In [130]:
# Inspection only: heading of the second refinement group (index 1 is hard-coded).
list(refinement_dict_list[1].keys())

['sales & promotions']

In [131]:
# Map refinement heading -> keyword phrase for every phrase that exactly equals one of
# that heading's option labels. Phrases are visited in list order, so when several
# phrases match the same heading the last one visited is the one that is kept.
attrs = {}
for phrase in new_keyword_combinations:
    for ref_dict in refinement_dict_list:
        heading = list(ref_dict)[0]
        if phrase in ref_dict[heading]:
            attrs[heading] = phrase

In [132]:
# Display the heading -> matched phrase mapping built in the previous cell.
attrs

{'metal colors': 'silver',
 'metals': 'sterling silver',
 'ring styles': 'wedding ring sets'}