# Getting Data

### Import libraries

In [232]:
from url_checker import UrlCheck
from bs4 import BeautifulSoup
import re

import inflect
p = inflect.engine()

import spacy

import pandas as pd

from nltk import ngrams

### Sample Keywords Data

In [233]:
# Load the "Jewelry" sheet and order the keywords from longest to shortest.
# `sheet_name` is the supported spelling; the older `sheetname` keyword was
# deprecated by pandas and later removed, so it fails on current versions.
keywords = pd.read_excel("combined.xlsx", sheet_name="Jewelry")
keywords["keyword_len"] = keywords["Keyword"].apply(len)

# Longest keywords first, then renumber rows 0..n-1 so positional labels are stable.
keywords = keywords.sort_values(by="keyword_len", ascending=False)
keywords = keywords.reset_index(drop=True)

In [234]:
def build_url_from_keyword(keyword):
    """Build the Overstock search URL for a keyword phrase.

    Spaces are turned into ``+`` exactly as before. Any other character that
    is not safe inside a query string (``&``, ``#``, ``+``, ``%``, non-ASCII
    text, ...) is percent-encoded so it cannot truncate or alter the query.

    Parameters
    ----------
    keyword : str
        The search phrase, e.g. ``"engagement rings"``.

    Returns
    -------
    str
        The full search URL.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from urllib.parse import quote_plus

    base_url = "https://www.overstock.com/search?keywords="
    return base_url + quote_plus(keyword)

In [235]:
def get_soup(url):
    """Request ``url`` through ``UrlCheck`` and parse the response text with lxml."""
    response = UrlCheck().get_request(url)
    markup = response.text
    return BeautifulSoup(markup, 'lxml')

In [236]:
def get_prod_titles(soup):
    """Return the text of every ``div`` with class ``product-title`` in ``soup``."""
    title_nodes = soup.findAll("div", { "class" : "product-title"})
    return [node.get_text() for node in title_nodes]

In [237]:
def get_refinement_group(soup):
    """Return a list of every ``div`` with class ``refinement-group`` in ``soup``.

    The elements themselves are returned (not their text), so callers can
    search inside each group.
    """
    return list(soup.findAll("div",{"class":"refinement-group"}))

In [238]:
def get_refinement(refinement_group):
    """Extract one refinement group as ``{heading: [option, ...]}``.

    The heading is the text of the group's ``h3.refinement-heading`` element
    and the options are the texts of its ``li.refinement-item`` elements with
    the parenthesised result count removed.

    Parameters
    ----------
    refinement_group
        An element exposing ``find`` / ``findAll`` (e.g. a BeautifulSoup tag).

    Returns
    -------
    dict
        A single-entry dict mapping the heading text to the list of options.
    """
    refinements = refinement_group.findAll("li",{"class":"refinement-item"})
    heading = refinement_group.find("h3", {"class":"refinement-heading"}).get_text()

    # One raw-string pattern covers "(56)", "(1,234)" and longer grouped
    # counts such as "(12,345)", which the previous pair of patterns missed.
    count_pattern = re.compile(r"\(\d*(?:,\d*)*\)")

    # Strip *after* removing the count so "Round (12)" becomes "Round",
    # not "Round " with a dangling space.
    refs = [count_pattern.sub("", item.get_text()).strip() for item in refinements]
    return {heading: refs}

In [239]:
# TODO: get rid of these refinement values: ['Jewelry & Watch Store ', 'Worldstock Fair Trade ', 'Holiday ', 'Pet Supply Store ', 'Main Street Revolution ']

In [240]:
# Pick the sample keyword stored at row label 69 of the length-sorted table.
keyword = keywords.loc[69, "Keyword"]

print(keyword)

engagement rings with sapphire side stones


In [241]:
# Fetch the search-results page for the chosen keyword and pull out its product titles.
url = build_url_from_keyword(keyword)  #sterling silver wedding ring sets for him and her
soup = get_soup(url)
prod_titles = get_prod_titles(soup)

In [242]:
# Collect every refinement-group element from the fetched page.
group = get_refinement_group(soup)

In [243]:
# One {heading: [options]} dict per refinement group, in page order.
refinement_dict_list = [get_refinement(g) for g in group]

In [244]:
# Show the keyword under analysis.
keyword

'engagement rings with sapphire side stones'

### Modify input and output

In [245]:
def change_ref_to_lower(refinement_dict_list):
    """Return a lower-cased copy of a list of ``{heading: [values]}`` dicts.

    Both the headings (keys) and every value string are lower-cased; the
    input list and its dicts are left untouched.
    """
    return [
        {
            heading.lower(): [option.lower() for option in options]
            for heading, options in group.items()
        }
        for group in refinement_dict_list
    ]

In [246]:
# Lower-case all refinement headings and values (re-running is harmless: lower-casing twice changes nothing).
refinement_dict_list = change_ref_to_lower(refinement_dict_list)

In [247]:
# Lower-case every product title so substring checks against the keyword are case-insensitive.
prod_titles = list(map(str.lower, prod_titles))

In [248]:
# Inspect a single refinement group (index 6) after lower-casing.
refinement_dict_list[6:7]

[{'stone shapes': ['round',
   'princess',
   'cushion',
   'oval',
   'baguette',
   'emerald',
   'marquise',
   'radiant',
   'heart',
   'pear',
   'asscher',
   'trillion']}]

In [249]:
# Peek at the first five product titles.
prod_titles[:5]

['miadora 10k white gold created white sapphire and diamond 3-stone engagement ring',
 'simon frank cushion-cut rhodium-overlay cubic zirconia ring',
 'miadora sterling silver cushion and round-cut created white sapphire halo 3-piece bridal ring set',
 'miadora sterling silver sapphire and diamond accent halo cocktail ring',
 '14k white gold 3/4 ct tdw halo diamond split engagement ring']

# Keyword Stats

### Occurrence Df

In [250]:
# Candidate tokens: the keyword's words followed by inflect's plural of each word.
keyword_tokens = keyword.split()
final_list_of_keywords = keyword_tokens + [p.plural(word) for word in keyword_tokens]

# Collect every n-gram of sizes 1 .. len(tokens) - 1 over that combined list.
ngrams_list = []
for n in range(1, len(final_list_of_keywords)):
    ngrams_list.extend(ngrams(final_list_of_keywords, n))

In [251]:
# Turn each n-gram tuple into a space-separated phrase.
new_keyword_combinations = [' '.join(words) for words in ngrams_list]

In [252]:
# Keep only phrases that are no longer (in characters) than the original keyword.
original_keyword_length = len(keyword)
new_keyword_combinations = [
    phrase
    for phrase in new_keyword_combinations
    if not len(phrase) > original_keyword_length
]

In [253]:
# For each candidate phrase, count the product titles that contain it as a substring.
word_frequency = {
    word: sum(1 for prod_title in prod_titles if word in prod_title)
    for word in new_keyword_combinations
}

In [254]:
# One row per candidate phrase (the index); the single unnamed column 0 holds its title count.
df = pd.DataFrame.from_dict(word_frequency, orient="index")

In [255]:
# Give the count column a readable name.
df = df.rename(columns= {0:"count"})

In [256]:
# Keep only the phrases that appear in at least one product title.
imp_words_df = df[df["count"] != 0]

In [257]:
# Map refinement heading -> keyword phrase for every phrase that exactly equals
# one of that group's options. A later match for the same heading overwrites an earlier one.
attrs = {}
for phrase in new_keyword_combinations:
    for ref_group in refinement_dict_list:
        # Only the first (heading, options) entry of each dict is consulted.
        heading, options = list(ref_group.items())[0]
        if phrase in options:
            attrs[heading] = phrase

In [258]:
# Show which refinement headings matched a keyword phrase.
attrs

{'ring styles': 'engagement', 'stones': 'stone'}

In [259]:
# Keyword phrases found in at least one product title.
list(imp_words_df.index)

['sapphire', 'ring', 'side', 'engagement', 'stone', 'with']

In [260]:
def modify_prod_titles(prod_title_list, word_list, verbose=True):
    """Reduce each product title to the whole-word matches of ``word_list``.

    For every title, the words/phrases from ``word_list`` that occur in it as
    whole words are collected in order of appearance and joined with single
    spaces.

    Parameters
    ----------
    prod_title_list : iterable of str
        Product titles to reduce.
    word_list : iterable of str
        Words or phrases to look for. They are matched literally: regex
        metacharacters such as ``.``, ``+`` or ``(`` are escaped.
    verbose : bool, default True
        When true, print ``"<reduced> ######## <title>"`` for each title
        (the original debugging output).

    Returns
    -------
    list of str
        One reduced string per input title; ``""`` where nothing matched or
        when ``word_list`` contains no usable words.
    """
    # Empty words would produce a pattern that matches at every boundary.
    words = [word for word in word_list if word]

    # Escape each word so it is matched as text, and compile the alternation once.
    pattern = None
    if words:
        pattern = re.compile(
            "|".join(r"\b" + re.escape(word) + r"\b" for word in words)
        )

    new_prod_title_list = []
    for prod in prod_title_list:
        matches = pattern.findall(prod) if pattern is not None else []
        reduced = " ".join(matches)
        if verbose:
            print(reduced,"########", prod)
        new_prod_title_list.append(reduced)

    return new_prod_title_list

In [261]:
# Reduce every product title to just the keyword phrases it contains.
prod_titles_modified = modify_prod_titles(prod_titles, list(imp_words_df.index))

sapphire stone engagement ring ######## miadora 10k white gold created white sapphire and diamond 3-stone engagement ring
ring ######## simon frank cushion-cut rhodium-overlay cubic zirconia ring
sapphire ring ######## miadora sterling silver cushion and round-cut created white sapphire halo 3-piece bridal ring set
sapphire ring ######## miadora sterling silver sapphire and diamond accent halo cocktail ring
engagement ring ######## 14k white gold 3/4 ct tdw halo diamond split engagement ring
ring ######## miadora sterling silver octagon-cut green amethyst and white topaz split shank cocktail ring
sapphire ring ######## miadora 10k white gold created white sapphire bridal ring set
sapphire stone engagement ring ######## miadora 10k rose gold morganite and created white sapphire diamond accent 3-stone engagement ring (g-h, i1-i2)
ring ######## sterling silver cubic zirconia wedding ring
with side engagement ring ######## charles & colvard 14k white gold 1 3/4ct dew forever one near-color

In [262]:
from collections import Counter

# Tally how many product titles reduce to each distinct matched-word string.
Counter(prod_titles_modified)

Counter({'': 3,
         'engagement ring': 12,
         'ring': 11,
         'sapphire engagement ring': 3,
         'sapphire ring': 11,
         'sapphire stone engagement ring': 7,
         'sapphire stone ring': 5,
         'stone engagement ring': 2,
         'stone ring': 2,
         'stone sapphire engagement ring': 1,
         'with sapphire engagement ring': 1,
         'with side engagement ring': 2})

In [263]:
# Character length of each distinct reduced title, in set-iteration order.
word_len_list = [len(item) for item in set(prod_titles_modified)]

In [264]:
# Build the table of distinct reduced titles and their character lengths.
# Both columns are derived from the same list, so each length is guaranteed to
# belong to the word on its row. This avoids relying on a length list built in
# another cell from a separately constructed set.
unique_mod_titles = list(set(prod_titles_modified))
mod_prod_title_df = pd.DataFrame({
    "word": unique_mod_titles,
    "length": [len(title) for title in unique_mod_titles],
})

In [265]:
# Longest reduced titles first.
mod_prod_title_df.sort_values(by= "length", ascending=False)

Unnamed: 0,length,word
0,30,sapphire stone engagement ring
6,30,stone sapphire engagement ring
5,29,with sapphire engagement ring
1,25,with side engagement ring
4,24,sapphire engagement ring
8,21,stone engagement ring
3,19,sapphire stone ring
11,15,engagement ring
10,13,sapphire ring
7,10,stone ring


In [266]:
# The distinct reduced titles on their own.
mod_prod_title_df["word"]

0     sapphire stone engagement ring
1          with side engagement ring
2                                   
3                sapphire stone ring
4           sapphire engagement ring
5      with sapphire engagement ring
6     stone sapphire engagement ring
7                         stone ring
8              stone engagement ring
9                               ring
10                     sapphire ring
11                   engagement ring
Name: word, dtype: object

In [267]:
# Show the keyword under analysis again.
keyword

'engagement rings with sapphire side stones'

In [268]:
# Keyword phrases that occur in at least one product title, most frequent first.
imp_words_df.sort_values(by = "count", ascending=False)

Unnamed: 0,count
ring,57
sapphire,28
engagement,28
stone,18
with,3
side,2
