# Getting Data

### Import libraries

In [190]:
from url_checker import UrlCheck
from bs4 import BeautifulSoup
import re

import inflect
p = inflect.engine()

import spacy

import pandas as pd

from nltk import ngrams

### Sample Keywords Data

In [191]:
# Load the "Jewelry" sheet of the keyword workbook.
# `sheet_name` is the current spelling of the argument; the old `sheetname`
# spelling is rejected by newer pandas releases.
keywords = pd.read_excel("combined.xlsx", sheet_name="Jewelry")
# Character length of every keyword, used below to order rows longest-first.
keywords["keyword_len"] = keywords["Keyword"].apply(len)

keywords = keywords.sort_values(by="keyword_len", ascending=False)
keywords = keywords.reset_index(drop=True)

In [192]:
def build_url_from_keyword(keyword):
    """Build the Overstock search URL for ``keyword``.

    Spaces become ``+`` (as before); every other character that is not safe
    inside a query string (``&``, ``#``, ``+``, ``'`` ...) is percent-encoded
    so it cannot truncate or alter the query.

    Parameters
    ----------
    keyword : str
        Free-text search phrase.

    Returns
    -------
    str
        The full search URL.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    base_url = "https://www.overstock.com/search?keywords="
    return base_url + quote_plus(keyword)

In [193]:
def get_soup(url):
    """Request ``url`` via ``UrlCheck`` and parse the response text with lxml."""
    response = UrlCheck().get_request(url)
    markup = response.text
    return BeautifulSoup(markup, 'lxml')

In [194]:
def get_prod_titles(soup):
    """Return the text of every ``div`` with class ``product-title`` in ``soup``."""
    title_nodes = soup.findAll("div", {"class": "product-title"})
    return [node.get_text() for node in title_nodes]

In [195]:
def get_refinement_group(soup):
    """Return every ``div`` with class ``refinement-group`` in ``soup`` as a plain list.

    The elements themselves (not their text) are returned.
    """
    return list(soup.findAll("div", {"class": "refinement-group"}))

In [196]:
def get_refinement(refinement_group):
    """Extract one refinement group as ``{heading: [option, ...]}``.

    The heading is the text of the group's ``h3.refinement-heading``; the
    options are the texts of its ``li.refinement-item`` elements with the
    parenthesised result count removed.

    Parameters
    ----------
    refinement_group : element
        One element as returned by ``get_refinement_group``; it must support
        ``find`` and ``findAll``.

    Returns
    -------
    dict
        A single-entry dict mapping the heading text to the list of cleaned
        option strings.
    """
    heading = refinement_group.find("h3", {"class": "refinement-heading"}).get_text()
    items = refinement_group.findAll("li", {"class": "refinement-item"})

    # One raw-string pattern covers "(60)", "(1,234)" and "(12,345)"; the two
    # original patterns missed counts with more than one digit before the comma.
    count_pattern = re.compile(r"\([\d,]*\)")

    # Strip *after* removing the count so "Name (60)" does not keep a trailing space.
    refs = [count_pattern.sub("", item.get_text()).strip() for item in items]
    return {heading: refs}

In [197]:
# TODO: get rid of these entries: ['Jewelry & Watch Store ', 'Worldstock Fair Trade ', 'Holiday ', 'Pet Supply Store ', 'Main Street Revolution ']

In [198]:
# Hard-coded sample query used for the rest of the notebook
# (alternative: keywords["Keyword"][67]).
keyword = (
    "safavieh casual natural fiber natural and beige border seagrass rug"
)

print(keyword)

safavieh casual natural fiber natural and beige border seagrass rug


In [199]:
# Fetch the search-results page for the sample keyword and collect its product titles.
url = build_url_from_keyword(keyword)  #sterling silver wedding ring sets for him and her
soup = get_soup(url)
prod_titles = get_prod_titles(soup)

In [200]:
# All refinement-group elements found on the fetched page.
group = get_refinement_group(soup)

In [201]:
# One {heading: [options]} dict per refinement group, in page order.
refinement_dict_list = [get_refinement(g) for g in group]

In [202]:
keyword

'safavieh casual natural fiber natural and beige border seagrass rug'

### Modify input and output

In [203]:
def change_ref_to_lower(refinement_dict_list):
    """Return a lower-cased copy of a list of ``{title: [values]}`` dicts.

    Both the titles and every value are lower-cased; the input list and its
    dicts are left unmodified.
    """
    return [
        {
            title.lower(): [value.lower() for value in values]
            for title, values in ref_dict.items()
        }
        for ref_dict in refinement_dict_list
    ]

In [204]:
# Lower-case refinement headings and options so they compare equal to the lower-case keyword.
refinement_dict_list = change_ref_to_lower(refinement_dict_list)

In [205]:
# Lower-case the product titles for case-insensitive substring checks below.
prod_titles = [title.lower() for title in prod_titles]

In [206]:
refinement_dict_list[6:7]

[{'weave type': ['machine-made', 'power-loomed', 'hand-woven', 'handmade']}]

In [207]:
prod_titles[:5]

["safavieh natural fiber seagrass rug (5' x 8')",
 "safavieh casual natural fiber natural and beige border seagrass rug (8' x 10')",
 "safavieh handwoven natural beige seagrass area rug (9' x 12')",
 "safavieh casual natural fiber natural and beige border seagrass rug (6' x 9')",
 "safavieh casual natural fiber natural and beige border seagrass rug (4' x 6')"]

# Keyword Stats

### Occurrence Df

In [208]:
# Keyword tokens followed by their plural forms, then every n-gram of that
# combined token list for n = 1 .. len - 1.
final_list_of_keywords = keyword.split() + [p.plural(word) for word in keyword.split()]
ngrams_list = [
    gram
    for n in range(1, len(final_list_of_keywords))
    for gram in ngrams(final_list_of_keywords, n)
]

In [209]:
# Turn each n-gram tuple into a space-separated phrase.
new_keyword_combinations = [' '.join(words) for words in ngrams_list]

In [210]:
# Drop candidate phrases that are longer (in characters) than the original keyword.
original_keyword_length = len(keyword)
new_keyword_combinations = [
    combo
    for combo in new_keyword_combinations
    if len(combo) <= original_keyword_length
]

In [211]:
# For every candidate phrase, count the product titles that contain it as a substring.
word_frequency = {
    phrase: sum(1 for title in prod_titles if phrase in title)
    for phrase in new_keyword_combinations
}

In [212]:
# One row per phrase (the phrase is the index); the single column holds its title count.
df = pd.DataFrame.from_dict(word_frequency, orient="index")

In [213]:
# Give the default column label 0 a readable name.
df = df.rename(columns= {0:"count"})

In [214]:
# Keep only the phrases whose count is non-zero, i.e. that occur in at least one title.
imp_words_df = df[df["count"] != 0]

In [215]:
# Map refinement heading -> keyword phrase that exactly equals one of its options.
# Each dict is read through its first key/value pair; a later matching phrase
# overwrites an earlier one for the same heading.
attrs = {}
for word in new_keyword_combinations:
    for d in refinement_dict_list:
        options = list(d.values())[0]
        if word not in options:
            continue
        key_name = list(d.keys())[0]
        attrs[key_name] = word

In [216]:
attrs

{'brands': 'safavieh',
 'colors': 'beige',
 'materials': 'natural fiber',
 'patterns': 'border',
 'styles': 'casual'}

In [217]:
list(imp_words_df.index)

['natural fiber natural',
 'casual natural fiber natural and beige border',
 'and',
 'natural fiber natural and beige border',
 'safavieh casual natural',
 'fiber natural and beige border',
 'fiber natural and beige border seagrass rug',
 'natural and',
 'safavieh casual natural fiber natural and beige border',
 'fiber natural and beige',
 'natural fiber natural and beige',
 'safavieh casual natural fiber natural and beige',
 'and beige border',
 'safavieh',
 'safavieh casual natural fiber',
 'and beige',
 'natural fiber',
 'border seagrass rug',
 'natural and beige border',
 'casual natural fiber natural and beige border seagrass rug',
 'fiber natural',
 'beige border seagrass rug',
 'rug',
 'casual natural fiber',
 'casual natural fiber natural',
 'fiber natural and',
 'natural fiber natural and beige border seagrass rug',
 'beige border seagrass',
 'fiber',
 'seagrass',
 'casual natural fiber natural and',
 'safavieh casual',
 'safavieh casual natural fiber natural',
 'casual natura

In [218]:
def modify_prod_titles(prod_title_list, word_list, verbose=True):
    """Reduce each product title to the listed phrases it contains.

    Every phrase in ``word_list`` is matched as a whole-word alternative
    (``\\b...\\b``) against each title; the matches found, in order of
    appearance, are joined with single spaces.

    Parameters
    ----------
    prod_title_list : list of str
        Product titles to reduce.
    word_list : list of str
        Phrases to keep. They are matched literally: regex metacharacters in
        a phrase are escaped.
    verbose : bool, default True
        When true, print ``<reduced> ######## <original>`` for every title
        (the original behaviour). Pass ``False`` to silence the output.

    Returns
    -------
    list of str
        One reduced title per input title. With an empty ``word_list`` every
        reduced title is the empty string.
    """
    # Escape each phrase so characters such as ".", "+" or "(" in a keyword
    # are matched literally instead of being interpreted as regex syntax.
    alternatives = ["\\b" + re.escape(word) + "\\b" for word in word_list]
    # An empty alternation would match the empty string at every position,
    # so skip matching altogether when there is nothing to look for.
    pattern = re.compile("|".join(alternatives)) if alternatives else None

    new_prod_title_list = []
    for prod in prod_title_list:
        temp_list = pattern.findall(prod) if pattern is not None else []
        reduced = " ".join(temp_list)
        if verbose:
            print(reduced, "########", prod)
        new_prod_title_list.append(reduced)

    return new_prod_title_list

In [219]:
# Reduce every product title to the phrases listed in imp_words_df's index.
prod_titles_modified = modify_prod_titles(prod_titles, list(imp_words_df.index))

safavieh natural fiber seagrass rug ######## safavieh natural fiber seagrass rug (5' x 8')
safavieh casual natural fiber natural and beige border seagrass rug ######## safavieh casual natural fiber natural and beige border seagrass rug (8' x 10')
safavieh natural beige seagrass rug ######## safavieh handwoven natural beige seagrass area rug (9' x 12')
safavieh casual natural fiber natural and beige border seagrass rug ######## safavieh casual natural fiber natural and beige border seagrass rug (6' x 9')
safavieh casual natural fiber natural and beige border seagrass rug ######## safavieh casual natural fiber natural and beige border seagrass rug (4' x 6')
safavieh casual natural fiber natural and beige border seagrass rug ######## safavieh casual natural fiber natural and beige border seagrass rug (3' x 5')
safavieh casual natural fiber natural beige seagrass ######## safavieh casual natural fiber hand-woven sisal natural / beige seagrass runner (2'6 x 10')
safavieh casual natural fibe

In [220]:
prod_titles_modified

['safavieh natural fiber seagrass rug',
 'safavieh casual natural fiber natural and beige border seagrass rug',
 'safavieh natural beige seagrass rug',
 'safavieh casual natural fiber natural and beige border seagrass rug',
 'safavieh casual natural fiber natural and beige border seagrass rug',
 'safavieh casual natural fiber natural and beige border seagrass rug',
 'safavieh casual natural fiber natural beige seagrass',
 'safavieh casual natural fiber natural beige seagrass rug',
 'safavieh casual natural fiber natural and beige border seagrass',
 'safavieh casual natural fiber natural beige seagrass rug',
 'safavieh casual natural fiber natural and beige border seagrass rug',
 'safavieh casual natural fiber natural beige seagrass',
 'safavieh casual natural fiber natural beige seagrass rug',
 'safavieh casual natural fiber natural beige seagrass',
 'safavieh casual natural fiber natural and beige border seagrass rug',
 'safavieh casual natural fiber natural and border seagrass rug',


In [221]:
# Character length of each distinct reduced title.
word_len_list = [len(item) for item in set(prod_titles_modified)]

In [222]:
# Build both columns from ONE materialised list of distinct titles, so each
# length is guaranteed to belong to the word in the same row (previously the
# two columns came from two separate set iterations).
unique_mod_titles = list(set(prod_titles_modified))
mod_prod_title_df = pd.DataFrame({
    "word": unique_mod_titles,
    "length": [len(title) for title in unique_mod_titles],
})

In [223]:
mod_prod_title_df.sort_values(by= "length", ascending=False)

Unnamed: 0,length,word
10,67,safavieh casual natural fiber natural and beig...
15,63,safavieh casual natural fiber natural and beig...
6,61,safavieh casual natural fiber natural and bord...
0,56,safavieh casual natural fiber natural beige se...
4,52,safavieh casual natural fiber natural beige se...
12,50,safavieh casual natural fiber natural seagrass...
8,47,safavieh casual natural fiber natural beige rug
1,46,safavieh casual natural fiber natural seagrass
3,44,safavieh casual natural fiber and border rug
13,42,safavieh casual natural fiber seagrass rug


In [226]:
mod_prod_title_df["word"]

0     safavieh casual natural fiber natural beige se...
1        safavieh casual natural fiber natural seagrass
2                 safavieh casual natural fiber natural
3          safavieh casual natural fiber and border rug
4     safavieh casual natural fiber natural beige se...
5                   safavieh natural fiber seagrass rug
6     safavieh casual natural fiber natural and bord...
7                     safavieh casual natural fiber rug
8       safavieh casual natural fiber natural beige rug
9                         safavieh casual natural fiber
10    safavieh casual natural fiber natural and beig...
11                  safavieh natural beige seagrass rug
12    safavieh casual natural fiber natural seagrass...
13           safavieh casual natural fiber seagrass rug
14              safavieh casual natural fiber beige rug
15    safavieh casual natural fiber natural and beig...
16            safavieh casual natural fiber natural rug
Name: word, dtype: object

In [225]:
keyword

'safavieh casual natural fiber natural and beige border seagrass rug'

In [230]:
imp_words_df.sort_values(by = "count", ascending=False)

Unnamed: 0,count
natural,60
safavieh,60
natural fiber,59
fiber,59
casual natural fiber,58
safavieh casual natural,58
casual,58
casual natural,58
safavieh casual natural fiber,58
safavieh casual,58
