# Crawl web pages

In [8]:
import requests
import pandas as pd

In [192]:
# Load the seed URLs: one entry per line of the text file.
with open('personalized_gifts.txt') as f:
    raw_listing = f.read()
urls = raw_listing.splitlines()

In [193]:
# Display the list of URLs read from the file.
urls

['http://www.personalizationmall.com/',
 'https://www.uncommongoods.com/gifts/personalized/personalized-gifts',
 'https://www.etsy.com/featured/personalized-holiday-and-christmas-gifts',
 'https://www.thingsremembered.com/personalized-gifts-for-any-occasion',
 'https://www.personalcreations.com/',
 'https://www.shutterfly.com/personalized-gifts',
 'https://www.walmart.com/cp/personalized-gifts/133224',
 'https://www.gifts.com/categories/personalized-gifts/1vV',
 'https://www.agiftpersonalized.com/',
 'http://www.hallmark.com/personalized-gifts/',
 'https://www.amazon.com/Amazon-Custom/b?ie=UTF8&node=11032013011',
 'https://www.lakeside.com/browse/Gift-Ideas-Personalized-Personalized-Gifts/_/N-287k',
 'https://weddingshop.theknot.com/gifts/for-her',
 'http://www.bradfordexchange.com/mcategory/personalized-gifts.html',
 'https://www.papersource.com/personalized_gifts/',
 'https://www.bedbathandbeyond.com/store/category/personalized-gifts/13807/',
 'https://www.ltdcommodities.com/browse/G

In [194]:
# URL of the crawler's page-extraction endpoint (a server on localhost, port 8888).
crawler_endpoint = 'http://localhost:8888/page/extract'

In [195]:
# Form fields for the crawler request: all URLs packed into one comma-separated
# string, plus the extractor name.
# NOTE(review): a URL that itself contains a comma cannot be told apart from a
# separator in this encoding - confirm how the crawler splits the field.
comma_separated_urls = ','.join(urls)
payload = {'urls': comma_separated_urls, 'extractor': 'all_text'}

In [196]:
# POST the form payload to the crawler endpoint and keep the response object.
crawled_urls = requests.post(url=crawler_endpoint, data=payload)
# Fail right here on an HTTP 4xx/5xx status rather than later, when an error
# body would be decoded and processed as if it were crawl results.
crawled_urls.raise_for_status()

In [197]:
# Decode the crawler response body as JSON.
crawled_urls_data = crawled_urls.json()

In [198]:
# Flatten the crawler response into a list of per-page dicts, each tagged with
# the URL it belongs to.
pages = crawled_urls_data['pages']
# Accept either a {url: page} mapping or a sequence of (url, page) pairs.
# Iterating a mapping directly yields only its keys, which would break the
# two-name unpacking in the loop below.
url_page_pairs = pages.items() if isinstance(pages, dict) else pages
urls_data = []
for url, page in url_page_pairs:
    page['url'] = url  # adds the key to the response dict in place
    urls_data.append(page)

In [209]:
# One row per crawled page; columns come from the keys of each page dict.
urls_df = pd.DataFrame(data=urls_data)

In [214]:
# Count rows per distinct value of the 'code' column (presumably the HTTP status - confirm).
urls_df.groupby('code')['code'].count()

code
200.0    48
Name: code, dtype: int64

In [680]:
# Inspect the 'content' value of the second row (position 1).
urls_df['content'].values[1]

u'doctype.vm, start _seo_cssJs_versions.vm, end _seo_cssJs_versions.vm, start _seo_cssJs_versions_app.vm, end _seo_cssJs_versions_app.vm, User-Agent Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 -1, Server name: U03, endeca_category_template.vm, BEGIN component: open-document, EndecaQuery.descriptorDimension 20091217001 [Apparel Product Types, Disney Themes, Doll Accessories, Doll Lines, Doll Themes, Doll Types, Dolls by Gender, Dolls by Material, Dolls by Size, Gifts by Price, Jewelry Product Types, Kinkade Themes, MerchDim, More Ways To Shop, Product Types, Relationships, Styles & Materials, Themes], EndecaQuery.descriptorDimension 11238 $row.descriptor.properties.getValues("categoryDimensions"), EndecaQuery.descriptorDimension P $row.descriptor.properties.getValues("categoryDimensions"), EndecaQuery.descriptorDimension Y $row.descriptor.properties.getValues("categoryDimensions"), true, true, breadcrumbs_list 

# Get top keywords

In [606]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import re
from nltk.stem.porter import PorterStemmer

In [218]:
# Preview only the first rows instead of rendering the whole frame.
urls_df.head(10)

Unnamed: 0,code,content,error,ok,tokens,url
0,200.0,"Begin Static META, Ux 1/ 2 specific code, ALP-...",False,True,"[begin, static, meta, ux, 1, 2, specific, code...",https://www.staples.com/Personalized-Gifts/cat...
1,200.0,"doctype.vm, start _seo_cssJs_versions.vm, end ...",False,True,"[doctype.vm, start, seo_cssjs_versions.vm, end...",http://www.bradfordexchange.com/mcategory/pers...
2,200.0,"Coradiant Include, End of Coradiant Include, S...",False,True,"[coradiant, include, end, of, coradiant, inclu...",https://www.landsend.com/shop/gifts
3,200.0,"emit CSM JS, 18tf9hw, From remote config v3, [...",False,True,"[emit, csm, js, 18tf9hw, from, remote, config,...",https://www.amazon.com/Amazon-Custom/b?ie=UTF8...
4,200.0,"<![endif], <PageMap>\n <DataObject type=""docu...",False,True,"[endif, pagemap, dataobject, type, document, a...",http://www.montblanc.com/en-us/collection/land...
5,200.0,"<![endif], BEGIN GOOGLE UNIVERSAL ANALYTICS CO...",False,True,"[endif, begin, google, universal, analytics, c...",http://www.packersproshop.com/green-bay-packer...
6,200.0,"CentralNotice, Talk:Personalized gifts, From W...",False,True,"[centralnotice, talk, personalized, gifts, fro...",https://en.wikipedia.org/wiki/Talk%3APersonali...
7,200.0,"[if lte IE 9]>\n <link rel=""stylesh...",False,True,"[if, lte, ie, 9, link, rel, stylesheet, type, ...",https://www.bedbathandbeyond.com/store/categor...
8,200.0,"<![endif], Google Tag Manager, Google Tag Mana...",False,True,"[endif, google, tag, manager, google, tag, man...",https://www.iseeme.com/en-us/personalized-gift...
9,200.0,"[if lt IE 9]>\n<script type=""text/javascript"" ...",False,True,"[if, lt, ie, 9, script, type, text/javascript,...",https://www.thegrommet.com/collections/persona...


In [239]:
# Collect the 'tokens' entries of every row whose 'ok' flag is True, as a plain list.
ok_mask = urls_df['ok'] == True
content_tokens = urls_df.loc[ok_mask, 'tokens'].values.tolist()

In [400]:
def is_number(text):
    """Return True if ``text`` can be converted with ``int()``, else False.

    Only integer literals count: a string such as '4.2' is reported as not a
    number because ``int('4.2')`` raises ValueError.

    Only the conversion failures raised by ``int()`` are caught, so
    KeyboardInterrupt, SystemExit and unrelated errors are no longer swallowed.
    """
    try:
        int(text)
        return True
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: malformed literal;
        # OverflowError: non-finite float such as inf.
        return False

In [763]:
# Tokens that is_valid_token rejects outright (exact, case-sensitive match).
# NOTE: 'www' is already rejected by is_valid_token's len(token) > 3 check.
stop_words = {'www', 'https', 'href', 'http'}

In [764]:
def is_valid_token(token):
    """Decide whether ``token`` should be kept.

    A token is rejected when it starts with '<' or '[', ends with '>' or ']',
    has three characters or fewer, is listed in ``stop_words``, or is an
    integer once passed through ``clean_token``. Everything else is accepted.
    """
    if token.startswith('<'):
        return False
    if token.endswith('>'):
        return False
    if token.startswith('['):
        return False
    if token.endswith(']'):
        return False
    if len(token) <= 3:
        return False
    if token in stop_words:
        return False
    return not is_number(clean_token(token))

In [811]:
def clean_token(token):
    """Return ``token`` with each non-word character (and '-') replaced by a single space."""
    non_word_chars = re.compile(r'[\W-]')
    return non_word_chars.sub(' ', token)

In [812]:
def tokenize(text):
    """Split ``text`` into cleaned tokens that pass ``is_valid_token``.

    Pass 1: split on whitespace, keep the valid chunks and replace their
    non-word characters with spaces via ``clean_token``.
    Pass 2: re-split the cleaned text (cleaning can break one chunk into
    several pieces) and keep only the pieces that are themselves valid.
    """
    cleaned_chunks = []
    for chunk in text.split():
        if is_valid_token(chunk) and clean_token(chunk):
            cleaned_chunks.append(clean_token(chunk))

    tokens = []
    for piece in ' '.join(cleaned_chunks).split():
        stripped = piece.strip()
        if piece and stripped and is_valid_token(stripped):
            tokens.append(stripped)
    return tokens

In [798]:
# Porter stemmer instance from NLTK.
stemmer = PorterStemmer()

In [799]:
# Show the stem produced for a sample word.
stemmer.stem('personalized')

u'person'

In [800]:
# Matches runs of two or more word characters between word boundaries; (?u) sets the Unicode flag.
token_pattern = re.compile(r'(?u)\b\w\w+\b')
# Try the pattern on a sample string.
token_pattern.findall('acbc cdf')

['acbc', 'cdf']

In [810]:
# '_' is a word character, so the pattern keeps an underscore-joined string as one token.
token_pattern.findall('acbc_cdf')

['acbc_cdf']

In [801]:
# Exercise tokenize on a string mixing punctuation-heavy text, a plain word and a short fragment.
tokenize('@...s.et, acbc %9')

['acbc']

In [802]:
# Fit a TF-IDF model on the page text using the custom tokenizer. Terms must
# appear in at least 30% and at most 90% of the documents, and the vocabulary
# is capped at 200 entries.
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.3, max_df=0.9, max_features=200)
vectorizer.fit_transform(urls_df['content'])
features = vectorizer.get_feature_names()
# Feature indices ordered by descending IDF weight.
# NOTE(review): a higher IDF means the term occurs in fewer documents, so this
# ranks the rarest retained terms first - confirm that is the intended notion
# of "top keywords".
indices = np.argsort(vectorizer.idf_)[::-1]

top_n = 20
# Single-argument print(...) produces the same output on Python 2 and also runs on Python 3.
print('- Top %d keywords' % top_n)
top_keywords = [(features[i], vectorizer.idf_[i]) for i in indices[:top_n]]
for keyword, score in top_keywords:
    print('%s - %s' % (keyword, score))

- Top 20 keywords
save - 2.15923691048
center - 2.15923691048
summer - 2.15923691048
know - 2.15923691048
sure - 2.15923691048
featured - 2.15923691048
favorite - 2.15923691048
exclusive - 2.15923691048
corporate - 2.15923691048
touch - 2.15923691048
organization - 2.15923691048
candles - 2.15923691048
personalize - 2.15923691048
print - 2.15923691048
body - 2.15923691048
purchase - 2.15923691048
quick - 2.15923691048
apparel - 2.15923691048
guarantee - 2.09861228867
shirts - 2.09861228867


In [803]:
# Size of the retained vocabulary, then the number of terms the vectorizer dropped.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(len(vectorizer.vocabulary_))
print(len(vectorizer.stop_words_))

200
8078


In [804]:
# Check whether two expected keywords were kept in the vocabulary or ended up
# among the terms the vectorizer dropped.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print('personalized' in vectorizer.vocabulary_)
print('gifts' in vectorizer.vocabulary_)
print('personalized' in vectorizer.stop_words_)
print('gifts' in vectorizer.stop_words_)

False
False
True
True


In [823]:
# IPython help lookup (shows the TfidfVectorizer docstring); interactive leftover, not part of the analysis.
TfidfVectorizer?