In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

# utility functions

In [2]:
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return the raw response body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to `requests.get`.
        Defaults to 30 so a stalled connection cannot hang the notebook
        indefinitely (requests waits forever when no timeout is given).

    Returns
    -------
    bytes or None
        `resp.content` (undecoded bytes) when `is_good_response` accepts the
        response. None when it rejects the response, or when the request
        raises a `RequestException`; in that case the error is reported
        through `log_error`.
    """
    try:
        # closing() releases the streamed connection even on an early return.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Decide whether `resp` looks like a successful HTML response.

    Parameters
    ----------
    resp : object
        Response-like object exposing `status_code` and a mapping `headers`.

    Returns
    -------
    bool
        True when the status code is 200 and the Content-Type header
        contains 'html' (case-insensitive); False otherwise, including
        when the header is absent.
    """
    # .get() instead of [...]: a response without a Content-Type header is
    # simply "not HTML" rather than a KeyError. `or ''` also covers a None value.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error message.

    The current implementation writes the string form of `e` to standard
    output; swap the body for a real logger if more is needed.
    """
    print(str(e))

## test

In [15]:
# Download the seller's listing page (raw bytes).
raw_html = simple_get('https://www.ebay.com/sch/shellygallayoga/m.html?_nkw=&_armrs=1&_ipg=&_from')

# simple_get returns None on any failure; stop here with a clear message
# instead of letting the parser fail later with an unrelated TypeError.
if raw_html is None:
    raise RuntimeError('Could not download an HTML listing page from eBay.')

In [16]:
# Parse the downloaded listing page with Python's built-in HTML parser.
html = BeautifulSoup(markup=raw_html, features='html.parser')

In [21]:
# Sanity check: show only the start of the page. Printing the whole document
# floods the notebook output and bloats the saved file.
print(str(html)[:2000])

<!DOCTYPE html>

<html class="no-touch" lang="en">
<head>
<link href="//i.ebayimg.com" rel="dns-prefetch"/><link href="//thumbs.ebaystatic.com" rel="dns-prefetch"/><link href="//thumbs1.ebaystatic.com" rel="dns-prefetch"/><link href="//thumbs2.ebaystatic.com" rel="dns-prefetch"/><link href="//thumbs3.ebaystatic.com" rel="dns-prefetch"/><link href="//thumbs4.ebaystatic.com" rel="dns-prefetch"/><link href="//vi.vipr.ebaydesc.com" rel="dns-prefetch"/><link href="//p.ebaystatic.com" rel="dns-prefetch"/><link href="//q.ebaystatic.com" rel="dns-prefetch"/><link href="//pics.ebaystatic.com" rel="dns-prefetch"/><link href="//ir.ebaystatic.com" rel="dns-prefetch"/><link href="//tags.bluekai.com" rel="dns-prefetch"/><link href="//srx.main.ebayrtm.com" rel="dns-prefetch"/><link href="//rover.ebay.com" rel="dns-prefetch"/><link href="//reco.ebay.com" rel="dns-prefetch"/><link href="//ads.ebay.com" rel="dns-prefetch"/><link href="//svcs.ebay.com" rel="dns-prefetch"/><!-- Sync with the domain names 

## download item info and save into a dataframe

In [18]:
# Empty results table. The columns are given as a list, not a set: a set has
# no defined order (and recent pandas rejects it), while later cells address
# columns by position. This order matches the table shown in the outputs.
items = pd.DataFrame(columns=['url','Pictures','Price','eBay_item_number','Buy_it_now_price','Product_title'])

In [19]:
def _parse_price(price_text):
    """Return the first number found in `price_text` as a float, e.g. 'US $1,234.50' -> 1234.5."""
    match = re.search(r'\d[\d,]*(?:\.\d+)?', str(price_text))
    if match is None:
        raise ValueError('No price found in {0!r}'.format(price_text))
    # Drop thousands separators, which float() cannot parse.
    return float(match.group(0).replace(',', ''))


# Collect one dict per listing and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic.
item_records = []

T = html.find_all("h3", {"class": 'lvtitle'})
for i, listing_title in enumerate(T):
    url = listing_title('a')[0]['href']

    item_raw = simple_get(url)
    if item_raw is None:
        # simple_get returns None on failure; skip this listing instead of
        # crashing inside the parser.
        log_error('Skipping item {0}: could not download {1}'.format(i, url))
        continue
    item_html = BeautifulSoup(item_raw, 'html.parser')

    print(i)

    # picture, product_title, eBay_item_number, price and buy_it_now_price for this listing
    item_info = dict()

    # NOTE(review): index 2 is taken as the main product image - confirm against the page layout.
    item_info['Pictures'] = item_html.find_all("img", {"class": "img"})[2]['src']
    # str(): a NavigableString keeps a reference to its whole parse tree.
    item_info['Product_title'] = str(item_html('title')[0].contents[0])
    item_info['eBay_item_number'] = str(item_html.find_all("div", {'class': "u-flL iti-act-num itm-num-txt"})[0].contents[0])

    P = item_html.find_all("span", {'class': "notranslate"})

    item_info['Price'] = _parse_price(P[0].contents[0])

    # NOTE(review): exactly three matching spans is treated as "a Buy-It-Now price is present" - confirm.
    if len(P) == 3:
        item_info['Buy_it_now_price'] = _parse_price(P[1].contents[0])

    item_info['url'] = url

    item_records.append(item_info)

# Explicit column order: later cells address some columns by position.
# Keys missing from a record (e.g. Buy_it_now_price) become NaN.
items = pd.DataFrame(
    item_records,
    columns=['url', 'Pictures', 'Price', 'eBay_item_number', 'Buy_it_now_price', 'Product_title'],
)

In [14]:
# Display the scraped listings table as this cell's output.
items

Unnamed: 0,url,Pictures,Price,eBay_item_number,Buy_it_now_price,Product_title


## Post-process results: derive each item's new/used condition from its product title

In [106]:
# True when the title contains the standalone token "NEW" (split on spaces and parentheses).
# Raw string: '\(' and '\)' are invalid escapes in a normal string literal.
items.loc[:,'New'] = items.Product_title.apply(lambda x: 'NEW' in re.split(r' |\(|\)', x.upper()))

In [107]:
# True when the title contains the standalone token "USED" (split on spaces and parentheses).
# Raw string: '\(' and '\)' are invalid escapes in a normal string literal.
items.loc[:,'Used'] = items.Product_title.apply(lambda x: 'USED' in re.split(r' |\(|\)', x.upper()))

In [108]:
# One label per row: 'New' wins over 'Used'; 'NA' when neither flag is set.
# The flag columns are read by name rather than by position (6 and 7), so
# the result does not depend on how many columns precede them.
condition = [
    'New' if is_new else ('Used' if is_used else 'NA')
    for is_new, is_used in zip(items['New'], items['Used'])
]

In [109]:
# Store the labels held in `condition` as the 'condition' column of items
# (expects one label per row).
items.loc[:,'condition'] = condition

In [110]:
# Display the table again, now including the New / Used / condition columns.
items

Unnamed: 0,url,Pictures,Price,eBay_item_number,Buy_it_now_price,Product_title,New,Used,condition
0,https://www.ebay.com/itm/Microsoft-Surface-PRO...,https://i.ebayimg.com/images/g/FjoAAOSwzcNc3Jt...,75.0,123769421747,,Microsoft Surface PRO4/PRO3 Docking Station (P...,False,False,
1,https://www.ebay.com/itm/Dell-D6000-USB-3-0-UH...,https://i.ebayimg.com/images/g/~nwAAOSweNxc3Jl...,80.0,123769418872,,Dell D6000 USB 3.0 UHD 4k Universal Docking St...,False,False,


## save result

In [113]:
# Columns written to the CSV, in output order.
export_columns = ['Product_title','eBay_item_number','condition','Price','Buy_it_now_price','url']

# Replace missing values with empty strings, then write the file without the index.
export_frame = items[export_columns].fillna('')
export_frame.to_csv('gallagher_active.csv', index=False, encoding='utf-8')

In [112]:
# Download each listing's picture to pic_<n>.jpg.
# The URL is read from the 'Pictures' column by name: positional index 1 only
# points at that column for one particular column order.
for i, picture_url in enumerate(items['Pictures']):
    r = get(picture_url, allow_redirects=True)
    if r.status_code != 200:
        # Do not save an error page under a .jpg name.
        log_error('Skipping picture {0}: HTTP {1} for {2}'.format(i, r.status_code, picture_url))
        continue
    # NOTE(review): the +2 offset presumably lines file numbers up with spreadsheet rows - confirm.
    # `with` closes the file handle even if the write fails.
    with open('pic_'+str(i+2)+'.jpg', 'wb') as picture_file:
        picture_file.write(r.content)