In [2]:
"""
remove all whitespace(including \n and \t) of string,
just keep one space between the parts originally seperated by space
"""
def removeSpace(input_str):
    input_str = str(input_str)
    input_str.split()
    return ' '.join(input_str.split())

In [67]:
from pprint import pprint

import pickle
import json

import pandas as pd

import requests
from bs4 import BeautifulSoup

In [3]:
base_url = "http://www.edmunds.com"
year = "2015"

new_cars = base_url + "/new-cars/"
response = requests.get(new_cars)
html = response.content
car_make_soup = BeautifulSoup(html, 'lxml')

In [6]:
ss = car_make_soup.findAll('div', attrs = {'class': 'content grid-138'})[1]
s = ss.findAll('a')

makes = []
for item in s:
    m = str(item['href'].split('/')[1])
    makes.append(m)
    
urls = []
for m in makes:
    s = base_url + "/" + m
    urls.append(s)


crawl_list = []
car_list = []
for test in urls:
    model_response = requests.get(test)
    model_html = model_response.content
    model_soup = BeautifulSoup(model_html, 'lxml')

    s = model_soup.findAll('a', attrs = {'class': 'canonicalLink'})

    for item in s:
        temp = item['href']
        
        year_response = requests.get(base_url + temp)
        year_html = year_response.content
        year_soup = BeautifulSoup(year_html, 'lxml')
        
        for div in year_soup.findAll('div', class_='vehicle-card new-vm sh-none'):
            car_list.append(div.find('div', class_='info1').find('a')['href'])
            
with open('saved-data/car_list_url.pickle', 'w') as f:
    pickle.dump(car_list, f)

In [17]:
car_list = []
with open('saved-data/car_list_url.pickle', 'r') as f:
    car_url = pickle.load(f)
    
for car in car_url:
    temp = filter(None, car.split('/'))
    info = {}
    try:
        info['make'] = temp[0]
        info['model'] = temp[1]
        info['year'] = temp[2]
        info['type'] = temp[3]
        car_list.append(info)
    except:
        print temp

['cadillac', 'ats-coupe', '2015']
['cadillac', 'ats-coupe', '2016']
['cadillac', 'cts-coupe', '2014']
['cadillac', 'cts-v-coupe', '2015']
['ford', 'transit-van', '2015']
['ford', 'transit-van', '2016']
['hyundai', 'genesis-coupe', '2015']
['hyundai', 'genesis-coupe', '2016']
['infiniti', 'q60-convertible', '2015']
['infiniti', 'q60-coupe', '2015']
['maserati', 'granturismo-convertible', '2015']
['maserati', 'granturismo-convertible', '2016']
['mclaren', '650s-coupe', '2015']
['ram', 'promaster-cargo-van', '2015']
['ram', 'promaster-cargo-van', '2016']
['ram', 'promaster-window-van', '2015']
['ram', 'promaster-window-van', '2016']
['rolls-royce', 'phantom-coupe', '2015']
['rolls-royce', 'phantom-coupe', '2016']
['volkswagen', 'beetle-convertible', '2015']
['volkswagen', 'beetle-convertible', '2016']


In [None]:
"""
Example urls:

http://www.edmunds.com/acura/ilx/2016/sedan/features-specs/
http://www.edmunds.com/lexus/ct-200h/2016/hatchback/features-specs/
"""

In [None]:
""""
Main class to crawl html from edmunds.com.

This class takes a specific url such as 'http://www.edmunds.com/lexus/ct-200h/2016/hatchback/features-specs/'

Six types of information about the specific car are parsed and structured:
1. interior color
2. exterior color
3. specification, such as engine, fuel and interior measurements
4. interior features
5. exterior features
6. safety features, such as airbags and stability control

The crawler is based on requests and BeautifulSoup.
The processing procedure:
- take an url, use requests to issue a request and get the content back as a string
- take the returned string from request as input, use BeautifulSoup to contruct a parse tree of the html
- parse info to get the six types of information we need

Always use try-except block, because many cars don't have all six types of information on the webpage.
"""

In [76]:
class CarInfoCrawler():
    def __init__(self, car_info):
        self.color_tree = None
        self.feature_spec_list = []
        self.request_status = False
        
        self.result = {}
        self.result['car description'] = {}
        self.result['car data'] = {}
        
        self.result['car description']['make'] = car_info['make']
        self.result['car description']['model'] = car_info['model']
        self.result['car description']['year'] = car_info['year']
        self.result['car description']['type'] = car_info['type']
            
        base_url = "http://www.edmunds.com"
        self.url = base_url + '/' + \
        '/'.join([self.result['car description']['make'], self.result['car description']['model'], \
                 self.result['car description']['year'], self.result['car description']['type']]) + \
                  '/features-specs/'
        
        self.result['car description']['url'] = self.url
        
        print 'initialize success'
        
    def request(self):
        try:
            spec_feature_request = requests.get(self.url)
        except:
            print 'request failed'
        print 'request success: ', self.url
        print 'request status: ', spec_feature_request.ok
        self.request_status = spec_feature_request.ok
        if self.request_status == True:
            soup = BeautifulSoup(spec_feature_request.content, 'lxml')
            self.color_tree = soup.find('div', id='colors-pod')
            self.feature_spec_list = soup.findAll('div', class_='feature-spec box')
            
            self.getColor()
            self.getSpec()
            self.getInteriorFeature()
            self.getExteriorFeature()
            self.getSafety()
        pprint(self.result)
        print 
        print
        
    def getColor(self):
        try:
            interior_color = []
            for outer_span in self.color_tree.find('p', text='Interior Colors').findNext('div').findAllNext('span'):
                for inner_span in outer_span.findAll('span'):
                    text = removeSpace(inner_span.text)
                    if text:
                        interior_color.append(text)
        except:
            print 'interior color unavailable'
            interior_color = 'unavailable'

        try:
            exterior_color = []
            for outer_span in self.color_tree.find('p', text='Exterior Colors').findNext('div').findAllNext('span'):
                for inner_span in outer_span.findAll('span'):
                    text = removeSpace(str(inner_span.text))
                    if text:
                        exterior_color.append(text)
        except:
            print 'exterior color unavailable'
            exterior_color = 'unavailable'
                    
        self.result['car data']['interior color'] = interior_color
        self.result['car data']['exterior color'] = exterior_color
        print 'get color done'
        
    def getSpec(self):
        try:
            spec_tree = self.feature_spec_list[0]
            spec_result = {}
            spec_h3 = spec_tree.findAll('h3')
            spec_table = spec_tree.findAll('table')
            for i in range(len(spec_h3)):
                spec = []
                if spec_h3[i].text != 'Suspension':
                    for tr in spec_table[i].findAll('tr'):
                        for td in tr.findAll('td'):
                            if td.label:
                                spec.append({
                                        str(td.label.text): str(td.span.text)
                                    })
                else:
                    for span in spec_table[i].findAll('span'):
                        spec.append(str(span.text))
                spec_result[str(spec_h3[i].text)] = spec
            self.result['car data']['specification'] = spec_result
        except:
            print 'spec unavailable'
            self.result['car data']['specification'] = 'unavailable'
        print 'get spec done'
        
    def getInteriorFeature(self):
        try:
            interior_tree = self.feature_spec_list[1]
            interior_result = {}
            counter = 0
            interior_h4 = interior_tree.findAll('h4')
            interior_table = interior_tree.findAll('table')
            for i in range(len(interior_h4)):
                feature = []
                for span in interior_table[counter].findAll('span'):
                    feature.append(str(span.text))

                next_table = interior_table[counter] .findNext('table')
                if counter < len(interior_table) - 1 and interior_table[counter].findNext('h4') == next_table.findNext('h4'):
                    for span in next_table.findAll('span'):
                        feature.append(str(span.text))
                    counter += 1

                counter += 1
                interior_result[str(interior_h4[i].text)] = feature
            self.result['car data']['interior feature'] = interior_result
        except:
            print 'interior feature unavailable'
            self.result['car data']['interior feature'] = 'unavailable'
        print 'get interior feature done'
            
            
    def getExteriorFeature(self):
        try:
            exterior_tree = self.feature_spec_list[2]
            counter = 0
            exterior_result = {}
            exterior_h4 = exterior_tree.findAll('h4')
            exterior_table = exterior_tree.findAll('table')
            for i in range(len(exterior_h4)):
                feature = []
                for span in exterior_table[counter].findAll('span'):
                    feature.append(str(span.text))

                next_table = exterior_table[counter] .findNext('table')
                if counter < len(exterior_table) - 1 and exterior_table[counter].findNext('h4') == next_table.findNext('h4'):
                    for span in next_table.findAll('span'):
                        feature.append(str(span.text))
                    counter += 1

                counter += 1
                exterior_result[str(exterior_h4[i].text)] = feature
            self.result['car data']['exterior feature'] = exterior_result
        except:
            print 'exterior unavailable'
            self.result['car data']['exterior feature'] = 'unavailable'
        print 'get exterior feature done'
        
    def getSafety(self):
        try:
            safety_tree = self.feature_spec_list[3]
            self.result['car data']['safety'] = []
            for table in safety_tree.findAll('table'):
                for span in table.findAll('span'):
                    self.result['car data']['safety'].append(str(span.text))
        except:
            print 'safery unavailable'
            self.result['car data']['safety'] = 'unavailable'
            
        print 'get safety done'




In [77]:
format_result = []
for car in car_list:
    crawler = CarInfoCrawler(car)
    crawler.request()
    result = all_car_result.append(crawler.result)
   
    format_result.append(result)

initialize success
request success:  http://www.edmunds.com/acura/ilx/2015/sedan/features-specs/
request status:  True
get color done
get spec done
get interior feature done
get exterior feature done
safery unavailable
get safety done
{'car data': {'exterior color': ['Bellanova White Pearl',
                                 'Crystal Black Pearl',
                                 'Fathom Blue Pearl',
                                 'Modern Steel Metallic',
                                 'Silver Moon',
                                 'Ebony, leather'],
              'exterior feature': {'Roof and Glass': ['Intermittent wipers',
                                                      'Rear defogger',
                                                      'Power glass sunroof',
                                                      'Remote sunroof operation'],
                                   'Tires and Wheels': ['Alloy wheels',
                                                        '17

UnboundLocalError: local variable 'spec_feature_request' referenced before assignment

In [57]:
count = 0
for result in format_result:
    if not result:
        count += 1
print count

0


In [86]:
with open('saved-data/json/car_data.json', 'r') as f:
    car_info_all = json.load(f)

In [89]:
count = 0
for car in car_info_all:
    if int(car['car description']['year'])  == 2014:
        count += 1
print count

19
