In [None]:
from bs4 import BeautifulSoup as bs
import datetime
import numpy as np
import pandas as pd
import re
import requests

In [None]:
def get_links_by_state():
    """Scrape the craigslist site directory and group city subdomains by state.

    Downloads the craigslist "about/sites" page, takes the first ``div`` whose
    class list is exactly ``['colmask']`` and splits its markup on the ``<h4>``
    headings, so that each heading is paired with the markup that follows it.

    Returns
    -------
    states (dictionary): lower-cased heading text mapped to the city names
        taken from the links found under that heading, in the format
        {'state1':['city1', 'city2', ...], 'state2':['city3', 'city4', ...], ...}
    """
    directory_url = "https://www.craigslist.org/about/sites#US"
    page = bs(requests.get(directory_url).text, 'html.parser')

    # Keep only the divs whose class attribute is exactly ['colmask'];
    # the first of them is the one that gets parsed.
    colmask_divs = [div for div in page.find_all('div')
                    if div.get('class') == [u'colmask']]
    sections = str(colmask_divs[0]).split('<h4>')

    # A section that splits into exactly two pieces on '</h4>' is a
    # (heading, markup after the heading) pair; anything else is skipped.
    markup_by_state = {}
    for section in sections:
        pieces = section.split('</h4>')
        if len(pieces) == 2:
            heading, markup = pieces
            markup_by_state[heading.lower()] = markup

    # Reduce every link inside each state's markup to its city subdomain.
    states = {}
    for state, markup in markup_by_state.items():
        anchors = bs(markup, 'html.parser').find_all('a')
        states[state] = [get_city_from_url(a.get("href")) for a in anchors]
    return states

def _absolute_craigslist_url(href, city):
    """Turn an href found on a city's craigslist page into an absolute URL."""
    if href.startswith("//"):
        # Protocol-relative link: only the scheme is missing
        return "http:{0}".format(href)
    if href.startswith("http"):
        return href
    # Site-relative path: prepend the city's craigslist host
    return "http://{0}.craigslist.org{1}".format(city, href)


def get_car_links(url):
    """ Gets item links from craigslist search results page
    Follows the "next" page link until there are no more result pages

    Parameters
    ----------
    url (string): URL to craigslist search query page

    Returns
    -------
    text_list (list of strings): List of titles of pages
    href_list (list of strings): List of links to pages
        (same length and order as text_list)
    """
    # Text type of the running interpreter (unicode on Python 2, str on Python 3)
    try:
        text_type = unicode
    except NameError:
        text_type = str

    text_list, href_list = [], []
    visited = set()

    # Walk the result pages iteratively. The visited set stops the loop if a
    # page ever advertises an already-fetched URL as its "next" page.
    while url and url not in visited:
        visited.add(url)
        city = get_city_from_url(url)

        # Load and parse the page
        html = requests.get(url).text
        soup = bs(html, 'html.parser')

        # Item links carry the 'hdrlnk' class. BeautifulSoup returns the class
        # attribute as a list (or None when absent), so test membership rather
        # than comparing against a Python-2-specific repr string.
        for link in soup.find_all('a'):
            if 'hdrlnk' not in (link.get('class') or []):
                continue
            href = link.get('href')
            # Skip anchors without a title or target so both lists stay aligned
            if not link.contents or not href:
                continue
            text_list.append(text_type(link.contents[0]))
            href_list.append(_absolute_craigslist_url(href, city))

        # The next page, if any, is advertised by a <link rel="next"> tag.
        # <link> tags without a rel attribute return None and are ignored.
        next_hrefs = [tag.get('href') for tag in soup.find_all('link')
                      if 'next' in (tag.get('rel') or [])]
        if next_hrefs and next_hrefs[0]:
            url = _absolute_craigslist_url(next_hrefs[0], city)
        else:
            url = None

    return text_list, href_list

def get_city_from_url(url):
    """ Parse craigslist url to only return the city

    Accepts absolute URLs ("https://dallas.craigslist.org/..."),
    protocol-relative URLs ("//dallas.craigslist.org/...") and URLs without a
    scheme ("dallas.craigslist.org/...").

    Parameters
    ----------
    url (string): Any craigslist URL

    Returns
    -------
    city (string): The city from the subdomain of the URL
        (the first dot-separated label of the host name)

    Raises
    ------
    ValueError: If no host name can be found in url (e.g. an empty string
        or a site-relative path such as "/search/cta")
    """
    if not url:
        raise ValueError("Cannot extract a city from an empty URL: {0!r}".format(url))

    # Drop a leading "scheme://" or bare "//" so the string starts at the host
    prefix = re.match(r'^(?:[A-Za-z][A-Za-z0-9+.\-]*:)?//', url)
    remainder = url[prefix.end():] if prefix else url

    # The host ends at the first path, query, fragment or port separator
    host = re.split(r'[/?#:]', remainder, maxsplit=1)[0]
    city = host.split('.')[0]
    if not city:
        raise ValueError("No host name found in URL: {0!r}".format(url))
    return city

def get_attrs(lnk):
    """ Get attributes of an object that is for sale on craigslist from the link

    Parameters
    ----------
    lnk (string): Link to an item for sale on craigslist

    Returns
    -------
    d (dictionary): Dictionary of attributes of the item for sale on craigslist
        in the format {'attribute': 'value', ...}. Always contains 'Price'
        (NaN when the page lists no price). The remaining entries are read
        from the second "attrgroup" paragraph of the page; when the page has
        fewer than two such paragraphs only 'Price' is returned.
    """

    # load webpage
    html = requests.get(lnk).text

    # Create beautifulsoup parser to parse page
    soup = bs(html, 'html.parser')

    d = {}

    # Set the price of the item from the first span with class "price".
    # If price is not listed (no such span, or the span is empty), set it to NaN.
    price_spans = [s for s in soup.find_all('span') if s.get('class') == [u'price']]
    try:
        d['Price'] = price_spans[0].contents[0]
    except IndexError:
        d['Price'] = np.nan

    # The rest of the attributes are read from the second attrgroup paragraph.
    # Pages with fewer than two of them have nothing to read.
    attr_groups = [p for p in soup.find_all('p') if p.get('class') == [u"attrgroup"]]
    if len(attr_groups) < 2:
        return d

    # Each usable span holds a label followed by a tag wrapping the value
    for attr in attr_groups[1].find_all('span'):
        try:
            d[attr.contents[0]] = attr.contents[1].contents[0]
        except (IndexError, AttributeError):
            # If the attribute is listed but has no value (or its second child
            # is plain text rather than a tag), then ignore it
            pass
    return d

In [None]:
# General configuration steps
# What cities do you want to search? Just the ones in your state? Or in your region?
# Modifying state_list will automatically modify city_list
states = get_links_by_state()
state_list = ['texas', 'arkansas', 'alabama', 'mississippi', 'new mexico', 'colorado', 'oklahoma', 'louisiana', 'kansas']
#state_list = ['georgia']
city_list = []
for state in state_list:
    city_list += states[state]
# Function-call form prints the same text on Python 2 and also runs on Python 3
print(city_list)

# Search strings and stop strings will filter your results
# If an item's title does not contain any of the search strings, then that item will be dropped
# If an item's title contains any of the stop strings, then that item will be dropped
search_strings = ['2009', '2010', '2011', '2012', '2013', '2014', '2015']
stop_strings = ['gti', 'chevrolet', 'chevy', 'ford', '3.0', 'touareg', 'nissan', 'gmc', 'chrysler',
                'dodge', 'saturn', 'mercedes', 'honda', 'buick', 'toyota', 'jeep', 'lincoln',
                'scion', 'yamaha', ]
#city_list = ['dallas', 'houston', 'austin']
#city = 'austin'
#url = "https://{}.craigslist.org/search/cta?query=tdi".format(city)

In [None]:
# This is where we run the meat of the program

ds = [] # List of dictionaries of data
city_list = ['dallas'] # Make city_list short for testing

# TODO:
# cshaley: 2016/10/17
# Parallelize all this business. Why can't I seem to make URL calls work in parallel?

# TODO:
# cshaley: 2016/10/17
# Restructure this to make it more linear.
# i.e. Create a dataframe with all of the links to load for all of the cities,
#      and then load all of those links.
# Drop duplicate entries before loading them?

# TODO
# cshaley: 2016/10/18
# Restructure the whole program - put in .py files and make it object oriented?

# Build the title filters once, outside the loop. The strings are escaped so they
# match literally: unescaped, the '.' in '3.0' would match any character
# (e.g. '300') and wrongly drop listings.
search_pattern = '|'.join(re.escape(s) for s in search_strings)
stop_pattern = '|'.join(re.escape(s) for s in stop_strings)

for city in city_list:
    # Create the query URL
    url = "https://{}.craigslist.org/search/cta?query=tdi".format(city)

    # Get links to all of the items in the search results
    tlst, hlst = get_car_links(url)

    # If there were any results
    if tlst and hlst:
        # Create dataframe with craigslist items for sale as rows
        # (zip is materialised because it is a one-shot iterator on Python 3)
        df = pd.DataFrame(list(zip(tlst, hlst)), columns=['Title', 'Link'])

        # Filter before loading individual URLS as loading each URL is slow.
        titles = df['Title'].str.lower()
        # Keep items that contain a search string and do not contain a stop string
        df = df[titles.str.contains(search_pattern) & ~titles.str.contains(stop_pattern)]

        # Load each link and get the item attributes for each link.
        # Add the link and title to that dictionary and append it to ds
        for lnk, title in zip(df['Link'].values, df['Title'].values):
            d = get_attrs(lnk)
            d['Link'] = lnk
            d['Title'] = title
            ds.append(d)

# Create a dataframe from the list of dictionaries
car_data = pd.DataFrame(ds)

In [None]:
# Print Number of results and clean data
print("Raw car data number of results: {}".format(len(car_data.index)))

# Drop exact duplicate rows
cleaned_car_data = car_data.drop_duplicates()
print("Cleaned car data number of results: {}".format(len(cleaned_car_data.index)))

# Keep only listings whose title status is 'clean'.
# The column only exists if at least one scraped listing had that attribute;
# when it is missing no listing can be kept, so the result is an empty frame
# instead of a KeyError.
title_status_col = u'title status: '
if title_status_col in cleaned_car_data.columns:
    cleaned_car_data = cleaned_car_data[cleaned_car_data[title_status_col] == 'clean']
else:
    cleaned_car_data = cleaned_car_data.iloc[0:0]
print("Cleaned car data with clean title number of results: {}".format(len(cleaned_car_data.index)))

In [None]:
# Preview the first rows of the raw scraped listings
car_data.head()

In [None]:
# Preview the first rows of the de-duplicated, clean-title listings
cleaned_car_data.head()

In [None]:
# Persist both frames to disk. The file names carry a timestamp (the current
# date and time with spaces, dots, colons and dashes removed) so that an
# earlier export is not overwritten by accident.
now = re.sub('[ .:-]', '', str(datetime.datetime.now()))

raw_csv_path = 'raw_car_data_{}.csv'.format(now)
cleaned_csv_path = 'cleaned_car_data_{}.csv'.format(now)

car_data.to_csv(raw_csv_path, index=False, encoding='utf-8')
cleaned_car_data.to_csv(cleaned_csv_path, index=False, encoding='utf-8')

In [None]:
# Sanity check: load the raw export that was just written back from disk
sanity_check_path = 'raw_car_data_{}.csv'.format(now)
read_car_data = pd.read_csv(sanity_check_path)

In [None]:
# Preview the first rows of the frame read back from the CSV
read_car_data.head()