### Imports

In [None]:
import datetime
import numpy as np
import pandas as pd
import re
import crawler as cr

### Main Logic

In [None]:
# General configuration steps
# What cities do you want to search? Just the ones in your state? Or in your region?
# Modifying state_list will automatically modify city_list
states = cr.get_craigslist_urls_by_state()
state_list = ['texas', 'arkansas', 'alabama', 'mississippi', 'new mexico', 'colorado', 'oklahoma', 'louisiana', 'kansas']

# Flatten the per-state entries into one list of cities to query.
# NOTE(review): a state name missing from `states` raises KeyError here.
city_list = []
for state in state_list:
    city_list.extend(states[state])
# Parenthesised call form: prints the same under Python 2 and is also valid
# Python 3, matching the print(...) calls used in the later cells.
print(city_list)

# Search strings and stop strings will filter your results
# If an item's title does not contain any of the search strings, then that item will be dropped
# If an item's title contains any of the stop strings, then that item will be dropped
# Titles are lower-cased before matching, so keep these entries lower-case.
search_strings = ['2009', '2010', '2011', '2012', '2013', '2014', '2015']
stop_strings = ['gti', 'chevrolet', 'chevy', 'ford', '3.0', 'touareg', 'nissan', 'gmc', 'chrysler',
                'dodge', 'saturn', 'mercedes', 'honda', 'buick', 'toyota', 'jeep', 'lincoln',
                'scion', 'yamaha']


In [None]:
# This is where we run the meat of the program:
# query each city's search page, gather (Title, Link) rows, then filter by title.

dfs = []  # One dataframe of search results per city that returned anything
city_list = ['dallas']  # Make city_list short for testing

for city in city_list:
    # Create the query URL
    url = "https://{}.craigslist.org/search/cta?query=tdi".format(city)

    # Get links to all of the items in the search results
    tlst, hlst = cr.get_sale_item_links(url)

    # If there were any results
    if tlst and hlst:
        # Create dataframe with craigslist items for sale as rows and append to dfs.
        # list(...) materialises the pairs: zip() is a lazy iterator on Python 3,
        # which older pandas versions refuse as DataFrame input.
        dfs.append(pd.DataFrame(list(zip(tlst, hlst)), columns=['Title', 'Link']))

# Concatenate all possible items to download into a single dataframe.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no city produced results.
if dfs:
    # ignore_index gives every row a unique label instead of repeating 0..n per city
    df = pd.concat(dfs, ignore_index=True)
else:
    df = pd.DataFrame(columns=['Title', 'Link'])

# Filter before loading individual URLS as loading each URL is slow.
# The strings are escaped so they match literally: unescaped, '3.0' is the
# regex "3, any character, 0" and would also drop titles containing e.g. '350'.
search_pattern = '|'.join(re.escape(s) for s in search_strings)
stop_pattern = '|'.join(re.escape(s) for s in stop_strings)

# Filter out items that don't contain any search strings
# (na=False treats a missing title as "no match" instead of yielding NaN in the mask)
df = df[df['Title'].str.lower().str.contains(search_pattern, na=False)]
# Filter out items that contain stop strings
df = df[~df['Title'].str.lower().str.contains(stop_pattern, na=False)]

In [None]:
ds = []  # List of dictionaries of data to make into a df
# Visit every remaining listing, fetch its attributes, and tag the resulting
# dictionary with the link and title it came from before collecting it.
for link, listing_title in zip(df['Link'].tolist(), df['Title'].tolist()):
    attrs = cr.get_sale_item_attrs(link)
    attrs['Link'] = link
    attrs['Title'] = listing_title
    ds.append(attrs)

# Build one dataframe with a row per listing from the collected dictionaries
sale_item_data = pd.DataFrame(data=ds)

### Viewing and Saving

In [None]:
# Report the number of rows at each stage while cleaning the data

# Stage 1: everything that was downloaded
raw_count = sale_item_data.shape[0]
print("Raw car data number of results: {}".format(raw_count))

# Stage 2: exact duplicate rows removed
cleaned_sale_item_data = sale_item_data.drop_duplicates()
print("Cleaned car data number of results: {}".format(cleaned_sale_item_data.shape[0]))

# Stage 3: keep only rows whose 'title status: ' attribute equals 'clean'
has_clean_title = cleaned_sale_item_data[u'title status: '] == 'clean'
cleaned_sale_item_data = cleaned_sale_item_data.loc[has_clean_title]
print("Cleaned car data with clean title number of results: {}".format(cleaned_sale_item_data.shape[0]))

In [None]:
# Preview the first rows of the raw (uncleaned) listing data
sale_item_data.head()

In [None]:
# Preview the first rows of the de-duplicated, clean-title listing data
cleaned_sale_item_data.head()

In [None]:
# Write data to disk - with a timestamp so you don't overwrite anything by accident.
# The timestamp is the current date-time string with spaces, dots, colons and
# dashes stripped out, leaving only digits.
current_time_text = str(datetime.datetime.now())
now = re.sub('[ .:-]', '', current_time_text)

raw_csv_name = 'raw_sale_item_data_{}.csv'.format(now)
cleaned_csv_name = 'cleaned_sale_item_data_{}.csv'.format(now)

sale_item_data.to_csv(raw_csv_name, index=False, encoding='utf-8')
cleaned_sale_item_data.to_csv(cleaned_csv_name, index=False, encoding='utf-8')

In [None]:
# Sanity check: load the raw CSV that was just written, using the same timestamp
raw_csv_to_check = 'raw_sale_item_data_{}.csv'.format(now)
read_sale_item_data = pd.read_csv(raw_csv_to_check)

In [None]:
# Preview the first rows of the data read back from the raw CSV file
read_sale_item_data.head()