### Imports

In [None]:
import datetime
import numpy as np
import pandas as pd
import re
import crawler as cr

### Main Logic

In [None]:
# General configuration steps
# Which cities do you want to search? Just the ones in your state, or in your region?
# Modifying state_list automatically modifies city_list.
# NOTE(review): `states` is indexed by lower-case state name below; presumably it maps
# each state to a list of craigslist city identifiers -- confirm in crawler.py.
states = cr.get_links_by_state()
state_list = ['texas', 'arkansas', 'alabama', 'mississippi', 'new mexico', 'colorado', 'oklahoma', 'louisiana', 'kansas']
city_list = []
for state in state_list:
    city_list.extend(states[state])
# Use the function-call form of print so this cell runs on Python 3 as well as Python 2,
# consistent with the print(...) calls used further down.
print(city_list)

# Search strings and stop strings filter your results by item title.
# An item is kept only if its title contains at least one of the search strings.
# An item is dropped if its title contains any of the stop strings.
search_strings = ['2009', '2010', '2011', '2012', '2013', '2014', '2015']
stop_strings = ['gti', 'chevrolet', 'chevy', 'ford', '3.0', 'touareg', 'nissan', 'gmc', 'chrysler',
                'dodge', 'saturn', 'mercedes', 'honda', 'buick', 'toyota', 'jeep', 'lincoln',
                'scion', 'yamaha']


In [None]:
# This is where we run the meat of the program: query each city, filter the
# result titles, then load every surviving listing and collect its attributes.

ds = []  # List of dictionaries of data, one dictionary per listing
city_list = ['dallas']  # Make city_list short for testing

# TODO:
# cshaley: 2016/10/17
# Restructure this to make it more linear.
# i.e. Create a dataframe with all of the links to load for all of the cities,
#      and then load all of those links.
# Drop duplicate entries before loading them?

# TODO
# cshaley: 2016/10/18
# Restructure the whole program - put in .py files and make it object oriented?

# Build the title filters once, outside the city loop.
# str.contains treats its pattern as a regular expression, so every entry is
# escaped: otherwise '3.0' would also match e.g. the "3,0" in a "13,000" price
# and wrongly drop the listing.
search_pattern = '|'.join(re.escape(s) for s in search_strings)
stop_pattern = '|'.join(re.escape(s) for s in stop_strings)

for city in city_list:
    # Create the query URL
    url = "https://{}.craigslist.org/search/cta?query=tdi".format(city)

    # Get links to all of the items in the search results
    # (unpacked here as parallel sequences of titles and links)
    tlst, hlst = cr.get_car_links(url)

    # Nothing to do for this city if there were no results
    if not (tlst and hlst):
        continue

    # Create dataframe with craigslist items for sale as rows.
    # list(...) materialises the pairs, since zip is a one-shot iterator on Python 3.
    df = pd.DataFrame(list(zip(tlst, hlst)), columns=['Title', 'Link'])

    # Filter before loading individual URLs, as loading each URL is slow.
    titles = df['Title'].str.lower()
    # Keep only items whose title contains at least one search string
    # (na=False treats a missing title as "no match").
    keep = titles.str.contains(search_pattern, na=False)
    # Drop items whose title contains a stop string. An empty stop list must
    # not filter anything: an empty pattern would match every title.
    if stop_pattern:
        keep &= ~titles.str.contains(stop_pattern, na=False)
    df = df[keep]

    # Load each link and get the item attributes for that link.
    # Add the link and title to the returned dictionary and append it to ds.
    for lnk, title in zip(df['Link'].values, df['Title'].values):
        d = cr.get_attrs(lnk)
        d['Link'] = lnk
        d['Title'] = title
        ds.append(d)

# Create a dataframe from the list of dictionaries
car_data = pd.DataFrame(ds)

### Viewing and Saving

In [None]:
# Print number of results and clean data
print("Raw car data number of results: {}".format(len(car_data.index)))
cleaned_car_data = car_data.drop_duplicates()
print("Cleaned car data number of results: {}".format(len(cleaned_car_data.index)))

# Keep only listings whose title status is 'clean'.
# When the scrape returned nothing (or no listing reported a title status) the
# column does not exist and indexing it would raise KeyError; in that case no
# listing can qualify, so the result is an empty frame with the same columns.
title_status_col = u'title status: '
if title_status_col in cleaned_car_data.columns:
    cleaned_car_data = cleaned_car_data[cleaned_car_data[title_status_col] == 'clean']
else:
    cleaned_car_data = cleaned_car_data.iloc[0:0]
print("Cleaned car data with clean title number of results: {}".format(len(cleaned_car_data.index)))

In [None]:
# Preview the first few rows of the raw scrape results
car_data.head()

In [None]:
# Preview the first few rows of the de-duplicated, clean-title results
cleaned_car_data.head()

In [None]:
# Save both frames to disk. Each file name carries a timestamp (the current
# date-time with spaces, dots, colons and dashes stripped) so that an earlier
# export is not overwritten by accident.
timestamp_text = str(datetime.datetime.now())
now = re.sub('[ .:-]', '', timestamp_text)
raw_csv_path = 'raw_car_data_{}.csv'.format(now)
cleaned_csv_path = 'cleaned_car_data_{}.csv'.format(now)
car_data.to_csv(raw_csv_path, index=False, encoding='utf-8')
cleaned_car_data.to_csv(cleaned_csv_path, index=False, encoding='utf-8')

In [None]:
# Sanity check: load the raw CSV that was just written, using the same timestamp
sanity_check_path = 'raw_car_data_{}.csv'.format(now)
read_car_data = pd.read_csv(sanity_check_path)

In [None]:
# Preview the first few rows of the data read back from disk
read_car_data.head()