In [123]:
# import dependencies
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import pymongo
import re as re

# Output File (CSV)
output_data_file = "csv_outputs/craigslist_tools.csv"

In [124]:
pg_num = 1
pg_cnt = 0
per_pg = 120
query = 'tools'.replace(' ', '+')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://toronto.craigslist.org/search/sss?query='+ str(query) + '&sort=rel&s=' + str(pg_cnt)

In [125]:
result_set = []

while True:
        response = requests.get(url, headers=headers).text
        soup = bs(response,  'html.parser')
    
        a = soup.find_all('li', class_='result-row') 
        
        if len(a) == 0:
                break
        
        else: 
                #loop through returned results
                for item in a:
                        
                        #set empty variables
                        mini_list = []
                        mini_list.append(pg_num)

                        #error handling
                        try:
                                #identify and return title of listing
                                listing = item.find('a', class_='result-title').text
                                mini_list.append(listing)
                        except: 
                                mini_list.append('no name')
        
                        try: 
                                #identify and return price of listing
                                price = item.find('span', class_='result-price').text
                                mini_list.append(price)                                
                        except: 
                                mini_list.append('$0.00')
                                
                        try:
                                #identify and return url of listing                            
                                item_url = item.find('a', class_='result-title hdrlnk')['href']
                                mini_list.append(item_url)
                        except: 
                                mini_list.append('noURL')
                                
                        result_set.append(mini_list) 
                
                #iterate through pages
                pg_cnt += per_pg
                pg_num += 1
                url = 'https://toronto.craigslist.org/search/sss?query='+ str(query) + '&sort=rel&s=' + str(pg_cnt)

In [126]:
#establish dataframe from result set
query_df = pd.DataFrame(result_set)

In [127]:
query_df.head(5)

Unnamed: 0,0,1,2,3
0,1,Wissota skate sharpening sharpener portable ma...,$0.00,https://toronto.craigslist.org/tor/tls/d/wisso...
1,1,Master Lock Red Lock out locks,$40,https://toronto.craigslist.org/yrk/tls/d/woodb...
2,1,MEN'S BLACK LEATHER STEEL-TOED SAFETY DRESS/WO...,$65,https://toronto.craigslist.org/tor/tls/d/toron...
3,1,NEW! CANADA PRO-FIX 24 and 30 INCH BOW SAW BLADES,$6,https://toronto.craigslist.org/tor/tls/d/new-c...
4,1,"NEW! TOOLWAY 3"" AND 4"" PAINT BRUSHES",$8,https://toronto.craigslist.org/tor/tls/d/new-t...


In [128]:
query_df.columns = ["page", "item", "price", "url"]
query_df.head(5)

Unnamed: 0,page,item,price,url
0,1,Wissota skate sharpening sharpener portable ma...,$0.00,https://toronto.craigslist.org/tor/tls/d/wisso...
1,1,Master Lock Red Lock out locks,$40,https://toronto.craigslist.org/yrk/tls/d/woodb...
2,1,MEN'S BLACK LEATHER STEEL-TOED SAFETY DRESS/WO...,$65,https://toronto.craigslist.org/tor/tls/d/toron...
3,1,NEW! CANADA PRO-FIX 24 and 30 INCH BOW SAW BLADES,$6,https://toronto.craigslist.org/tor/tls/d/new-c...
4,1,"NEW! TOOLWAY 3"" AND 4"" PAINT BRUSHES",$8,https://toronto.craigslist.org/tor/tls/d/new-t...


In [129]:
# Identify incomplete rows
query_df.dtypes

page      int64
item     object
price    object
url      object
dtype: object

In [130]:
# Drop all rows with missing information
query_df = query_df.dropna(how='any')
query_df.count()

page     1119
item     1119
price    1119
url      1119
dtype: int64

In [131]:
# Delete extraneous column
del query_df['page']
query_df.head()

Unnamed: 0,item,price,url
0,Wissota skate sharpening sharpener portable ma...,$0.00,https://toronto.craigslist.org/tor/tls/d/wisso...
1,Master Lock Red Lock out locks,$40,https://toronto.craigslist.org/yrk/tls/d/woodb...
2,MEN'S BLACK LEATHER STEEL-TOED SAFETY DRESS/WO...,$65,https://toronto.craigslist.org/tor/tls/d/toron...
3,NEW! CANADA PRO-FIX 24 and 30 INCH BOW SAW BLADES,$6,https://toronto.craigslist.org/tor/tls/d/new-c...
4,"NEW! TOOLWAY 3"" AND 4"" PAINT BRUSHES",$8,https://toronto.craigslist.org/tor/tls/d/new-t...


In [132]:
# Make a copy of dataframe
new_df = query_df.copy()

# remove all dollar signs
new_df['price'] = [x.strip('$') for x in query_df['price']]
new_df.head()

Unnamed: 0,item,price,url
0,Wissota skate sharpening sharpener portable ma...,0.0,https://toronto.craigslist.org/tor/tls/d/wisso...
1,Master Lock Red Lock out locks,40.0,https://toronto.craigslist.org/yrk/tls/d/woodb...
2,MEN'S BLACK LEATHER STEEL-TOED SAFETY DRESS/WO...,65.0,https://toronto.craigslist.org/tor/tls/d/toron...
3,NEW! CANADA PRO-FIX 24 and 30 INCH BOW SAW BLADES,6.0,https://toronto.craigslist.org/tor/tls/d/new-c...
4,"NEW! TOOLWAY 3"" AND 4"" PAINT BRUSHES",8.0,https://toronto.craigslist.org/tor/tls/d/new-t...


In [133]:
#to_drop = ['electrician']
#new_df[~new_df['item'].isin(to_drop)]
dropped_df = new_df[~new_df['item'].isin(['lock'])]
dropped_df.head()

Unnamed: 0,item,price,url
0,Wissota skate sharpening sharpener portable ma...,0.0,https://toronto.craigslist.org/tor/tls/d/wisso...
1,Master Lock Red Lock out locks,40.0,https://toronto.craigslist.org/yrk/tls/d/woodb...
2,MEN'S BLACK LEATHER STEEL-TOED SAFETY DRESS/WO...,65.0,https://toronto.craigslist.org/tor/tls/d/toron...
3,NEW! CANADA PRO-FIX 24 and 30 INCH BOW SAW BLADES,6.0,https://toronto.craigslist.org/tor/tls/d/new-c...
4,"NEW! TOOLWAY 3"" AND 4"" PAINT BRUSHES",8.0,https://toronto.craigslist.org/tor/tls/d/new-t...


In [146]:
#searchfor = ['locks', 'stick']
#dropped_df2 = dropped_df[~dropped_df.item.str.contains('|'.join(searchfor))]
#dropped_df2

df = dropped_df[~dropped_df['item'].str.contains(['locks', 'stick'], flags=re.IGNORECASE)]
df.head()

TypeError: unhashable type: 'list'

In [135]:
#new_df[~new_df.item.str.contains("electrician")]
#new_df
dropped_df = new_df[~new_df['price'].isin(['0.00'])]
dropped_df.head()

Unnamed: 0,item,price,url
1,Master Lock Red Lock out locks,40,https://toronto.craigslist.org/yrk/tls/d/woodb...
2,MEN'S BLACK LEATHER STEEL-TOED SAFETY DRESS/WO...,65,https://toronto.craigslist.org/tor/tls/d/toron...
3,NEW! CANADA PRO-FIX 24 and 30 INCH BOW SAW BLADES,6,https://toronto.craigslist.org/tor/tls/d/new-c...
4,"NEW! TOOLWAY 3"" AND 4"" PAINT BRUSHES",8,https://toronto.craigslist.org/tor/tls/d/new-t...
5,NEW! MATRIX 7pc NUT DRIVER SET,15,https://toronto.craigslist.org/tor/tls/d/york-...


In [136]:
dropped_df2 = dropped_df[~dropped_df['item'].str.contains(["locks", "stick"], flags=re.IGNORECASE)]
dropped_df2

TypeError: unhashable type: 'list'

In [137]:
#new_df.dtypes
new_df["price"] = new_df.price.astype(float)
new_df

total = new_df["price"].sum()
total

482607.0

In [36]:
# output to csv
query_df.to_csv('csv_outputs/craigslist_' + query + '.csv')

In [37]:
print('-----Done-------')

-----Done-------
