In [1]:
import concurrent.futures
import os
import random
import threading
from copy import deepcopy
from pprint import pprint
from time import sleep

import numpy as np
import requests
from pymongo import MongoClient

#local helper files
from helpers import get_proxy_list, format_proxy
from extractors import get_data
import settings

In [2]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

# Filesystem location of the chromedriver executable; it is also published
# through the "webdriver.chrome.driver" environment variable.
chromedriver = "/Applications/chromedriver"
os.environ.update({"webdriver.chrome.driver": chromedriver})

#### Pull listing_url documents from MongoDB

In [3]:
# Handle to the 'philadelphia_home_prices' database, reached through a
# MongoClient built with its default connection settings.
db = MongoClient()['philadelphia_home_prices']

In [4]:
# Collection that holds the listing URL documents.
col = db['listing_urls']

In [5]:
# Query 'listing_urls' with an empty filter (matches every document) and
# materialize the cursor into a list.
cursor = db['listing_urls'].find({})
listings = [document for document in cursor]

In [6]:
# Collect the 'url' field of every listing document, in document order.
listing_urls_list = [document['url'] for document in listings]

#### New collection for listing data

In [7]:
# Show the names of the collections currently in the database (cell output).
db.list_collection_names()

['prices_messy', 'listing_data', 'listing_urls']

In [8]:
#uncomment the code you need to use during testing
#db.drop_collection('listing_data')
#db.create_collection('listing_data')

In [9]:
# Rebind `col` to the 'listing_data' collection.
col = db['listing_data']

In [10]:
# Check the number of documents in the collection bound to `col`;
# the empty filter {} matches every document.
col.count_documents({})

1101

#### Pick which urls in list by index

In [11]:
# Start each tracking list empty: URLs scraped so far, proxies that failed,
# and URLs that were quarantined. Each name gets its own list object.
scraped_urls, bad_proxies, bad_urls = [], [], []

In [12]:
#only run once
# Builds the work queue as an independent deep copy of listing_urls_list, so
# removing URLs from the queue leaves listing_urls_list untouched. Re-running
# this cell rebinds the name to a fresh full copy.
remaining_listing_urls_list = deepcopy(listing_urls_list)

In [13]:
#code for each scrape
# Serializes access to the proxy pool and the bookkeeping lists, which are
# read and mutated by every worker thread running scrape_requests.
_scrape_lock = threading.Lock()


def _is_missing(value):
    """Return True when ``value`` is None or a floating-point NaN.

    NaN never compares equal to anything (including itself), so a test such
    as ``value != np.nan`` is always True and cannot detect a missing value.
    """
    if value is None:
        return True
    return isinstance(value, (float, np.floating)) and bool(np.isnan(value))


def _checkout_proxy():
    """Atomically take a random proxy out of the pool; None if the pool is empty."""
    with _scrape_lock:
        if not proxies_address:
            return None
        proxy = random.choice(proxies_address)
        proxies_address.remove(proxy)
        return proxy


def _retire_url(url, destination):
    """Append ``url`` to ``destination`` and drop it from the work queue if still queued."""
    with _scrape_lock:
        destination.append(url)
        if url in remaining_listing_urls_list:
            remaining_listing_urls_list.remove(url)


def _fetch_and_store(url, PROXY):
    """Request ``url`` through ``PROXY`` and store the extracted listing.

    Returns True when the proxy delivered a usable page (it may be reused)
    and False when the proxy should be retired.
    """
    try:
        response = requests.get(
            url,
            headers=settings.headers,
            proxies=format_proxy(PROXY),
            # Without a timeout a dead proxy can block its worker forever.
            # settings.request_timeout is optional; 30 seconds otherwise.
            timeout=getattr(settings, 'request_timeout', 30),
        )
    except Exception:
        # Any failure while requesting through the proxy counts against the
        # proxy (same outcome as before, minus KeyboardInterrupt/SystemExit).
        print('no response, bad proxy')
        return False

    if response.status_code != 200:
        print('not 200, bad proxy')
        return False

    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        listing_data = get_data(soup, url)
        address = listing_data['Address']
    except Exception:
        # The page was delivered but could not be parsed: quarantine the url.
        # The proxy itself worked, so it stays usable.
        _retire_url(url, bad_urls)
        return True

    if _is_missing(address):
        # A 200 page without an address is not a successful scrape. The url
        # stays queued for another attempt and this proxy is retired.
        print('no listing data, bad proxy')
        return False

    # Persist first, so a url is only ever marked as scraped once stored.
    col.insert_one(listing_data)
    _retire_url(url, scraped_urls)
    print(listing_data)
    return True


def scrape_requests(url):
    """Scrape one listing url through a randomly chosen proxy.

    On success the extracted data is inserted into ``col``, the url moves
    from ``remaining_listing_urls_list`` to ``scraped_urls`` and the proxy is
    returned to ``proxies_address`` after a 60-120 second pause. A proxy that
    gives no response, a non-200 status or a page without an address is moved
    to ``bad_proxies``. A url whose page cannot be parsed is moved to
    ``bad_urls``.

    Parameters
    ----------
    url : str
        Listing url to scrape.

    Returns
    -------
    None
        All results are recorded in the module-level lists and in ``col``.
    """
    PROXY = _checkout_proxy() #pick random proxy from list and check it out
    if PROXY is None:
        # Nothing to scrape with; return at once so the caller can refresh
        # the pool instead of sleeping in every queued task.
        print('no proxies left')
        return

    if _fetch_and_store(url, PROXY):
        # Pause before the proxy becomes available to other workers again.
        sleep(random.uniform(60,120))
        with _scrape_lock:
            proxies_address.append(PROXY)
    else:
        with _scrape_lock:
            bad_proxies.append(PROXY)

    print('______________________________________________________')
    print(f'   Remaining urls: {len(remaining_listing_urls_list)}')
    print(f'     Scraped urls: {len(scraped_urls)}')
    print(f'Good proxies left: {len(proxies_address)}')
    print(f'      Bad proxies: {len(bad_proxies)}')

    #space out requests if there are a low number of proxies left
    if len(proxies_address) < 100:
        sleep(random.uniform(60,120))

In [14]:
#multi-thread with concurrent.futures
def threaded_scrape(remaining_listing_urls_list, threads):
    """Run scrape_requests over the given urls on a thread pool.

    Parameters
    ----------
    remaining_listing_urls_list : list
        Urls to scrape. A snapshot is taken up front because the workers
        remove entries from the live list while the round is running.
    threads : int
        Upper bound on the number of worker threads. The pool never uses
        more workers than there are proxies or urls.

    Returns
    -------
    None
        Returns once every submitted url has been processed. Does nothing
        when there are no urls or no proxies.
    """
    urls = list(remaining_listing_urls_list)
    threads = min(threads, len(proxies_address), len(urls))
    if threads < 1:
        # ThreadPoolExecutor rejects max_workers <= 0 with a ValueError.
        return

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(scrape_requests, url): url for url in urls}
        # Inspect every future so a failing worker is reported, not lost.
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print(f'scrape of {futures[future]} failed: {error!r}')

In [15]:
# Keep running scrape rounds until every url has left the work queue.
count = 0
while len(remaining_listing_urls_list) > 0:
    # Refresh the proxy pool on the first round, on every fifth round after
    # that, and whenever the pool has run dry. The modulo test comes first so
    # proxies_address is not read before its first assignment.
    if count % 5 == 0 or not proxies_address:
        proxies_address = get_proxy_list() #Get free proxies from https://free-proxy-list.net/
        print('new round of proxies')
    threaded_scrape(remaining_listing_urls_list, settings.threads)
    # Was `count =+ 1`, which assigns +1 every time, so the pool was only
    # ever refreshed on the very first round.
    count += 1
    print(f'   Remaining urls: {len(remaining_listing_urls_list)}')
    print(f'     Scraped urls: {len(scraped_urls)}')

2020-11-09 15:17:27,320 root   DEBUG    === Initialized Proxy Parsers ===
2020-11-09 15:17:27,320 root   DEBUG    	 FreeProxy parser of 'http://free-proxy-list.net' with required bandwidth: '150' KBs
2020-11-09 15:17:27,321 root   DEBUG    	 PremProxy parser of 'https://premproxy.com/list/' with required bandwidth: '150' KBs
2020-11-09 15:17:27,721 root   DEBUG    Added 300 proxies from FreeProxy
2020-11-09 15:17:28,325 http_request_randomizer.requests.parsers.PremProxyParser DEBUG    Pages: {'', '04.htm', '07.htm', '06.htm', '03.htm', '05.htm', '02.htm'}
2020-11-09 15:17:28,907 http_request_randomizer.requests.parsers.js.UnPacker INFO     JS UnPacker init path: https://premproxy.com/js/169f7.js
2020-11-09 15:17:29,462 http_request_randomizer.requests.parsers.js.UnPacker DEBUG    portmap: {'ra3a4': '8080', 're660': '8088', 'r1b47': '8000', 'rac76': '3128', 'r796a': '9999', 'r32b4': '999', 'r56b1': '50616', 'r2007': '34930', 'r9b25': '80', 'rd711': '8888', 'rfade': '51915', 'r3460': '63

no response, bad proxy
______________________________________________________
   Remaining urls: 1100
     Scraped urls: 0
Good proxies left: 553
      Bad proxies: 1
no response, bad proxy
______________________________________________________
   Remaining urls: 1100
     Scraped urls: 0
Good proxies left: 552
      Bad proxies: 2
no response, bad proxy
______________________________________________________
   Remaining urls: 1100
     Scraped urls: 0
Good proxies left: 551
      Bad proxies: 3
no response, bad proxy
______________________________________________________
   Remaining urls: 1100
     Scraped urls: 0
Good proxies left: 550
      Bad proxies: 4
no response, bad proxy
______________________________________________________
   Remaining urls: 1100
     Scraped urls: 0
Good proxies left: 549
      Bad proxies: 5
no response, bad proxy
______________________________________________________
   Remaining urls: 1100
     Scraped urls: 0
Good proxies left: 548
      Bad proxies: 

     Scraped urls: 2no response, bad proxy
Good proxies left: 516
      Bad proxies: 40
______________________________________________________
   Remaining urls: 1098
     Scraped urls: 2
Good proxies left: 515
      Bad proxies: 40

______________________________________________________
   Remaining urls: 1098
     Scraped urls: 2
Good proxies left: 514
      Bad proxies: 40
no response, bad proxy
______________________________________________________
   Remaining urls: 1098
     Scraped urls: 2
Good proxies left: 513
      Bad proxies: 41
no response, bad proxy
______________________________________________________
   Remaining urls: 1098
     Scraped urls: 2
Good proxies left: 512
      Bad proxies: 42
{'Address': '1918 S Juniper St,', 'Locality': 'Philadelphia', 'Region': 'PA', 'Postal Code': '19148', 'Price': 318000, 'Days on Market': 76, 'SQFT House': 1096, 'SQFT Lot': 713, 'Full Baths': 1, 'Half Baths': 1, 'Bedrooms': 3, 'Garage': 'No', 'Master Bath': 'No', 'Cooling': ['Wall Uni

   Remaining urls: 1091
     Scraped urls: 9
Good proxies left: 507
      Bad proxies: 47
no response, bad proxy
______________________________________________________
   Remaining urls: 1091
     Scraped urls: 9
Good proxies left: 506
      Bad proxies: 48
no response, bad proxy
______________________________________________________
   Remaining urls: 1091
     Scraped urls: 9
Good proxies left: 505
no response, bad proxy      Bad proxies: 50
______________________________________________________

   Remaining urls: 1091
     Scraped urls: 9
Good proxies left: 505
      Bad proxies: 50
no response, bad proxy
______________________________________________________
   Remaining urls: 1090
     Scraped urls: 10
Good proxies left: 503{'Address': '818 Lawler St,', 'Locality': 'Philadelphia', 'Region': 'PA', 'Postal Code': '19116', 'Price': 264000, 'Days on Market': 56, 'SQFT House': 1480, 'SQFT Lot': 3397, 'Full Baths': 2, 'Half Baths': 1, 'Bedrooms': 4, 'Garage': 'No', 'Master Bath': 'No',

KeyboardInterrupt: 