In [3]:
import requests
from lxml import etree

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd


def get_proxies(country='Taiwan', PhantomJs_executable_path='/usr/local/Cellar/phantomjs/2.1.1/bin/phantomjs', path_or_buf = 'proxies.csv', trskip=2, verbose = 1):
    """Scrape the gatherproxy.com proxy list for one country into a DataFrame.

    The page is rendered with a PhantomJS browser driven by selenium, the
    "Show Full List" button is clicked when present, every pagination link is
    visited, and the rows of the ``tblproxy`` table are collected.

    Parameters
    ----------
    country : str
        Value substituted into the ``?c=`` query parameter of the list URL.
    PhantomJs_executable_path : str
        Path of the PhantomJS binary handed to ``webdriver.PhantomJS``.
    path_or_buf : str or file-like
        Destination passed to ``DataFrame.to_csv`` (written without the index).
    trskip : int
        Number of leading ``<tr>`` elements skipped on every page before data
        rows are read; the first ``<tr>`` is also used as the header row.
    verbose : int
        ``0`` (or any falsy value) silences progress messages; any truthy
        value prints them.

    Returns
    -------
    pandas.DataFrame
        One row per scraped proxy; column names come from the ``<th>`` cells
        of the first table row.

    Raises
    ------
    IndexError
        If the rendered page contains no rows in the proxy table.

    Notes
    -----
    ``webdriver.PhantomJS`` is only available in older selenium releases;
    newer releases dropped PhantomJS support.
    """
    import re

    def verbose_print(message):
        # Single switch for all progress output, so ``verbose=0`` really
        # silences every message (including the "button found" one).
        if verbose:
            print(message)

    def clean_text(text):
        # Header cells may be missing text entirely.
        if text is None:
            return ''
        # Re-interpret latin-1 bytes as UTF-8, dropping anything undecodable,
        # then strip tabs and line breaks.
        text = text.encode('latin_1', errors='ignore').decode('utf8', errors='ignore')
        return re.sub(r'[\t\n\r]', r'', text)

    def parse_rows(rows):
        # First text node of every <td>, or "" when the cell has no text.
        parsed = []
        for tr in rows:
            cells = []
            for td in tr.xpath('./td'):
                texts = td.xpath('./text()')
                cells.append(texts[0] if texts else "")
            parsed.append(cells)
        return parsed

    rows_xpath = '//div[@class="proxy-list"]/table[@id="tblproxy"]/tbody/tr'
    nav_xpath = '//div[@id="body"]/form[@id="psbform"]/div[@class="pagenavi"]/a'

    # ################################################################################
    # step 1. use PhantomJs to get .js rendered content
    # ################################################################################
    browser = webdriver.PhantomJS(executable_path = PhantomJs_executable_path)
    try:
        browser.get('http://www.gatherproxy.com/proxylist/country/?c={}'.format(country))

        # ################################################################################
        # step 2. click "Show Full List" button to generate full proxies list
        # ################################################################################
        try:
            element = WebDriverWait(browser, 1).until(
                    EC.presence_of_element_located((By.XPATH, '//div[@id="body"]/form/p/input[@type="submit" and @class="button"]')))
            verbose_print('button "Show Full List" found')

            element.click()
            verbose_print('button "Show Full List" clicked')

        except Exception:
            # Best effort: the list may already be fully shown. ``Exception``
            # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            verbose_print('button "Show Full List" not found')

        # ################################################################################
        # step 3. generate selector
        # ################################################################################
        selector = etree.HTML(browser.page_source)
        verbose_print('resolve responsed page content')

        # ################################################################################
        # step 4. resolve how many pages
        # ################################################################################
        pages = selector.xpath(nav_xpath)
        verbose_print('{} pages found'.format(len(pages)))

        # ################################################################################
        # step 5. resolve trs for first page
        # ################################################################################
        trs = selector.xpath(rows_xpath)
        if not trs:
            raise IndexError(
                'no rows found in the proxy table for country {!r}'.format(country))
        key = [clean_text(th.text) for th in trs[0].xpath('./th')]

        proxies_list = parse_rows(trs[trskip:])
        verbose_print('page 1 done')

        # ################################################################################
        # step 6. resolve trs for the rest pages
        # ################################################################################
        # Pagination links are addressed as href="#2", "#3", ... in order.
        for i, _ in enumerate(pages, 2):

            # step 6-1. click nextpage's link
            try:
                element = WebDriverWait(browser, 1).until(
                    EC.presence_of_element_located((By.XPATH, nav_xpath + '[@href="#{}"]'.format(i))))
                verbose_print('<a href=#{0}> for page {0} found'.format(i))

                element.click()
                verbose_print('<a href=#{0}> for page {0} clicked'.format(i))
            except Exception:
                verbose_print('<a href=#{0}> for page {0} not found'.format(i))
                # The browser is still on the previous page; re-parsing it
                # would append that page's rows a second time.
                continue

            # step 6-2. resolve trs for ith page
            selector = etree.HTML(browser.page_source)
            trs = selector.xpath(rows_xpath)
            proxies_list.extend(parse_rows(trs[trskip:]))
            verbose_print('page {} done'.format(i))

        verbose_print('total {} proxies resolved'.format(len(proxies_list)))
    finally:
        # Always shut the PhantomJS process down, even when scraping fails.
        browser.quit()

    # ################################################################################
    # step 7. build proxies DataFrame
    # ################################################################################
    proxies = pd.DataFrame(proxies_list, columns=key)
    verbose_print('create proxies DataFrame')

    # ################################################################################
    # step 8. write out proxies.csv
    # ################################################################################
    proxies.to_csv(path_or_buf = path_or_buf, index = False)
    verbose_print('write proxies to csv as "{}"'.format(path_or_buf))

    return proxies

def proxy_pool(df, prefix = "http://", cat=":",col_ip="Ip Address", col_port="Port"):
    """Build proxy URL strings from a proxies DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least the ``col_ip`` and ``col_port`` columns.
    prefix : str
        Text placed before each address (the scheme by default).
    cat : str
        Separator placed between the address and the port.
    col_ip : str
        Name of the column holding the IP address.
    col_port : str
        Name of the column holding the port.

    Returns
    -------
    list of str
        One ``prefix + ip + cat + port`` string per row, in row order.
    """
    # Formatting (instead of ``+``) keeps this working when the port column
    # is numeric, e.g. after the CSV has been loaded back with pandas.
    return [
        "{}{}{}{}".format(prefix, ip, cat, port)
        for ip, port in zip(df[col_ip], df[col_port])
    ]

In [5]:
# Scrape with the default arguments (country 'Taiwan'); as a side effect the
# result is also written to 'proxies.csv'.
example = get_proxies()

button "Show Full List" found
button "Show Full List" clicked
resolve responsed page content
1 pages found
page 1 done
<a href=#2> for page 2 found
<a href=#2> for page 2 clicked
page 2 done
total 32 proxies resolved
create proxies DataFrame
write proxies to csv as "proxies.csv"


In [7]:
# Display the scraped proxies DataFrame.
example

Unnamed: 0,Last update,Ip Address,Port,Anonymity level,Country,City,Uptime (L/D),Response times
0,2m 43s ago,203.74.4.0,80,Elite,Taiwan,,/,130ms
1,2m 45s ago,203.74.4.3,80,Elite,Taiwan,,/,138ms
2,5m 26s ago,203.74.4.6,80,Elite,Taiwan,,/,50ms
3,5m 26s ago,203.74.4.5,80,Elite,Taiwan,,/,48ms
4,5m 26s ago,203.74.4.1,80,Elite,Taiwan,,/,41ms
5,5m 26s ago,203.74.4.2,80,Elite,Taiwan,,/,50ms
6,15m 9s ago,203.74.4.4,80,Elite,Taiwan,,/,39ms
7,15m 13s ago,203.74.4.7,80,Elite,Taiwan,,/,48ms
8,40m 41s ago,59.126.48.8,8080,Transparent,Taiwan,,/,204ms
9,40m 59s ago,220.143.172.28,3128,Transparent,Taiwan,,/,136ms


In [6]:
# Turn the "Ip Address" / "Port" columns into 'http://ip:port' strings.
proxy_pool(example)

['http://203.74.4.0:80',
 'http://203.74.4.3:80',
 'http://203.74.4.6:80',
 'http://203.74.4.5:80',
 'http://203.74.4.1:80',
 'http://203.74.4.2:80',
 'http://203.74.4.4:80',
 'http://203.74.4.7:80',
 'http://59.126.48.8:8080',
 'http://220.143.172.28:3128',
 'http://220.143.169.88:3128',
 'http://211.72.239.245:3128',
 'http://218.161.87.19:53281',
 'http://210.61.209.197:3128',
 'http://111.251.84.160:80',
 'http://180.176.101.45:3128',
 'http://140.114.79.118:80',
 'http://114.27.145.234:3128',
 'http://1.172.203.195:3128',
 'http://59.126.48.8:80',
 'http://60.249.19.50:8080',
 'http://114.27.142.188:3128',
 'http://61.223.116.213:3128',
 'http://36.238.194.87:8080',
 'http://36.234.126.39:3128',
 'http://61.58.209.237:53281',
 'http://114.39.50.232:3128',
 'http://61.223.86.98:53281',
 'http://36.238.119.237:80',
 'http://61.31.131.134:8998',
 'http://61.224.64.113:3128',
 'http://1.165.171.137:3128']