added_proxy_support #202

Closed · wants to merge 1 commit
advertools/code_recipes/example_crawl_1.jl: 1 addition, 0 deletions
@@ -0,0 +1 @@
{"url": "https://www.example.com", "title": "Example Domain", "viewport": "width=device-width, initial-scale=1", "charset": "utf-8", "h1": "Example Domain", "body_text": "This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission. More information...", "size": 1256, "proxy": "http://p.webshare.io:80", "download_timeout": 180.0, "download_slot": "www.example.com", "download_latency": 0.7258133888244629, "depth": 0, "status": 200, "links_url": "https://www.iana.org/domains/example", "links_text": "More information...", "links_nofollow": "False", "ip_address": "212.32.244.131", "crawl_time": "2022-04-21 16:18:30", "resp_headers_content-length": "648", "resp_headers_accept-ranges": "bytes", "resp_headers_age": "562646", "resp_headers_cache-control": "max-age=604800", "resp_headers_content-type": "text/html; charset=UTF-8", "resp_headers_date": "Thu, 21 Apr 2022 16:18:30 GMT", "resp_headers_etag": "\"3147526947+gzip\"", "resp_headers_expires": "Thu, 28 Apr 2022 16:18:30 GMT", "resp_headers_last-modified": "Thu, 17 Oct 2019 07:18:26 GMT", "resp_headers_server": "ECS (dcb/7F5D)", "resp_headers_vary": "Accept-Encoding", "resp_headers_x-cache": "HIT", "request_headers_accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "request_headers_accept-language": "en", "request_headers_user-agent": "advertools/0.14.0a8", "request_headers_accept-encoding": "gzip, deflate", "request_headers_proxy-authorization": "Basic bGxqbHVrdGUtR0ItTkwtcm90YXRlOjNyM2c4cXV6c2pwMQ=="}
advertools/code_recipes/spider_strategies.py: 23 additions, 0 deletions
@@ -266,3 +266,26 @@
==================== =================== ================================================================= ===============================================================================================================

"""

import advertools as adv
import pandas as pd

url_list = ["https://www.example.com"]
output_file = "example_crawl.jl"  # advertools expects a .jl (JSON lines) path
meta = {"proxy": "your_proxy_endpoint"}  # e.g. "http://user:pass@host:port"

adv.crawl(
    url_list,
    meta,
    output_file,
    follow_links=True,
    exclude_url_params=True,
    custom_settings={
        "ROBOTSTXT_OBEY": False,
        "CLOSESPIDER_TIMEOUT": 10,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 8,
        "CLOSESPIDER_PAGECOUNT": 1000,
        "DEPTH_LIMIT": 4,
    },
)

# The crawl output is JSON lines; every row now carries the proxy columns.
df = pd.read_json(output_file, lines=True)
print(df.columns)
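Under the hood, the meta dict passed here is forwarded to every Request the spider yields (see the spider.py diff below), which is how Scrapy's built-in HttpProxyMiddleware picks up the "proxy" key: credentials embedded in the proxy URL become the Proxy-Authorization header on each request. A minimal plain-Scrapy sketch of the same mechanism, with a placeholder endpoint and credentials:

import scrapy

class ProxiedSpider(scrapy.Spider):
    name = "proxied_example"
    start_urls = ["https://www.example.com"]

    def start_requests(self):
        for url in self.start_urls:
            # HttpProxyMiddleware reads meta["proxy"]; user:pass in the URL
            # is sent as the Proxy-Authorization request header.
            yield scrapy.Request(url, meta={"proxy": "http://user:pass@p.example.io:80"})

    def parse(self, response):
        yield {"url": response.url, "status": response.status}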
advertools/spider.py: 6 additions, 4 deletions
@@ -642,7 +642,7 @@ class SEOSitemapSpider(Spider):
     }


-    def __init__(self, url_list, follow_links=False,
+    def __init__(self, url_list, meta, follow_links=False,
                  allowed_domains=None,
                  exclude_url_params=None,
                  include_url_params=None,
@@ -652,6 +652,7 @@ def __init__(self, url_list, follow_links=False,
                  xpath_selectors=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.start_urls = json.loads(json.dumps(url_list.split(',')))
+        self.meta = eval(json.loads(json.dumps(meta)))
         self.allowed_domains = json.loads(json.dumps(allowed_domains.split(',')))
         self.follow_links = eval(json.loads(json.dumps(follow_links)))
         self.exclude_url_params = eval(json.loads(json.dumps(exclude_url_params)))
@@ -668,7 +669,7 @@ def __init__(self, url_list, follow_links=False,
     def start_requests(self):
         for url in self.start_urls:
             try:
-                yield Request(url, callback=self.parse, errback=self.errback)
+                yield Request(url, callback=self.parse, errback=self.errback, meta=self.meta)
             except Exception as e:
                 self.logger.error(repr(e))

@@ -814,12 +815,12 @@ def parse(self, response):
                         include_url_regex=self.include_url_regex)
                     if cond:
                         yield Request(page, callback=self.parse,
-                                      errback=self.errback)
+                                      errback=self.errback, meta=self.meta)
             # if self.skip_url_params and urlparse(page).query:
             #     continue


-def crawl(url_list, output_file, follow_links=False,
+def crawl(url_list, meta, output_file, follow_links=False,
           allowed_domains=None,
           exclude_url_params=None,
           include_url_params=None,
@@ -952,6 +953,7 @@ def crawl(url_list, output_file, follow_links=False,

     command = ['scrapy', 'runspider', spider_path,
                '-a', 'url_list=' + ','.join(url_list),
+               '-a', 'meta=' + json.dumps(meta),
                '-a', 'allowed_domains=' + ','.join(allowed_domains),
                '-a', 'follow_links=' + str(follow_links),
                '-a', 'exclude_url_params=' + str(exclude_url_params),
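One note on the round trip in this diff: crawl() serializes the dict with json.dumps before handing it to the spider subprocess, so the receiving __init__ could parse it back with json.loads alone; the eval(json.loads(json.dumps(meta))) expression instead executes the incoming string as Python, which works for this payload but is the fragile part of the change. A minimal sketch of the symmetric, eval-free round trip (values are illustrative):

import json

meta = {"proxy": "http://user:pass@p.example.io:80"}  # placeholder endpoint

# crawl() side: serialize the dict into the '-a meta=...' argument.
serialized = json.dumps(meta)

# Spider __init__ side: json.loads parses data without executing code.
restored = json.loads(serialized)
assert restored == meta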