Merge pull request #26 from yangguang760/master
Add futures data spiders
foolcage committed Aug 9, 2018
2 parents bbfa8d2 + a10d0ae commit 269a361
Showing 11 changed files with 590 additions and 4 deletions.
10 changes: 10 additions & 0 deletions cleanData.py
@@ -0,0 +1,10 @@
import os

import pandas as pd

from fooltrader.contract.files_contract import get_event_path  # assumed module path
from fooltrader.utils.utils import index_df_with_time  # assumed module path

# Remove cached finance_report files that can no longer be parsed, so they get re-crawled.
# stockIds is assumed to be defined elsewhere; see the sketch below.
for i in stockIds:
    path = get_event_path(i, 'finance_report')
    try:
        if os.path.exists(path):
            df = pd.read_csv(path)
            df = index_df_with_time(df, index='reportDate')
    except Exception as e:
        print(path)
        if os.path.exists(path):
            os.remove(path)
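stockIds is not defined in this snippet. A hedged sketch of how it might be populated, reusing get_security_list_path from sina_category_spider.py further down; the module path and the 'code' column name are assumptions:

import pandas as pd

from fooltrader.contract.files_contract import get_security_list_path  # assumed module path

# hypothetical: collect ids from the saved Shanghai/Shenzhen security lists
stockIds = pd.concat(
    pd.read_csv(get_security_list_path('stock', ex), dtype={'code': str})
    for ex in ('sh', 'sz')
)['code'].tolist()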
2 changes: 1 addition & 1 deletion fooltrader/api/technical.py
@@ -340,7 +340,7 @@ def get_trading_dates(security_item, dtype='list', ignore_today=False, source='163'):

 def kdata_exist(security_item, year, quarter, fuquan=None, source='163'):
     df = get_kdata(security_item, fuquan=fuquan, source=source)
-    if "{}Q{}".format(year, quarter) in df.index:
+    if pd.Period("{}Q{}".format(year, quarter)).end_time < df.index.max():
         return True
     return False
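The replaced membership test required the exact quarter period to appear in the index; the new comparison treats the quarter's k-data as existing whenever that quarter ends before the newest indexed date. A quick illustration of the new condition (pandas only; the index is assumed to hold timestamps):

import pandas as pd

idx = pd.to_datetime(['2018-06-29', '2018-07-02'])
# 2018Q2 ends at 2018-06-30 23:59:59.999999999, which is earlier than idx.max(),
# so k-data for 2018Q2 counts as present
print(pd.Period('2018Q2').end_time < idx.max())  # True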

64 changes: 64 additions & 0 deletions fooltrader/spiders/chinafuture/future_cffex_spider.py
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-

import os

import pandas as pd
import scrapy
from scrapy import Request

from fooltrader.contract.files_contract import get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureCffexSpider(scrapy.Spider):
    name = "future_cffex_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # dataType is normally supplied on the command line (scrapy crawl ... -a dataType=...);
        # default to None so the day-k branch runs when it is omitted
        self.dataType = getattr(self, 'dataType', None)
        self.trading_dates = None

    def start_requests(self):
        if self.dataType is None or self.dataType == 'dayk':
            daterange = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
            daterange = daterange[daterange.dayofweek < 5]  # weekdays only
            for i in daterange:
                the_dir = get_exchange_cache_path(security_type='future', exchange='cffex',
                                                  data_type='day_kdata', the_date=to_timestamp(i)) + ".csv"
                if not os.path.exists(the_dir):
                    # e.g. http://www.cffex.com.cn/sj/hqsj/rtj/201808/09/20180809_1.csv
                    yield Request(url="http://www.cffex.com.cn/sj/hqsj/rtj/" + i.strftime("%Y%m/%d/%Y%m%d") + "_1.csv",
                                  callback=self.download_cffex_history_data_file,
                                  meta={'filename': the_dir})
        elif self.dataType == 'inventory':
            daterange = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
            k = ['IF', 'IC', 'IH', 'T', 'TF']  # CFFEX index and treasury futures products
            daterange = daterange[daterange.dayofweek < 5]
            for i in daterange:
                for j in k:
                    the_dir = get_exchange_cache_path(security_type='future', exchange='cffex',
                                                      data_type='inventory', the_date=to_timestamp(i)) + j + ".csv"
                    if not os.path.exists(the_dir):
                        # e.g. http://www.cffex.com.cn/sj/ccpm/201808/09/IF_1.csv
                        yield Request(url="http://www.cffex.com.cn/sj/ccpm/" + i.strftime("%Y%m/%d/") + j + "_1.csv",
                                      callback=self.download_cffex_history_data_file,
                                      meta={'filename': the_dir})

    def download_cffex_history_data_file(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        # only persist responses that look like real data files; guard against a missing header
        if content_type_header and content_type_header.decode("utf-8") in ('application/zip', 'text/csv'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get cffex data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))
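All three new spiders branch on the dataType argument, which Scrapy passes through as a spider attribute (scrapy crawl future_cffex_spider -a dataType=inventory). A minimal sketch of driving one programmatically, assuming it is run from the fooltrader project root so get_project_settings() can locate the Scrapy settings:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from fooltrader.spiders.chinafuture.future_cffex_spider import FutureCffexSpider

# the keyword argument below is equivalent to -a dataType=inventory on the CLI
process = CrawlerProcess(get_project_settings())
process.crawl(FutureCffexSpider, dataType='inventory')
process.start()  # blocks until the crawl finishes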
84 changes: 84 additions & 0 deletions fooltrader/spiders/chinafuture/future_czce_spider.py
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-

import os

import pandas as pd
import scrapy
from scrapy import Request

from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureCzceSpider(scrapy.Spider):
    name = "future_czce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # dataType comes from the command line (-a dataType=...); default to the current-year day-k crawl
        self.dataType = getattr(self, 'dataType', None)
        self.trading_dates = None

    def start_requests(self):
        if self.dataType is None:
            # this year's daily k-data, one xls per weekday
            today = pd.Timestamp.today()
            for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1), end=today):
                the_dir = get_exchange_cache_path(security_type='future', exchange='czce',
                                                  the_date=to_timestamp(date), data_type='day_kdata') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_dir):
                    yield Request(url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/" + date.strftime("%Y/%Y%m%d") + "/FutureDataDaily.xls",
                                  callback=self.download_czce_kline_data,
                                  meta={'filename': the_dir})
        elif self.dataType == 'historyk':
            yield Request(url="http://www.czce.com.cn/portal/jysj/qhjysj/lshqxz/A09112017index_1.htm",
                          callback=self.download_czce_history_data)
        elif self.dataType == 'inventory':
            # position (inventory) data, going back roughly 450 weeks
            today = pd.Timestamp.today()
            for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=450), end=today):
                the_dir = get_exchange_cache_path(security_type='future', exchange='czce',
                                                  the_date=to_timestamp(date), data_type='inventory') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_dir):
                    yield Request(url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/" + date.strftime("%Y/%Y%m%d") + "/FutureDataHolding.xls",
                                  callback=self.download_czce_kline_data,
                                  meta={'filename': the_dir})

    def download_czce_kline_data(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in (
                'application/zip', 'text/csv', 'application/x-zip-compressed', 'application/excel'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get czce data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_czce_history_data(self, response):
        the_dir = get_exchange_cache_dir(security_type='future', exchange='czce')
        for filepath in response.xpath('//a[contains(@href,"zip")]').xpath('@href').extract():
            # prefix the archive name with its parent directory (presumably a year) unless that
            # directory is the generic "exchange" one, so yearly archives do not overwrite each other
            yield Request(url="http://www.czce.com.cn/" + filepath,
                          meta={'filename': os.path.join(
                              the_dir,
                              ("" if filepath.split("/")[-2] == "exchange" else filepath.split("/")[-2]) + filepath.split("/")[-1])},
                          callback=self.download_czce_history_data_file)

    def download_czce_history_data_file(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in (
                'application/zip', 'text/csv', 'application/x-zip-compressed'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get czce history data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))
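The filename expression in download_czce_history_data is dense: it prefixes each archive with its parent directory name unless that directory is the generic "exchange" one, so yearly archives don't collide. A worked example with hypothetical hrefs (actual paths on czce.com.cn may differ):

import os

the_dir = '/tmp/czce'  # stand-in for get_exchange_cache_dir(...)
for filepath in ('portal/exchange/datahistory.zip',
                 'portal/exchange/2017/datahistory.zip'):
    parent = filepath.split("/")[-2]
    name = ("" if parent == "exchange" else parent) + filepath.split("/")[-1]
    print(os.path.join(the_dir, name))
# -> /tmp/czce/datahistory.zip
# -> /tmp/czce/2017datahistory.zip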
120 changes: 120 additions & 0 deletions fooltrader/spiders/chinafuture/future_dce_spider.py
@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-

import os

import pandas as pd
import scrapy
from scrapy import Request, FormRequest

from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureDceSpider(scrapy.Spider):
    name = "future_dce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # dataType comes from the command line (-a dataType=...)
        self.dataType = getattr(self, 'dataType', None)

    def start_requests(self):
        if self.dataType is None:
            return self.request_currentyear_kdata()
        elif self.dataType == 'historyk':
            return self.request_history_kdata()
        elif self.dataType == 'inventory':
            return self.request_inventory_data()
        else:
            return self.request_currentyear_kdata()

    def request_inventory_data(self):
        # member deal/position data, going back roughly ten years (520 weeks)
        today = pd.Timestamp.today()
        requests = []
        for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
            the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
                                              the_date=to_timestamp(date), data_type="day_inventory") + '.zip'
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportMemberDealPosiQuotesBatchData.html",
                                            formdata={
                                                'batchExportFlag': 'batch',
                                                'contract.contract_id': 'all',
                                                'contract.variety_id': 'a',
                                                'year': str(date.year),
                                                'month': str(date.month - 1),  # the DCE form appears to use zero-based months
                                                'day': str(date.day),
                                                'memberDealPosiQuotes.trade_type': '0',
                                                'memberDealPosiQuotes.variety': 'all'
                                            },
                                            callback=self.download_dce_kline_data,
                                            meta={'filename': the_dir}))
        return requests

    def request_currentyear_kdata(self):
        # this year's daily quotes, exported as excel, one file per weekday
        today = pd.Timestamp.today()
        requests = []
        for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1), end=today):
            the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
                                              the_date=to_timestamp(date), data_type="day_kdata") + '.xls'
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html",
                                            formdata={
                                                'year': str(date.year),
                                                'month': str(date.month - 1),  # zero-based month, as above
                                                'day': str(date.day),
                                                'dayQuotes.trade_type': '0',
                                                'dayQuotes.variety': 'all',
                                                'exportType': 'excel'
                                            },
                                            callback=self.download_dce_kline_data,
                                            meta={'filename': the_dir}))
        return requests

    def request_history_kdata(self):
        return [Request(url="http://www.dce.com.cn/dalianshangpin/xqsj/lssj/index.html",
                        callback=self.download_dce_history_data)]

    def download_dce_history_data(self, response):
        the_dir = get_exchange_cache_dir(security_type='future', exchange='dce')
        for filepath in response.css('input').xpath('@rel').extract():
            yield Request(url="http://www.dce.com.cn/" + filepath,
                          meta={'filename': os.path.join(the_dir, filepath.split("/")[-1])},
                          callback=self.download_dce_history_data_file)

    def download_dce_kline_data(self, response):
        content_type_header = response.headers.get('content-type', None)
        if content_type_header is None:
            # Scrapy header lookup is case-insensitive, so this fallback should be
            # redundant; kept as committed
            content_type_header = response.headers.get('Content-Type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in (
                'application/zip', 'text/csv', 'application/octet-stream;charset=utf-8'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get dce kline data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_dce_history_data_file(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in ('application/zip', 'text/csv'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get dce history data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))
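The month - 1 in both form payloads looks like an off-by-one, but it appears deliberate: the DCE export form seems to expect zero-based months, Java-Calendar style (an inference from this commit, not from DCE documentation). What gets posted for a sample date:

import pandas as pd

date = pd.Timestamp('2018-08-09')
formdata = {'year': str(date.year), 'month': str(date.month - 1), 'day': str(date.day)}
print(formdata)  # {'year': '2018', 'month': '7', 'day': '9'}, i.e. 2018-08-09 on a zero-based form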
13 changes: 13 additions & 0 deletions fooltrader/spiders/chinafuture/future_shfe_spider.py
@@ -2,6 +2,7 @@
 
 import os
 from datetime import datetime
+import pandas as pd
 
 import scrapy
 from scrapy import Request
@@ -27,6 +28,15 @@ def __init__(self, name=None, **kwargs):
 
     def start_requests(self):
         self.trading_dates = self.settings.get("trading_dates")
+        # only fetch inventory when explicitly requested via -a dataType=inventory
+        if getattr(self, 'dataType', None) == 'inventory':
+            today = pd.Timestamp.today()
+            for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
+                the_dir = get_exchange_cache_path(security_type='future', exchange='shfe',
+                                                  the_date=to_timestamp(date), data_type='inventory') + '.json'
+                if date.dayofweek < 5 and not os.path.exists(the_dir):
+                    yield Request(url=self.get_day_inventory_url(the_date=date.strftime('%Y%m%d')),
+                                  meta={'the_date': date,
+                                        'the_path': the_dir},
+                                  callback=self.download_shfe_data_by_date)
 
         if self.trading_dates:
             # daily data
@@ -96,5 +106,8 @@ def get_year_k_data_url(self, the_year):
     def get_day_kdata_url(self, the_date):
         return 'http://www.shfe.com.cn/data/dailydata/kx/kx{}.dat'.format(the_date)
 
+    def get_day_inventory_url(self, the_date):
+        return 'http://www.shfe.com.cn/data/dailydata/kx/pm{}.dat'.format(the_date)
+
     def get_trading_date_url(self):
         return 'http://www.shfe.com.cn/bourseService/businessdata/calendar/20171201all.dat'
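The new inventory endpoint mirrors the existing daily k-data URL, swapping the kx file prefix for pm; derived from the code above, not from SHFE documentation:

url = 'http://www.shfe.com.cn/data/dailydata/kx/pm{}.dat'.format('20180809')
# -> 'http://www.shfe.com.cn/data/dailydata/kx/pm20180809.dat'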
2 changes: 1 addition & 1 deletion fooltrader/spiders/chinastock/sina_category_spider.py
@@ -98,4 +98,4 @@ def spider_closed(self, spider, reason):
             self.sh_df.to_csv(get_security_list_path('stock', 'sh'), index=False)
         if self.sz_df[self.category_type].any():
             self.sz_df.to_csv(get_security_list_path('stock', 'sz'), index=False)
-        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
+        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)

(The deleted and added lines are identical; the only change is the newly added trailing newline at the end of the file.)
Empty file.
