Merge pull request #26 from yangguang760/master
Add futures data spiders
foolcage committed Aug 9, 2018
2 parents bbfa8d2 + a10d0ae commit 269a361
Showing 11 changed files with 590 additions and 4 deletions.
10 changes: 10 additions & 0 deletions cleanData.py
@@ -0,0 +1,10 @@
import os

import pandas as pd

from fooltrader.contract.files_contract import get_event_path  # assumed module path
from fooltrader.utils.utils import index_df_with_time  # assumed module path

# Remove cached finance_report files that can no longer be parsed, so they get re-crawled.
# stockIds is assumed to be defined elsewhere; see the sketch below.
for i in stockIds:
    path = get_event_path(i, 'finance_report')
    try:
        if os.path.exists(path):
            df = pd.read_csv(path)
            df = index_df_with_time(df, index='reportDate')
    except Exception as e:
        print(path)
        if os.path.exists(path):
            os.remove(path)
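stockIds is not defined in this snippet. A hedged sketch of how it might be populated, reusing get_security_list_path from sina_category_spider.py further down; the module path and the 'code' column name are assumptions:

import pandas as pd

from fooltrader.contract.files_contract import get_security_list_path  # assumed module path

# hypothetical: collect ids from the saved Shanghai/Shenzhen security lists
stockIds = pd.concat(
    pd.read_csv(get_security_list_path('stock', ex), dtype={'code': str})
    for ex in ('sh', 'sz')
)['code'].tolist()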
2 changes: 1 addition & 1 deletion fooltrader/api/technical.py
@@ -340,7 +340,7 @@ def get_trading_dates(security_item, dtype='list', ignore_today=False, source='163'):

 def kdata_exist(security_item, year, quarter, fuquan=None, source='163'):
     df = get_kdata(security_item, fuquan=fuquan, source=source)
-    if "{}Q{}".format(year, quarter) in df.index:
+    if pd.Period("{}Q{}".format(year, quarter)).end_time < df.index.max():
         return True
     return False
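The replaced membership test required the exact quarter period to appear in the index; the new comparison treats the quarter's k-data as existing whenever that quarter ends before the newest indexed date. A quick illustration of the new condition (pandas only; the index is assumed to hold timestamps):

import pandas as pd

idx = pd.to_datetime(['2018-06-29', '2018-07-02'])
# 2018Q2 ends at 2018-06-30 23:59:59.999999999, which is earlier than idx.max(),
# so k-data for 2018Q2 counts as present
print(pd.Period('2018Q2').end_time < idx.max())  # True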

64 changes: 64 additions & 0 deletions fooltrader/spiders/chinafuture/future_cffex_spider.py
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-

import os

import pandas as pd
import scrapy
from scrapy import Request

from fooltrader.contract.files_contract import get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureCffexSpider(scrapy.Spider):
    name = "future_cffex_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # dataType is normally supplied on the command line (scrapy crawl ... -a dataType=...);
        # default to None so the day-k branch runs when it is omitted
        self.dataType = getattr(self, 'dataType', None)
        self.trading_dates = None

    def start_requests(self):
        if self.dataType is None or self.dataType == 'dayk':
            daterange = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
            daterange = daterange[daterange.dayofweek < 5]  # weekdays only
            for i in daterange:
                the_dir = get_exchange_cache_path(security_type='future', exchange='cffex',
                                                  data_type='day_kdata', the_date=to_timestamp(i)) + ".csv"
                if not os.path.exists(the_dir):
                    # e.g. http://www.cffex.com.cn/sj/hqsj/rtj/201808/09/20180809_1.csv
                    yield Request(url="http://www.cffex.com.cn/sj/hqsj/rtj/" + i.strftime("%Y%m/%d/%Y%m%d") + "_1.csv",
                                  callback=self.download_cffex_history_data_file,
                                  meta={'filename': the_dir})
        elif self.dataType == 'inventory':
            daterange = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
            k = ['IF', 'IC', 'IH', 'T', 'TF']  # CFFEX index and treasury futures products
            daterange = daterange[daterange.dayofweek < 5]
            for i in daterange:
                for j in k:
                    the_dir = get_exchange_cache_path(security_type='future', exchange='cffex',
                                                      data_type='inventory', the_date=to_timestamp(i)) + j + ".csv"
                    if not os.path.exists(the_dir):
                        # e.g. http://www.cffex.com.cn/sj/ccpm/201808/09/IF_1.csv
                        yield Request(url="http://www.cffex.com.cn/sj/ccpm/" + i.strftime("%Y%m/%d/") + j + "_1.csv",
                                      callback=self.download_cffex_history_data_file,
                                      meta={'filename': the_dir})

    def download_cffex_history_data_file(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        # only persist responses that look like real data files; guard against a missing header
        if content_type_header and content_type_header.decode("utf-8") in ('application/zip', 'text/csv'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get cffex data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))
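All three new spiders branch on the dataType argument, which Scrapy passes through as a spider attribute (scrapy crawl future_cffex_spider -a dataType=inventory). A minimal sketch of driving one programmatically, assuming it is run from the fooltrader project root so get_project_settings() can locate the Scrapy settings:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from fooltrader.spiders.chinafuture.future_cffex_spider import FutureCffexSpider

# the keyword argument below is equivalent to -a dataType=inventory on the CLI
process = CrawlerProcess(get_project_settings())
process.crawl(FutureCffexSpider, dataType='inventory')
process.start()  # blocks until the crawl finishes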
84 changes: 84 additions & 0 deletions fooltrader/spiders/chinafuture/future_czce_spider.py
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-

import os

import pandas as pd
import scrapy
from scrapy import Request

from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureCzceSpider(scrapy.Spider):
    name = "future_czce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # dataType comes from the command line (-a dataType=...); default to the current-year day-k crawl
        self.dataType = getattr(self, 'dataType', None)
        self.trading_dates = None

    def start_requests(self):
        if self.dataType is None:
            # this year's daily k-data, one xls per weekday
            today = pd.Timestamp.today()
            for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1), end=today):
                the_dir = get_exchange_cache_path(security_type='future', exchange='czce',
                                                  the_date=to_timestamp(date), data_type='day_kdata') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_dir):
                    yield Request(url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/" + date.strftime("%Y/%Y%m%d") + "/FutureDataDaily.xls",
                                  callback=self.download_czce_kline_data,
                                  meta={'filename': the_dir})
        elif self.dataType == 'historyk':
            yield Request(url="http://www.czce.com.cn/portal/jysj/qhjysj/lshqxz/A09112017index_1.htm",
                          callback=self.download_czce_history_data)
        elif self.dataType == 'inventory':
            # position (inventory) data, going back roughly 450 weeks
            today = pd.Timestamp.today()
            for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=450), end=today):
                the_dir = get_exchange_cache_path(security_type='future', exchange='czce',
                                                  the_date=to_timestamp(date), data_type='inventory') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_dir):
                    yield Request(url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/" + date.strftime("%Y/%Y%m%d") + "/FutureDataHolding.xls",
                                  callback=self.download_czce_kline_data,
                                  meta={'filename': the_dir})

    def download_czce_kline_data(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in (
                'application/zip', 'text/csv', 'application/x-zip-compressed', 'application/excel'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get czce data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_czce_history_data(self, response):
        the_dir = get_exchange_cache_dir(security_type='future', exchange='czce')
        for filepath in response.xpath('//a[contains(@href,"zip")]').xpath('@href').extract():
            # prefix the archive name with its parent directory (presumably a year) unless that
            # directory is the generic "exchange" one, so yearly archives do not overwrite each other
            yield Request(url="http://www.czce.com.cn/" + filepath,
                          meta={'filename': os.path.join(
                              the_dir,
                              ("" if filepath.split("/")[-2] == "exchange" else filepath.split("/")[-2]) + filepath.split("/")[-1])},
                          callback=self.download_czce_history_data_file)

    def download_czce_history_data_file(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in (
                'application/zip', 'text/csv', 'application/x-zip-compressed'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get czce history data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))
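The filename expression in download_czce_history_data is dense: it prefixes each archive with its parent directory name unless that directory is the generic "exchange" one, so yearly archives don't collide. A worked example with hypothetical hrefs (actual paths on czce.com.cn may differ):

import os

the_dir = '/tmp/czce'  # stand-in for get_exchange_cache_dir(...)
for filepath in ('portal/exchange/datahistory.zip',
                 'portal/exchange/2017/datahistory.zip'):
    parent = filepath.split("/")[-2]
    name = ("" if parent == "exchange" else parent) + filepath.split("/")[-1]
    print(os.path.join(the_dir, name))
# -> /tmp/czce/datahistory.zip
# -> /tmp/czce/2017datahistory.zip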
120 changes: 120 additions & 0 deletions fooltrader/spiders/chinafuture/future_dce_spider.py
@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-

import os

import pandas as pd
import scrapy
from scrapy import Request, FormRequest

from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureDceSpider(scrapy.Spider):
    name = "future_dce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # dataType comes from the command line (-a dataType=...)
        self.dataType = getattr(self, 'dataType', None)

    def start_requests(self):
        if self.dataType is None:
            return self.request_currentyear_kdata()
        elif self.dataType == 'historyk':
            return self.request_history_kdata()
        elif self.dataType == 'inventory':
            return self.request_inventory_data()
        else:
            return self.request_currentyear_kdata()

    def request_inventory_data(self):
        # member deal/position data, going back roughly ten years (520 weeks)
        today = pd.Timestamp.today()
        requests = []
        for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
            the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
                                              the_date=to_timestamp(date), data_type="day_inventory") + '.zip'
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportMemberDealPosiQuotesBatchData.html",
                                            formdata={
                                                'batchExportFlag': 'batch',
                                                'contract.contract_id': 'all',
                                                'contract.variety_id': 'a',
                                                'year': str(date.year),
                                                'month': str(date.month - 1),  # the DCE form appears to use zero-based months
                                                'day': str(date.day),
                                                'memberDealPosiQuotes.trade_type': '0',
                                                'memberDealPosiQuotes.variety': 'all'
                                            },
                                            callback=self.download_dce_kline_data,
                                            meta={'filename': the_dir}))
        return requests

    def request_currentyear_kdata(self):
        # this year's daily quotes, exported as excel, one file per weekday
        today = pd.Timestamp.today()
        requests = []
        for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1), end=today):
            the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
                                              the_date=to_timestamp(date), data_type="day_kdata") + '.xls'
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html",
                                            formdata={
                                                'year': str(date.year),
                                                'month': str(date.month - 1),  # zero-based month, as above
                                                'day': str(date.day),
                                                'dayQuotes.trade_type': '0',
                                                'dayQuotes.variety': 'all',
                                                'exportType': 'excel'
                                            },
                                            callback=self.download_dce_kline_data,
                                            meta={'filename': the_dir}))
        return requests

    def request_history_kdata(self):
        return [Request(url="http://www.dce.com.cn/dalianshangpin/xqsj/lssj/index.html",
                        callback=self.download_dce_history_data)]

    def download_dce_history_data(self, response):
        the_dir = get_exchange_cache_dir(security_type='future', exchange='dce')
        for filepath in response.css('input').xpath('@rel').extract():
            yield Request(url="http://www.dce.com.cn/" + filepath,
                          meta={'filename': os.path.join(the_dir, filepath.split("/")[-1])},
                          callback=self.download_dce_history_data_file)

    def download_dce_kline_data(self, response):
        content_type_header = response.headers.get('content-type', None)
        if content_type_header is None:
            # Scrapy header lookup is case-insensitive, so this fallback should be
            # redundant; kept as committed
            content_type_header = response.headers.get('Content-Type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in (
                'application/zip', 'text/csv', 'application/octet-stream;charset=utf-8'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get dce kline data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_dce_history_data_file(self, response):
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        if content_type_header and content_type_header.decode("utf-8") in ('application/zip', 'text/csv'):
            with open(the_path, "wb") as f:
                f.write(response.body)
                f.flush()
        else:
            self.logger.error(
                "get dce history data failed: the_path={} url={} content type={}".format(
                    the_path,
                    response.url,
                    content_type_header))
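The month - 1 in both form payloads looks like an off-by-one, but it appears deliberate: the DCE export form seems to expect zero-based months, Java-Calendar style (an inference from this commit, not from DCE documentation). What gets posted for a sample date:

import pandas as pd

date = pd.Timestamp('2018-08-09')
formdata = {'year': str(date.year), 'month': str(date.month - 1), 'day': str(date.day)}
print(formdata)  # {'year': '2018', 'month': '7', 'day': '9'}, i.e. 2018-08-09 on a zero-based form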
13 changes: 13 additions & 0 deletions fooltrader/spiders/chinafuture/future_shfe_spider.py
@@ -2,6 +2,7 @@
 
 import os
 from datetime import datetime
+import pandas as pd
 
 import scrapy
 from scrapy import Request
@@ -27,6 +28,15 @@ def __init__(self, name=None, **kwargs):
 
     def start_requests(self):
         self.trading_dates = self.settings.get("trading_dates")
+        # only fetch inventory when explicitly requested via -a dataType=inventory
+        if getattr(self, 'dataType', None) == 'inventory':
+            today = pd.Timestamp.today()
+            for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
+                the_dir = get_exchange_cache_path(security_type='future', exchange='shfe',
+                                                  the_date=to_timestamp(date), data_type='inventory') + '.json'
+                if date.dayofweek < 5 and not os.path.exists(the_dir):
+                    yield Request(url=self.get_day_inventory_url(the_date=date.strftime('%Y%m%d')),
+                                  meta={'the_date': date,
+                                        'the_path': the_dir},
+                                  callback=self.download_shfe_data_by_date)
 
         if self.trading_dates:
             # daily data
@@ -96,5 +106,8 @@ def get_year_k_data_url(self, the_year):
     def get_day_kdata_url(self, the_date):
         return 'http://www.shfe.com.cn/data/dailydata/kx/kx{}.dat'.format(the_date)
 
+    def get_day_inventory_url(self, the_date):
+        return 'http://www.shfe.com.cn/data/dailydata/kx/pm{}.dat'.format(the_date)
+
     def get_trading_date_url(self):
         return 'http://www.shfe.com.cn/bourseService/businessdata/calendar/20171201all.dat'
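The new inventory endpoint mirrors the existing daily k-data URL, swapping the kx file prefix for pm; derived from the code above, not from SHFE documentation:

url = 'http://www.shfe.com.cn/data/dailydata/kx/pm{}.dat'.format('20180809')
# -> 'http://www.shfe.com.cn/data/dailydata/kx/pm20180809.dat'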
2 changes: 1 addition & 1 deletion fooltrader/spiders/chinastock/sina_category_spider.py
@@ -98,4 +98,4 @@ def spider_closed(self, spider, reason):
             self.sh_df.to_csv(get_security_list_path('stock', 'sh'), index=False)
         if self.sz_df[self.category_type].any():
             self.sz_df.to_csv(get_security_list_path('stock', 'sz'), index=False)
-        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
+        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)

(The deleted and added lines are identical; the only change is the newly added trailing newline at the end of the file.)
Empty file.
