In [1]:
# coding: utf-8-sig
"""
A downloader that fetches data from a Chinese land data website.

Yuhao Zhu

20170411: Write functions to get urls and contents. Write codes to parse out information.
20170412: Write code to save information of all stocks to csv. 
"""

import requests, re, os, sys, csv, datetime, time
from bs4 import BeautifulSoup

Define functions.

In [2]:
def get_url_content(url):
    """
    Download the page at `url` and return its text.

    The response body is decoded as 'utf-8-sig'. The request uses a
    60-second timeout.

    If anything goes wrong while fetching or decoding (timeout, connection
    error, malformed URL, ...), the failure is reported on stdout, the URL
    is appended as a one-column row to 'error_report.csv' in the current
    working directory, and an empty string is returned so the caller can
    carry on with the next URL.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str
        The decoded page text, or "" when the download failed.
    """
    try:
        r = requests.get(url, timeout=60)
        r.encoding = 'utf-8-sig'
        url_content = r.text
    except Exception:
        # `except Exception` (not a bare `except:`) keeps the best-effort
        # behaviour for request errors but lets Ctrl-C / SystemExit stop
        # a long scraping run.
        print('Loading information from {} timeout!'.format(url))
        url_content = ""
        error_file_name = 'error_report.csv'
        # Mode 'a' creates the file when it does not exist yet, so no
        # separate exists-check is needed. The `with` block guarantees the
        # handle is closed even if the write itself fails.
        with open(error_file_name, 'a', newline='', encoding='utf-8-sig') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['{}'.format(url)])
    return url_content

In [3]:
def get_information(url_content):
    """
    Extract the company fields from one downloaded page.

    Two groups of values are collected, in this order:

    1. For every <td> element, the text found between '</span>' and
       '</td>' (cells without that pattern are skipped). The first such
       value is then reduced to the text between its first '>' and the
       following '</a>'.
    2. Every '<p>...</p>' body found inside the <div> elements whose class
       is 'sssjcon-information-box'.

    Parameters
    ----------
    url_content : str
        HTML text of the page.

    Returns
    -------
    list of str
        The <td> values followed by the <p> values.
        NOTE(review): presumably these line up with the CSV header columns
        used by the caller -- confirm against the live page layout.

    Raises
    ------
    IndexError
        If no <td> yields a value (e.g. `url_content` is empty) or the
        first value contains no '>...</a>' fragment. The main loop relies
        on this to detect pages without information.
    """
    soup = BeautifulSoup(url_content, 'lxml')
    business = soup.find_all('div', {'class': 'sssjcon-information-box'})
    data = []
    # Basic information
    for tag in soup.find_all('td'):
        entry = re.findall('</span>(.*?)</td>', str(tag))
        if entry:
            data.append(entry[0])
    # The first value still carries anchor markup; keep only its text.
    data[0] = re.findall('>(.*?)</a>', data[0])[0]
    # Business information (str() of the result set keeps the tags, which
    # the regex then strips down to paragraph bodies).
    business_information = re.findall('<p>(.*?)</p>', str(business))
    return data + business_information

The main file.

In [4]:
# Common prefix of every company page. `url` is rebound to each full page
# address in the loop below, so the prefix is kept under its own name.
url = "http://d.qianzhan.com/xdata/listedinfo/"
base_url = url

# Read one stock code per line from stock_list.txt.
stock_list = []
with open("stock_list.txt", 'r', newline='', encoding='utf-8-sig') as file_list:
    for row in file_list:
        # The file is opened with newline='', so '\r\n' endings are not
        # translated; strip() removes '\r' as well as '\n' so that no
        # stray carriage return ends up inside the URL.
        row = row.strip()
        # Skip blank lines; they would otherwise produce a '.html' URL
        # with no stock code.
        if row:
            stock_list.append(row)

# Build the page address for every stock code.
url_list = []
for stock in stock_list:
    url = '{}{}.html'.format(base_url, stock)
    url_list.append(url)

In [5]:
# Header row of the output CSV (18 column titles, written once before the
# per-company rows).
head_row = [
    '公司名称', '法人代表',
    '英文名称', '独立董事',
    '股票代码', '工商登记',
    '股票名称', '联系人',
    '所属行业', '联系电话',
    '注册日期', '电子邮箱',
    '注册资本', '公司网址',
    '注册地址', '主营业务',
    '经营范围', '公司历史',
]

In [6]:
# Download every company page, parse it, and write one CSV row per page
# that could be parsed. The output file is overwritten on every run.
file_name = 'Chinese_firms_information.csv'
with open(file_name, 'w', newline='', encoding='utf-8-sig') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(head_row)
    for url in url_list:
        url_content = get_url_content(url)
        try:
            data = get_information(url_content)
        except Exception:
            # Parsing failed (e.g. empty page after a download error).
            # Skip the row: writing here would repeat the previous
            # company's `data` under this URL, or raise NameError if the
            # very first page fails. `except Exception` still lets Ctrl-C
            # interrupt the run.
            print('No information from {}'.format(url))
            continue
        print('Get information from {}'.format(url))
        writer.writerow(data)

Get information from http://d.qianzhan.com/xdata/listedinfo/603665.html
Get information from http://d.qianzhan.com/xdata/listedinfo/603630.html
Get information from http://d.qianzhan.com/xdata/listedinfo/600212.html
Get information from http://d.qianzhan.com/xdata/listedinfo/300620.html
Get information from http://d.qianzhan.com/xdata/listedinfo/603098.html
Get information from http://d.qianzhan.com/xdata/listedinfo/002851.html
Get information from http://d.qianzhan.com/xdata/listedinfo/603578.html
Get information from http://d.qianzhan.com/xdata/listedinfo/300618.html
Get information from http://d.qianzhan.com/xdata/listedinfo/002497.html
Get information from http://d.qianzhan.com/xdata/listedinfo/603138.html
Get information from http://d.qianzhan.com/xdata/listedinfo/603955.html
Get information from http://d.qianzhan.com/xdata/listedinfo/300083.html
Get information from http://d.qianzhan.com/xdata/listedinfo/300310.html
Get information from http://d.qianzhan.com/xdata/listedinfo/6039