In [None]:
import time
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re

def _capture(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or "" if none."""
    found = re.search(pattern, text)
    return found.group(1) if found else ""


def house_scrap(house, writer):
    """Extract the fields of one listing element and write them as a CSV row.

    Args:
        house: A parsed listing element (the ``li`` nodes passed in by
            ``page_scrap``). It must contain a ``div.info-panel.clear`` with a
            link holding the title, a ``div.con`` with at least one link, and
            at least three ``div.div-cun`` elements.
        writer: An object with a ``writerow`` method (e.g. ``csv.writer``).

    The row is written in this column order:
        neighbourhood, district, area, square, square_10, room, lv_room,
        type, floor, remodel, facing, subline, subdist, policy,
        date, price, price_per_sqm

    Fields whose text pattern is not found in the listing are written as
    empty strings instead of aborting the scrape.

    Raises:
        AttributeError, IndexError: If one of the required elements listed
            above is missing from ``house``.
    """
    title = house.find("div", {"class": "info-panel clear"}).a.string

    # The title is expected to hold the neighbourhood name, an "N室N厅" layout
    # and an "N平米" size.
    neighbourhood = _capture(r"^([\w\-]+)", title)
    layout = _capture(r"([0-9]室[0-9]厅)", title)
    square = _capture(r"([0-9]+(?:\.[0-9]+)?)平米", title)
    # Keep the two counts in separate variables: the digit before 室 is the
    # room count and the digit before 厅 is the living-room count.
    room = _capture(r"([0-9])室", layout)
    lv_room = _capture(r"([0-9])厅", layout)

    con = house.find("div", {"class": "con"})
    zones = con.findAll("a")
    district = zones[0].string
    area = zones[1].string if len(zones) > 1 else ""

    building_type = ""
    floor = ""
    remodel = ""
    facing = ""
    # Each span in div.con is followed by a text node carrying one detail;
    # classify that text by the marker character it contains.
    for detail in con.findAll("span"):
        sibling = detail.next_sibling
        text = sibling.string if sibling is not None else None
        if text is None:
            continue
        item = text.strip()
        if "层" in item:
            # Forms such as "X区/N层" or "X层/N层": the leading character is
            # the type, the digits before the last 层 are the floor figure.
            if "区/" in item:
                building_type = _capture(r"^(.)区/", item)
            elif "层/" in item:
                building_type = _capture(r"^(.)层/", item)
            floor = _capture(r"([0-9]+)层", item)
        elif "装" in item:
            remodel = item
        elif "朝" in item:
            facing = item

    subline = ""
    subdist = ""
    policy = ""
    intro = house.find("div", {"class": "introduce"})
    if intro is not None:
        for span in intro.findAll("span"):
            text = span.string
            if text is None:
                continue
            line_no = _capture(r"([0-9]+)号线", text)
            if line_no:
                subline = line_no
                subdist = _capture(r"([0-9]+)米", text)
            elif "满" in text:
                policy = text.strip()

    houseinfo = house.findAll("div", {"class": "div-cun"})
    date = houseinfo[0].string
    price_per_sqm = houseinfo[1].contents[0]
    price = houseinfo[2].contents[0]

    # Size rounded to the nearest ten; left blank when no size was found.
    square_10 = round(pd.to_numeric(square), -1) if square else ""

    writer.writerow([
        neighbourhood, district, area, square, square_10,
        room, lv_room, building_type, floor, remodel, facing,
        subline, subdist, policy,
        date, price, price_per_sqm,
    ])

def page_scrap(url, writer):
    """Download one listing page and write a CSV row for every listing on it.

    Args:
        url: Address of the page to fetch.
        writer: Object with a ``writerow`` method, forwarded to ``house_scrap``.

    A page without a ``div.list-wrap`` container is reported on stdout and
    skipped rather than raising on ``None``.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        bsObj = BeautifulSoup(response, "html.parser")

    infos = bsObj.find("div", {"class": "list-wrap"})
    if infos is None:
        print("No listing container found at", url)
        return

    for house in infos.findAll("li"):
        house_scrap(house, writer)
        
# Output CSV path, base URL of the paginated listing (the page number is
# appended to it), and the last page number to fetch.
file = "chengjiao.csv"
url = "http://sh.lianjia.com/chengjiao/d"
n_pages = 400

start = time.time()

with open(file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # Header columns are listed in the same order as the row that
    # house_scrap writes, so every value lands under its own label.
    writer.writerow(["neighbourhood", "district", "area", "square", "square_10",
                     "room", "lv_room", "type", "floor", "remodel", "facing",
                     "subline", "subdist", "policy",
                     "date", "price", "price_per_sqm"])
    # Pages are numbered from 1; this run covers pages 231 .. n_pages.
    for i in range(230, n_pages):
        # The record range printed here assumes 20 listings per page.
        print("Page", i+1, "| Record", 20*i+1, "~", 20*(i+1))
        page_scrap(url + str(i + 1), writer)

end = time.time()
# Elapsed wall-clock time in seconds.
print(end - start)