In [1]:
# Import 
import pandas as pd
from sqlalchemy import create_engine
from config import db_password, baidu_api
import requests
import json

In [2]:
# Load the raw (not yet cleaned) listing export of every city.
# The three frames are read in the order shanghai, beijing, guangzhou.
shanghai, beijing, guangzhou = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../Resources/Shanghai Selling Information (not Cleaned).csv',
        '../Resources/beijing Selling Information (not Cleaned).csv',
        '../Resources/guangzhou Selling Information (not Cleaned).csv',
    )
]

In [3]:
# The raw CSV headers are positional strings ('0', '1', ...). Positions 0-15
# are named identically for every city; only the tail differs, because the
# 'beijing' layout has 'HeatingMethod' at position 16 and no 'Record' column.
_SHARED_COLUMN_NAMES = {
    '0': 'HouseName',
    '1': 'District',
    '2': 'id',
    '3': 'TotalPrice',
    '4': 'Price',
    '5': 'ConstructionTime & Buildingtype',
    '6': 'HouseInfo',
    '7': 'Floor',
    '8': 'Square',
    '9': 'FloorType',
    '10': 'SquareInside',
    '11': 'BuildingType',
    '12': 'Orientation',
    '13': 'BuildingStructure',
    '14': 'RenovationCondition',
    '15': 'LadderRatio',
}
_DEFAULT_TAIL_COLUMN_NAMES = {
    '16': 'Elevator',
    '17': 'PostDate',
    '18': 'TransactionOwnership',
    '19': 'LastTimeTransaction',
    '20': 'HousePurpose',
    '21': 'FiveYearProperty',
    '22': 'PropertyOwnership',
    '23': 'Mortage',
    '24': 'Ownership',
    '25': 'Record',
}
_BEIJING_TAIL_COLUMN_NAMES = {
    '16': 'HeatingMethod',
    '17': 'Elevator',
    '18': 'PostDate',
    '19': 'TransactionOwnership',
    '20': 'LastTimeTransaction',
    '21': 'HousePurpose',
    '22': 'FiveYearProperty',
    '23': 'PropertyOwnership',
    '24': 'Mortage',
    '25': 'Ownership',
}
_DEFAULT_UNUSED_COLUMNS = [
    'SquareInside', 'Orientation', 'LadderRatio', 'PostDate',
    'TransactionOwnership', 'LastTimeTransaction', 'HousePurpose',
    'FiveYearProperty', 'PropertyOwnership', 'Mortage', 'Ownership', 'Record',
]
_BEIJING_UNUSED_COLUMNS = [
    'SquareInside', 'Orientation', 'LadderRatio', 'HeatingMethod', 'PostDate',
    'TransactionOwnership', 'LastTimeTransaction', 'HousePurpose',
    'FiveYearProperty',
]

# Character position inside 'HouseInfo' that feeds each room-count column.
_HOUSE_INFO_POSITIONS = {
    'BedRooms': 0,
    'LivingRooms': 2,
    'Kitchens': 4,
    'BathRooms': 6,
}

# Substring replacements applied to the three-character floor label.
_FLOOR_LABELS = {
    '顶层': 'Top',
    '底层': 'First Floor',
    '高楼层': 'High',
    '低楼层': 'Low',
    '中楼层': 'Middle',
    '地下室': 'Basement',
}

# Exact-value translations per column; values missing from a table become NaN.
_VALUE_TRANSLATIONS = {
    'RenovationCondition': {
        '毛坯': 'Rough',
        '简装': 'Simplicity',
        '精装': 'Hardcover',
        '其他': 'Other',
    },
    'FloorType': {
        '平层': 'Flat',
        '复式': 'Duplex',
        '错层': 'Split-Level',
        '暂无数据': 'Other',
    },
    'BuildingType': {
        '板楼': 'Plate',
        '塔楼': 'Tower',
        '板塔结合': 'Plate & Tower',
        '暂无数据': 'Other',
        '平房': 'Bungalow',
    },
    'BuildingStructure': {
        '钢混结构': 'Steel/Concrete',
        '砖混结构': 'Brick/Concrete',
        '混合结构': 'Mixed',
        '未知结构': 'Unknown',
        '砖木结构': 'Brick/Wood',
        '框架结构': 'Steel',
    },
    'Elevator': {
        '有': 'Yes',
        '无': 'No',
        '暂无数据': 'Unknown',
    },
}

_OUTPUT_COLUMNS = [
    'City', 'HouseName', 'Lng', 'Lat', 'District', 'id', 'TotalPrice', 'Price',
    'Floor', 'Square', 'FloorType', 'BuildingType', 'BuildingStructure',
    'RenovationCondition', 'Elevator', 'ConstructionTime', 'BedRooms',
    'LivingRooms', 'Kitchens', 'BathRooms',
]


def _char_or_zero(text, position):
    """Return ``text[position]``, or 0 when ``text`` is too short (or empty)."""
    return text[position] if len(text) > position else 0


def data_cleaning(City: str, x) -> pd.DataFrame:
    """Clean one city's raw listing table and return it in the common layout.

    Parameters
    ----------
    City : str
        Label written to the ``City`` column. The exact value ``'beijing'``
        selects the Beijing column layout; any other value selects the
        default layout.
    x : pandas.DataFrame or any input accepted by ``pd.DataFrame``
        Raw listings whose columns are named ``'0'`` .. ``'25'``. The object
        passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly the columns in ``_OUTPUT_COLUMNS``.
        ``Lng`` and ``Lat`` are empty-string placeholders. Rows whose
        ``FloorType`` or ``RenovationCondition`` value occurs fewer than
        three times are removed.

    Raises
    ------
    KeyError
        If an expected column is absent, including a required column that was
        removed by the missing-value column drop below.
    """
    frame = pd.DataFrame(x)

    # Rename the positional headers and discard the columns that are not used.
    if City != 'beijing':
        tail_names, unused_columns = _DEFAULT_TAIL_COLUMN_NAMES, _DEFAULT_UNUSED_COLUMNS
    else:
        tail_names, unused_columns = _BEIJING_TAIL_COLUMN_NAMES, _BEIJING_UNUSED_COLUMNS
    frame = frame.rename(columns={**_SHARED_COLUMN_NAMES, **tail_names})
    frame = frame.drop(columns=unused_columns)
    # NOTE(review): axis=1 removes every remaining *column* that contains a
    # missing value (not the affected rows). Behavior kept as-is; confirm that
    # dropping columns rather than rows is intended.
    frame = frame.dropna(axis=1)

    # Unit price: keep the first run of digits (the result stays a string).
    frame['Price'] = frame['Price'].str.extract(r'(\d+)', expand=False)
    # Total price is scaled by 10,000.
    # NOTE(review): presumably the source quotes totals in units of 10,000 - confirm.
    frame['TotalPrice'] = frame['TotalPrice'] * 10000

    # Construction year: first four characters of the combined field; the
    # "unknown year" marker is rewritten to the literal string 'NaN'.
    construction_year = frame['ConstructionTime & Buildingtype'].str[:4]
    frame['ConstructionTime'] = construction_year.str.replace('未知年建', 'NaN', regex=False)
    frame = frame.drop(columns='ConstructionTime & Buildingtype')

    # Room counts are single characters at fixed positions of HouseInfo.
    # A value too short to contain a position yields 0 instead of IndexError.
    for room_column, position in _HOUSE_INFO_POSITIONS.items():
        frame[room_column] = frame['HouseInfo'].map(
            lambda info, position=position: _char_or_zero(info, position)
        )
    frame = frame.drop(columns='HouseInfo')

    # Floor label: first three characters, translated to English.
    floor = frame['Floor'].map(lambda label: label[0:3] if label != "" else 0)
    for chinese, english in _FLOOR_LABELS.items():
        floor = floor.str.replace(chinese, english, regex=False)
    # Two-character labels pull one extra character into the slice; strip it
    # when it is whitespace so the result is 'Top' rather than 'Top '.
    frame['Floor'] = floor.str.strip()

    # Square: keep the leading decimal number (the result stays a string).
    frame['Square'] = frame['Square'].str.extract(r'(\d+\.?\d*)', expand=False)

    # Remove rows whose category value occurs fewer than three times.
    for category_column in ('FloorType', 'RenovationCondition'):
        frame = frame.groupby(category_column).filter(lambda group: len(group) > 2)

    # Translate the categorical values to English.
    for column, translation in _VALUE_TRANSLATIONS.items():
        frame[column] = frame[column].map(translation)

    # Placeholder coordinates, city label, and the final column order.
    frame['Lng'] = ''
    frame['Lat'] = ''
    frame['City'] = City
    return frame[_OUTPUT_COLUMNS]
    

In [4]:
# Replace each raw frame with its cleaned counterpart (same variable names).
# NOTE: this overwrites the raw frames, so the cell is not meant to be re-run
# without first re-running the cell that reads the CSV files.
shanghai, beijing, guangzhou = [
    data_cleaning(city_name, raw_listings)
    for city_name, raw_listings in (
        ('shanghai', shanghai),
        ('beijing', beijing),
        ('guangzhou', guangzhou),
    )
]

In [5]:
# Get Lat & Lng for shanghai, and save file
# Every listing is geocoded from its HouseName prefixed with the city name.
BAIDU_GEOCODE_URL = 'https://api.map.baidu.com/geocoding/v3/'  # HTTPS keeps the API key off the wire in clear text

Lng_Shanghai = []
Lat_Shanghai = []
for house_name in shanghai['HouseName']:
    # Pass the query via `params` so requests URL-encodes the address, and
    # set a timeout so one stalled request cannot hang the whole run.
    response = requests.get(
        BAIDU_GEOCODE_URL,
        params={'address': '上海市' + house_name, 'output': 'json', 'ak': baidu_api},
        timeout=10,
    )
    response.raise_for_status()
    # An unresolved address has no result/location entry; record None for it
    # instead of aborting with a KeyError and losing every earlier lookup.
    location = response.json().get('result', {}).get('location', {})
    Lng_Shanghai.append(location.get('lng'))
    Lat_Shanghai.append(location.get('lat'))

shanghai['Lng'] = Lng_Shanghai
shanghai['Lat'] = Lat_Shanghai

unresolved_count = sum(lng is None for lng in Lng_Shanghai)
if unresolved_count:
    print(f'{unresolved_count} of {len(Lng_Shanghai)} shanghai addresses could not be geocoded')

shanghai.to_csv('../Resources/Shanghai Selling Information (Cleaned).csv')

In [None]:
# Get Lat & Lng for beijing, and save file
# Every listing is geocoded from its HouseName prefixed with the city name.
BAIDU_GEOCODE_URL = 'https://api.map.baidu.com/geocoding/v3/'  # HTTPS keeps the API key off the wire in clear text

Lng_Beijing = []
Lat_Beijing = []
for house_name in beijing['HouseName']:
    # Pass the query via `params` so requests URL-encodes the address, and
    # set a timeout so one stalled request cannot hang the whole run.
    response = requests.get(
        BAIDU_GEOCODE_URL,
        params={'address': '北京市' + house_name, 'output': 'json', 'ak': baidu_api},
        timeout=10,
    )
    response.raise_for_status()
    # An unresolved address has no result/location entry; record None for it
    # instead of aborting with a KeyError and losing every earlier lookup.
    location = response.json().get('result', {}).get('location', {})
    Lng_Beijing.append(location.get('lng'))
    Lat_Beijing.append(location.get('lat'))

beijing['Lng'] = Lng_Beijing
beijing['Lat'] = Lat_Beijing

unresolved_count = sum(lng is None for lng in Lng_Beijing)
if unresolved_count:
    print(f'{unresolved_count} of {len(Lng_Beijing)} beijing addresses could not be geocoded')

beijing.to_csv('../Resources/Beijing Selling Information (Cleaned).csv')

In [None]:
# Get Lat & Lng for guangzhou, and save file
# Every listing is geocoded from its HouseName prefixed with the city name.
BAIDU_GEOCODE_URL = 'https://api.map.baidu.com/geocoding/v3/'  # HTTPS keeps the API key off the wire in clear text

Lng_Guangzhou = []
Lat_Guangzhou = []
for house_name in guangzhou['HouseName']:
    # Pass the query via `params` so requests URL-encodes the address, and
    # set a timeout so one stalled request cannot hang the whole run.
    response = requests.get(
        BAIDU_GEOCODE_URL,
        params={'address': '广州市' + house_name, 'output': 'json', 'ak': baidu_api},
        timeout=10,
    )
    response.raise_for_status()
    # An unresolved address has no result/location entry; record None for it
    # instead of aborting with a KeyError and losing every earlier lookup.
    location = response.json().get('result', {}).get('location', {})
    Lng_Guangzhou.append(location.get('lng'))
    Lat_Guangzhou.append(location.get('lat'))

guangzhou['Lng'] = Lng_Guangzhou
guangzhou['Lat'] = Lat_Guangzhou

unresolved_count = sum(lng is None for lng in Lng_Guangzhou)
if unresolved_count:
    print(f'{unresolved_count} of {len(Lng_Guangzhou)} guangzhou addresses could not be geocoded')

guangzhou.to_csv('../Resources/Guangzhou Selling Information (Cleaned).csv')

In [None]:
# To SQL
# The dialect name must be "postgresql": SQLAlchemy 1.4 removed the legacy
# "postgres" alias, and create_engine() then fails with NoSuchModuleError.
# NOTE(review): a password containing characters such as '@' or '/' must be
# URL-encoded before it is placed in the connection string.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Final Project"
engine = create_engine(db_string)

# Write one table per city; an existing table of the same name is replaced.
for table_name, city_listings in (
    ('selling_shanghai', shanghai),
    ('selling_beijing', beijing),
    ('selling_guangzhou', guangzhou),
):
    city_listings.to_sql(name=table_name, con=engine, if_exists='replace')