In [1]:
import pandas as pd

In [2]:
# Location of the raw community CSV, relative to the notebook's working directory.
data_path = '../anjuke_community_original.csv'
# Decode the file as GBK while loading it into a DataFrame.
df = pd.read_csv(data_path, encoding='gbk')
# Preview the first two rows.
df.head(2)

Unnamed: 0,community_id,community_name,average_price,url,address,age,longitude,latitude,community_id.1,property_type,...,level_one_num,level_comm_num,school_score,nursery_num,junior_num,middle_num,commerce_score,restaurant_num,bank_num,supermarket_num
0,1,莲园小区,57483 元/平米,https://shanghai.anjuke.com/community/view/1/,［浦东-北蔡］莲园路518弄,1996年,31.19201,121.564893,1,公寓,...,1,237,7,5,3,2,5,306,54,128
1,100,碧云国际社区晓园,84345 元/平米,https://shanghai.anjuke.com/community/view/100/,［浦东-碧云］红枫路358弄,2006年,31.247488,121.598943,100,公寓,...,0,0,5,4,1,2,6,189,24,35


In [3]:
# List the column labels of the freshly loaded frame.
df.columns

Index(['community_id', 'community_name', 'average_price', 'url', 'address',
       'age', 'longitude', 'latitude', 'community_id.1', 'property_type',
       'total_construction_area', 'volume_rate', 'developer',
       'property_company', 'property_fee', 'total_houses', 'parking_space',
       'green_rate', 'greet', 'comfort', 'traffic_score', 'metro_station_num',
       'bus_station_num', 'hospital_score', 'level_three_num', 'level_two_num',
       'level_one_num', 'level_comm_num', 'school_score', 'nursery_num',
       'junior_num', 'middle_num', 'commerce_score', 'restaurant_num',
       'bank_num', 'supermarket_num'],
      dtype='object')

In [4]:
# Strip the unit from the price: keep only the text before the first space
# (e.g. '57483 元/平米' -> '57483').
# The vectorised .str accessor passes missing / non-string cells through as
# NaN, whereas a per-row `x.split(...)` lambda raises AttributeError on them.
df['average_price'] = df['average_price'].str.split(' ').str[0]

In [5]:
# Drop the duplicated 'community_id.1' column.
# errors='ignore' keeps this cell re-runnable: `del df[...]` raises KeyError
# once the column is already gone.
df = df.drop(columns=['community_id.1'], errors='ignore')

In [6]:
# Peek at the raw construction-area strings before cleaning (head() defaults to 5 rows).
df['total_construction_area'].head()

0     3000m?
1    50000m?
2    26300m?
3    30000m?
4       暂无数据
Name: total_construction_area, dtype: object

In [7]:
# 去除m后面的？，以及暂无数据转为numpy的nan
from numpy import nan as NA
def convert_total_construction_area(x):
    """Strip the unit suffix from one raw construction-area value.

    Parameters
    ----------
    x : object
        A raw cell value such as ``'3000m?'`` (the ``?`` is presumably a
        mis-decoded square sign -- TODO confirm against the source file), the
        placeholder ``'暂无数据'``, or a missing value (NaN / None).

    Returns
    -------
    str or float
        For strings containing ``'m'``: the input with every trailing ``m``,
        ``?`` and ``²`` character removed (still a string, e.g. ``'3000'``).
        For anything else -- strings without ``'m'`` and non-string values --
        ``NA`` (numpy NaN).
    """
    # Non-string cells (NaN, None, numbers) cannot be searched with `in`;
    # treat them as missing instead of raising TypeError.
    if not isinstance(x, str):
        return NA
    if 'm' in x:
        # rstrip takes a *set* of characters; '²' is included so a correctly
        # decoded square sign is removed as well.
        return x.rstrip('m?²')
    return NA

In [8]:
# Clean the area column element by element with convert_total_construction_area.
# NOTE: run this once per load -- the converter maps any value without an 'm'
# to NaN, so re-running it on already-cleaned values blanks the column.
df['total_construction_area'] = df['total_construction_area'].map(convert_total_construction_area)

In [9]:
# Check the first five cleaned values.
df['total_construction_area'].head(5)

0     3000
1    50000
2    26300
3    30000
4      NaN
Name: total_construction_area, dtype: object

In [10]:
# Splitting the address: look at the raw value in the row labelled 0 first.
df.loc[0, 'address']

'［浦东-北蔡］莲园路518弄'

In [11]:
def convert_address(ser_obj):
    """Split raw '［district-township］street' strings into three parallel lists.

    Parameters
    ----------
    ser_obj : iterable
        Raw address values such as ``'［浦东-北蔡］莲园路518弄'``. A pandas
        Series with any index, or a plain list, is accepted.

    Returns
    -------
    tuple of (list, list, list)
        ``(district, township, address)``, each in input order and of the same
        length as the input. For a well-formed value the district is the text
        before the first ``-`` inside the brackets, the township is the text
        after it, and the address is everything after the first ``］``.
        Missing pieces are reported as ``NA`` (numpy NaN) instead of raising:

        * non-string value -> ``(NA, NA, NA)``
        * no ``］`` in the value -> ``(NA, NA, value)``
        * no ``-`` inside the brackets -> township is ``NA``
    """
    district = []
    township = []
    address = []
    # Iterate over the values themselves. Indexing with ser_obj[i] is a label
    # lookup on a Series and fails whenever the index is not 0..n-1.
    for raw in ser_obj:
        if not isinstance(raw, str):
            district.append(NA)
            township.append(NA)
            address.append(NA)
            continue

        prefix, closing, street = raw.partition('］')
        if not closing:
            # No bracketed prefix at all: keep the text as the street address.
            district.append(NA)
            township.append(NA)
            address.append(raw)
            continue

        parts = prefix.lstrip('［').split('-')
        district.append(parts[0])
        township.append(parts[1] if len(parts) > 1 else NA)
        address.append(street)
    return district, township, address

In [12]:
# Split the raw address column into its three components.
district, township, address = convert_address(df['address'])
# Wrap each list in a Series (default 0..n-1 index) so the assignment below
# aligns on the frame's index.
dist_ser_obj, town_ser_obj, add_ser_obj = (
    pd.Series(part) for part in (district, township, address)
)
# Add 'district' and 'township' as new columns and overwrite 'address' with
# the street part only.
df = df.assign(district=dist_ser_obj, township=town_ser_obj, address=add_ser_obj)

In [13]:
# Confirm the new district / township columns on the first three rows.
df.head(n=3)

Unnamed: 0,community_id,community_name,average_price,url,address,age,longitude,latitude,property_type,total_construction_area,...,school_score,nursery_num,junior_num,middle_num,commerce_score,restaurant_num,bank_num,supermarket_num,district,township
0,1,莲园小区,57483,https://shanghai.anjuke.com/community/view/1/,莲园路518弄,1996年,31.19201,121.564893,公寓,3000,...,7,5,3,2,5,306,54,128,浦东,北蔡
1,100,碧云国际社区晓园,84345,https://shanghai.anjuke.com/community/view/100/,红枫路358弄,2006年,31.247488,121.598943,公寓,50000,...,5,4,1,2,6,189,24,35,浦东,碧云
2,1000,闵行水仙苑,57626,https://shanghai.anjuke.com/community/view/1000/,畹町路500弄,1998年,31.111473,121.406853,公寓,26300,...,0,0,0,0,0,0,0,0,闵行,春申


In [14]:
# Keep only the part of the property fee before the first '元'.
# The .str accessor leaves missing / non-string cells as NaN, where a per-row
# `x.split(...)` lambda raises AttributeError.
df['property_fee'] = df['property_fee'].str.split('元').str[0]

In [15]:
# Keep only the part of the household count before the first '户'.
# The .str accessor leaves missing / non-string cells as NaN, where a per-row
# `x.split(...)` lambda raises AttributeError.
df['total_houses'] = df['total_houses'].str.split('户').str[0]

In [16]:
# Sanity-check the cleaned columns on a small sample; ten rows are enough to
# eyeball the result without bloating the notebook output.
df.head(10)

Unnamed: 0,community_id,community_name,average_price,url,address,age,longitude,latitude,property_type,total_construction_area,...,school_score,nursery_num,junior_num,middle_num,commerce_score,restaurant_num,bank_num,supermarket_num,district,township
0,1,莲园小区,57483,https://shanghai.anjuke.com/community/view/1/,莲园路518弄,1996年,31.192010,121.564893,公寓,3000,...,7,5,3,2,5,306,54,128,浦东,北蔡
1,100,碧云国际社区晓园,84345,https://shanghai.anjuke.com/community/view/100/,红枫路358弄,2006年,31.247488,121.598943,公寓,50000,...,5,4,1,2,6,189,24,35,浦东,碧云
2,1000,闵行水仙苑,57626,https://shanghai.anjuke.com/community/view/1000/,畹町路500弄,1998年,31.111473,121.406853,公寓,26300,...,0,0,0,0,0,0,0,0,闵行,春申
3,10000,虹桥向日葵公寓,66057,https://shanghai.anjuke.com/community/view/10000/,中山西路669弄,1996年,31.217635,121.420373,公寓,30000,...,9,0,0,0,7,161,151,188,长宁,中山公园
4,1002,仙霞路486弄,51321,https://shanghai.anjuke.com/community/view/1002/,仙霞路486弄,1994年,31.211309,121.398806,公寓,,...,8,0,0,0,10,0,98,97,长宁,仙霞
5,10020,中创大厦,17158,https://shanghai.anjuke.com/community/view/10020/,南京西路819号,1997年,31.236495,121.467344,其它,12800,...,0,0,0,0,0,0,0,0,静安,南京西路
6,1003,海逸公寓,77863,https://shanghai.anjuke.com/community/view/1003/,威宁路455号,2005年,31.221992,121.391314,公寓,110000,...,6,6,2,3,8,172,46,62,长宁,天山
7,10038,廖创兴金融中心,97639,https://shanghai.anjuke.com/community/view/10038/,南京西路288号,暂无数据,31.237466,121.476890,其它,70000,...,0,0,0,0,0,0,0,0,静安,南京西路
8,10039,天安别墅,37015,https://shanghai.anjuke.com/community/view/10039/,嘉松南路3888弄,2014年,31.101672,121.239165,别墅,110000,...,0,0,0,0,4,37,1,4,松江,佘山
9,1004,虹桥万博花园一期,81518,https://shanghai.anjuke.com/community/view/1004/,古北路69弄,2004年,31.223425,121.406764,公寓,120000,...,8,0,0,0,8,305,67,128,长宁,天山


In [17]:
# Turn both missing-value markers -- the '暂无数据' text and the '-999'
# string -- into NaN. Each replace() returns a new frame, so `df` itself is
# left unchanged.
df_new = df.replace('暂无数据', NA).replace('-999', NA)

In [18]:
# Verify the placeholder replacement on a small sample; ten rows are enough
# to eyeball the result without bloating the notebook output.
df_new.head(10)

Unnamed: 0,community_id,community_name,average_price,url,address,age,longitude,latitude,property_type,total_construction_area,...,school_score,nursery_num,junior_num,middle_num,commerce_score,restaurant_num,bank_num,supermarket_num,district,township
0,1,莲园小区,57483,https://shanghai.anjuke.com/community/view/1/,莲园路518弄,1996年,31.192010,121.564893,公寓,3000,...,7.0,5.0,3.0,2.0,5.0,306.0,54.0,128.0,浦东,北蔡
1,100,碧云国际社区晓园,84345,https://shanghai.anjuke.com/community/view/100/,红枫路358弄,2006年,31.247488,121.598943,公寓,50000,...,5.0,4.0,1.0,2.0,6.0,189.0,24.0,35.0,浦东,碧云
2,1000,闵行水仙苑,57626,https://shanghai.anjuke.com/community/view/1000/,畹町路500弄,1998年,31.111473,121.406853,公寓,26300,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,闵行,春申
3,10000,虹桥向日葵公寓,66057,https://shanghai.anjuke.com/community/view/10000/,中山西路669弄,1996年,31.217635,121.420373,公寓,30000,...,9.0,0.0,0.0,0.0,7.0,161.0,151.0,188.0,长宁,中山公园
4,1002,仙霞路486弄,51321,https://shanghai.anjuke.com/community/view/1002/,仙霞路486弄,1994年,31.211309,121.398806,公寓,,...,8.0,0.0,0.0,0.0,10.0,0.0,98.0,97.0,长宁,仙霞
5,10020,中创大厦,17158,https://shanghai.anjuke.com/community/view/10020/,南京西路819号,1997年,31.236495,121.467344,其它,12800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,静安,南京西路
6,1003,海逸公寓,77863,https://shanghai.anjuke.com/community/view/1003/,威宁路455号,2005年,31.221992,121.391314,公寓,110000,...,6.0,6.0,2.0,3.0,8.0,172.0,46.0,62.0,长宁,天山
7,10038,廖创兴金融中心,97639,https://shanghai.anjuke.com/community/view/10038/,南京西路288号,,31.237466,121.476890,其它,70000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,静安,南京西路
8,10039,天安别墅,37015,https://shanghai.anjuke.com/community/view/10039/,嘉松南路3888弄,2014年,31.101672,121.239165,别墅,110000,...,0.0,0.0,0.0,0.0,4.0,37.0,1.0,4.0,松江,佘山
9,1004,虹桥万博花园一期,81518,https://shanghai.anjuke.com/community/view/1004/,古北路69弄,2004年,31.223425,121.406764,公寓,120000,...,8.0,0.0,0.0,0.0,8.0,305.0,67.0,128.0,长宁,天山


In [19]:
# List the columns available for export.
df_new.columns

Index(['community_id', 'community_name', 'average_price', 'url', 'address',
       'age', 'longitude', 'latitude', 'property_type',
       'total_construction_area', 'volume_rate', 'developer',
       'property_company', 'property_fee', 'total_houses', 'parking_space',
       'green_rate', 'greet', 'comfort', 'traffic_score', 'metro_station_num',
       'bus_station_num', 'hospital_score', 'level_three_num', 'level_two_num',
       'level_one_num', 'level_comm_num', 'school_score', 'nursery_num',
       'junior_num', 'middle_num', 'commerce_score', 'restaurant_num',
       'bank_num', 'supermarket_num', 'district', 'township'],
      dtype='object')

In [20]:
# Columns to export, in output order: identifiers and location first, then the
# property attributes and neighbourhood scores, with average_price last.
export_columns = [
    'community_id', 'community_name', 'url', 'district', 'township', 'address', 'age',
    'developer', 'property_company', 'property_type', 'property_fee',
    'longitude', 'latitude',
    'total_construction_area', 'volume_rate',
    'total_houses', 'parking_space',
    'green_rate', 'greet', 'comfort', 'traffic_score', 'metro_station_num',
    'bus_station_num', 'hospital_score', 'level_three_num', 'level_two_num',
    'level_one_num', 'level_comm_num', 'school_score', 'nursery_num',
    'junior_num', 'middle_num', 'commerce_score', 'restaurant_num',
    'bank_num', 'supermarket_num', 'average_price',
]
# Write the cleaned frame to the working directory without the row index.
df_new.to_csv('anjuke_community_clean.csv', columns=export_columns, index=False)