In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号

In [2]:
train = pd.read_csv(r'E:\PQJ\just do it\房租预测\train_data.csv')
test = pd.read_csv(r'E:\PQJ\just do it\房租预测\test_a.csv')

In [3]:
len = len(train)

train['rentType'][train['rentType']=='--'] = '未知方式'
print('rentType:',list(train['rentType']).count("未知方式")/len)
print('houseToward:',list(train['houseToward']).count("暂无数据")/len)
print('houseDecoration:',list(train['houseDecoration']).count("其他")/len)
print('buildYear:',list(train['buildYear']).count("暂无信息")/len)

rentType: 0.7423745173745174
houseToward: 0.02323841698841699
houseDecoration: 0.7007722007722008
buildYear: 0.06776061776061776


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


其中rentType和houseDecoration缺失值的比列达到70%，考虑将这两特征去除

In [4]:
def parseData(df):
    """
    预处理数据
    """
    # 去掉部分特征
    df.drop('ID', axis=1, inplace=True)#没意义的特征
    df.drop('communityName',axis=1, inplace=True)#没意义的特征
    df.drop('city',axis=1,inplace=True)#单调特征列，只有一个值
    df.drop('rentType',axis=1, inplace=True)
    df.drop('houseDecoration',axis=1,inplace=True)
    
    # 转换object类型数据
    columns = [ 'houseFloor', 'houseToward', 'region', 'plate']
    for col in columns:
        df[col] = df[col].astype('category')
        
    # 将buildYear列转换为整型数据
    tmp = df['buildYear'].copy()
    tmp2 = tmp[tmp!='暂无信息'].astype('int')#先转整形
    tmp[tmp=='暂无信息'] = tmp2.mode().iloc[0]#用众数填充独特的缺失值：'暂无信息'
    df['buildYear'] = tmp
    df['buildYear'] = df['buildYear'].astype('int')
    
    # 处理pv和uv的空值  uv和pv训练集和测试集分布类似，存在长尾现象，所以用中位数填充
    df['pv'].fillna(df['pv'].mean(),inplace=True)
    df['uv'].fillna(df['uv'].mean(),inplace=True)
    df['pv'] = df['pv'].astype('int')
    df['uv'] = df['uv'].astype('int')
    
    
    return df

In [8]:
print(test[['area']].describe())
print(test[['region','plate']].describe())

              area
count  2469.000000
mean     76.031839
std      29.080334
min      15.000000
25%      53.000000
50%      73.160000
75%      94.320000
max     150.000000
         region    plate
count      2469     2469
unique       14       63
top     RG00002  BK00056
freq        724      125


通过测试集看到area范围在15-150之间

In [9]:
def washData(df_train, df_test):
    """
    清洗数据
    """
    df_train = df_train[(df_train['area']<=230) & (df_train['area']>=5) & (train['remainNewNum'] < 3000)]#比例大概是99%
    df_train = df_train[df_train['tradeMoney']<=25000]
    
    
    return df_train, df_test

In [11]:
train["region"].value_counts(),test["region"].value_counts()

(RG00002    11437
 RG00005     5739
 RG00003     4186
 RG00010     3640
 RG00012     3368
 RG00004     3333
 RG00006     1961
 RG00007     1610
 RG00008     1250
 RG00013     1215
 RG00001     1157
 RG00014     1069
 RG00011      793
 RG00009      681
 RG00015        1
 Name: region, dtype: int64, RG00002    724
 RG00005    311
 RG00012    262
 RG00010    194
 RG00003    178
 RG00004    153
 RG00007    142
 RG00013     97
 RG00008     86
 RG00014     86
 RG00001     77
 RG00009     61
 RG00006     55
 RG00011     43
 Name: region, dtype: int64)

删掉训练集中得RG00015

In [14]:
print(train["plate"].value_counts(),test["plate"].value_counts())
train_pla = train["plate"].value_counts().keys()
test_pla = test["plate"].value_counts().keys()
print("*"*20)
# find different
for i in train_pla:
    if i not in test_pla:
        print(i)

BK00031    1958
BK00033    1837
BK00045    1816
BK00055    1566
BK00056    1516
BK00052    1375
BK00017    1305
BK00041    1266
BK00054    1256
BK00051    1253
BK00046    1227
BK00035    1156
BK00042    1137
BK00009    1016
BK00050     979
BK00043     930
BK00026     906
BK00047     880
BK00034     849
BK00013     834
BK00053     819
BK00028     745
BK00040     679
BK00060     671
BK00010     651
BK00029     646
BK00062     618
BK00022     614
BK00018     613
BK00064     590
           ... 
BK00037     444
BK00012     412
BK00038     398
BK00024     397
BK00020     384
BK00002     357
BK00065     348
BK00027     344
BK00039     343
BK00063     281
BK00057     278
BK00015     253
BK00006     231
BK00021     226
BK00007     225
BK00030     219
BK00066     219
BK00049     211
BK00008     210
BK00004     189
BK00048     165
BK00025     157
BK00023     127
BK00059     122
BK00044      98
BK00016      40
BK00036      33
BK00058      15
BK00032       3
BK00001       1
Name: plate, Length: 66,

In [None]:
def feature(df):
    """
    特征
    """
    # 将houseType转化为‘房间数’，‘厅数’，‘卫生间数’
    def parseRoom(info, index):
        res = int(info[index*2])
        return res
    df.insert(3,'室',None)
    df.insert(4, '厅', None)
    df.insert(5, '卫', None)
    df['室'] = df['houseType'].apply(parseRoom, index=0)#将这一列的每一个值以str形式传入parseRoom函数中，返回info[index]
    df['厅'] = df['houseType'].apply(parseRoom, index=1)
    df['卫'] = df['houseType'].apply(parseRoom, index=2)
    
    df['交易月份'] = df['tradeTime'].apply(lambda x: int(x.split('/')[1]))  #以 / 分割，返回list，取[1]即月份
    
    #region
    df = df.drop(df[df['region']=="RG00015"].index)
    
    df.drop('houseType', axis=1, inplace=True)
    df.drop('tradeTime', axis=1, inplace=True)
    
    categorical_feats = [ 'houseFloor', 'houseToward', 'region', 'plate']
    return df, categorical_feats