# Preprocessing of the dataset

The knowledge created on this dataset will be exported as transformers and saved on **libs/transformers.py** so it can be reusable in sklearn pipelines 

In [388]:
import sys
import os
sys.path.append(os.getcwd())

import pandas as pd
import numpy as np
import pipelines as andre_pipelines
from sklearn.feature_selection import VarianceThreshold

In [389]:
df = pd.read_csv('../beijing.csv', encoding="gbk")

  df = pd.read_csv('../beijing.csv', encoding="gbk")


# Deleting assured columns

In [390]:
df.drop(['url'], axis=1, inplace=True)

# Mutating or one-hot-encoding columns since string stuff 

In [391]:
# convert tradeTime date column to timestamp
df['tradeTime'] = pd.to_datetime(df['tradeTime']).astype(int)/ 10**9

In [392]:
# transforming id column to numeric
df['id'] = df.index + 1

In [393]:
# This column unfortunately has the same labels as string and int object.
# There is a label '#NAME?' which is not clear for what use it is. Since only 32 of them, we can fill it with the median if the column (Since discrete values)
df.groupby('livingRoom')['id'].nunique()

livingRoom
0            11
1         42138
2         77256
3         31567
4          3349
5           537
6           126
7            22
8             3
9             2
#NAME?       32
0            17
1         40248
2         83333
3         36044
4          3472
5           570
6           102
7            15
8             4
9             3
Name: id, dtype: int64

In [394]:
def fix_livingRoom(x):
    if x == '#NAME?':
        return np.nan
    else: return int(x)
df['livingRoom'] = df['livingRoom'].apply(fix_livingRoom)
df['livingRoom'] = df['livingRoom'].fillna(int(df['livingRoom'].median()))
df.groupby('livingRoom')['id'].nunique()

livingRoom
0.0        28
1.0     82386
2.0    160621
3.0     67611
4.0      6821
5.0      1107
6.0       228
7.0        37
8.0         7
9.0         5
Name: id, dtype: int64

In [395]:
df.groupby('drawingRoom')['id'].nunique()

drawingRoom
0        11300
1       109783
2        33418
3          474
4           31
5            5
0         8386
1       115876
2        39084
3          444
4           16
5            2
中 14         2
中 15         1
中 16         3
中 22         1
中 24         1
中 6          6
低 15         1
低 16         1
低 6          7
底 11         1
底 20         1
底 28         1
顶 6          2
高 12         1
高 14         2
高 6          1
Name: id, dtype: int64

In [396]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318851 entries, 0 to 318850
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   318851 non-null  int64  
 1   Lng                  318851 non-null  float64
 2   Lat                  318851 non-null  float64
 3   Cid                  318851 non-null  int64  
 4   tradeTime            318851 non-null  float64
 5   DOM                  160874 non-null  float64
 6   followers            318851 non-null  int64  
 7   totalPrice           318851 non-null  float64
 8   price                318851 non-null  int64  
 9   square               318851 non-null  float64
 10  livingRoom           318851 non-null  float64
 11  drawingRoom          318851 non-null  object 
 12  kitchen              318851 non-null  int64  
 13  bathRoom             318851 non-null  object 
 14  floor                318851 non-null  object 
 15  buildingType     

## Missing values

In [397]:
missing=df.isnull().mean()>0 
missing[missing == True]

DOM                  True
buildingType         True
elevator             True
fiveYearsProperty    True
subway               True
communityAverage     True
dtype: bool

In [398]:
# dependant variables: DOM, buildingType, elevator, fiveYearsProperty, subway, communityAverage
# target variable: totalPrice
cols_to_use = ['DOM', 'buildingType', 'elevator', 'fiveYearsProperty', 'subway', 'communityAverage', 'totalPrice']
data_simp = df[cols_to_use]
data_simp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318851 entries, 0 to 318850
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   DOM                160874 non-null  float64
 1   buildingType       316830 non-null  float64
 2   elevator           318819 non-null  float64
 3   fiveYearsProperty  318819 non-null  float64
 4   subway             318819 non-null  float64
 5   communityAverage   318388 non-null  float64
 6   totalPrice         318851 non-null  float64
dtypes: float64(7)
memory usage: 17.0 MB


In [399]:
data_simp.isnull().mean()

DOM                  0.495457
buildingType         0.006338
elevator             0.000100
fiveYearsProperty    0.000100
subway               0.000100
communityAverage     0.001452
totalPrice           0.000000
dtype: float64

# Can i accept 0 as a value for these missing data?

From the six variables, only one have a missinge percentage greater than 5%, the DOM variable
- The DOM variable means **Active days on market** and it has a missing percentage of almost 50% but the .isnull function from pandas recognizes the value zero as null but zero may be an important label. It could mean that the property on sale could be zero active days on market. We need to find out if it is completely random or not. If it is not random, than probably some zeros are treated the same as null and as zero active days on market.

Hypothesen:
 - H(null): The data was produced in a random manner.  
 - H(alternative): The data was not produced in a random manner.

In [400]:
from statsmodels.sandbox.stats.runs import runstest_1samp

dom_values = df['DOM'].to_list()
print("mean: ", df['DOM'].mean())

# Perform Runs test
runstest_1samp(dom_values, cutoff=df['DOM'].mean())

mean:  28.822339221999826


(-313.444522743472, 0.0)

z_stat: -313.44; p-value: 0%  
Since the p-value is smaller than 5%, we need to accept the null hypothesis. 
This means, that the zero values of DOM are randomly generated. Let's analyse further

### Check for near-zero variance and eliminate columns with low variance

In [401]:
# Get variance of each column
df.var()

  df.var()


id                     8.472190e+09
Lng                    1.255617e-02
Lat                    8.460789e-03
Cid                    5.585881e+24
tradeTime              2.670821e+15
DOM                    2.523791e+03
followers              1.170268e+03
totalPrice             5.325977e+04
price                  4.712817e+08
square                 1.386420e+03
livingRoom             6.033926e-01
kitchen                1.201411e-02
buildingType           1.612538e+00
renovationCondition    1.720658e+00
buildingStructure      3.616666e+00
ladderRatio            6.284300e+08
elevator               2.440633e-01
fiveYearsProperty      2.288009e-01
subway                 2.397771e-01
district               7.910809e+00
communityAverage       4.985939e+08
dtype: float64

In [402]:
selector = VarianceThreshold(threshold=0.05) # 0.05 is 5% of the variance of the column^
df_filtered = selector.fit_transform(df) # fit and transform the data

ValueError: could not convert string to float: '中 14'

In [None]:
df_filtered

AttributeError: 'VarianceThreshold' object has no attribute 'var'