In [24]:
import os
import pandas as pd

In [25]:
def mkdir_if_not_exit(path):
    """Create the directory ``path`` (with any missing parents) if it is absent.

    Parameters
    ----------
    path : str, bytes, os.PathLike, or iterable of path components
        A single path is used as-is. Any other iterable (for example a list
        or tuple of strings) is first combined with ``os.path.join``.
    """
    # Only unpack real sequences of components: a bytes or pathlib.Path
    # argument is already a complete path and must not be splatted.
    if not isinstance(path, (str, bytes, os.PathLike)):
        path = os.path.join(*path)
    if not os.path.exists(path):
        # exist_ok guards against another process creating the directory
        # between the exists() check above and this call.
        os.makedirs(path, exist_ok=True)

In [35]:
# Write a tiny house-price dataset to disk; 'NA' marks a missing entry.
data_file = '../data/house_tiny.csv'
# Derive the directory from data_file so the two can never drift apart.
mkdir_if_not_exit(os.path.dirname(data_file))
with open(data_file, 'w') as f:
    # Header row. No spaces after the commas: pandas would otherwise keep
    # them as part of the column names (' Alley', ' Price').
    f.write('NumRooms,Alley,Price\n')
    # One data row per house.
    f.write('NA,Pave,127500\n')
    f.write('2,NA,106000\n')
    f.write('4,NA,178100\n')
    f.write('NA,NA,140000\n')

In [36]:
# Load the CSV written above into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_file)

In [37]:
# Display the loaded DataFrame to check how the 'NA' entries were parsed.
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


### Handling of missing data

`NaN` entries represent missing values in a dataset. There are two common ways to handle them: (1) imputation and (2) deletion. With imputation, we replace each `NaN` with a substitute value (such as the mean of the column, or a constant such as one). With deletion, we simply ignore or remove the entries that contain `NaN`.

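`iloc` (integer-location-based indexing) selects rows and columns by position; below it is used to split the data into inputs and outputs before imputation.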

`loc` (label-based indexing) selects rows and columns by their labels instead of their positions.

In [38]:
# Split by column position: the first two columns are the features,
# the third column is the target.
inputs, outputs = data.iloc[:, :2], data.iloc[:, 2]

In [39]:
# Display the feature columns before any missing values are filled in.
inputs

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,


In [40]:
# Fill the NaN entries of each numeric column with that column's mean.
# numeric_only=True skips the string-valued Alley column; without it,
# pandas >= 2.0 raises a TypeError when asked to average strings.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [41]:
# One-hot encode the categorical columns. dummy_na=True adds an explicit
# indicator column for missing values. The dtype is pinned to uint8 because
# pandas >= 2.0 defaults to bool, which would not display as 0/1 and would
# turn the mixed float/bool frame into an object array on conversion.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype='uint8')
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


### Conversion of the data to the tensor format

In [43]:
from mxnet import np

In [44]:
# Convert the features and the target to arrays. Forcing the features to
# float guarantees a purely numeric array even if the indicator columns
# are stored as bool or integers alongside the float column.
x = np.array(inputs.to_numpy(dtype=float))
y = np.array(outputs.to_numpy())

In [45]:
# Display the converted feature array.
x

array([[3., 1., 0.],
       [2., 0., 1.],
       [4., 0., 1.],
       [3., 0., 1.]], dtype=float64)

In [46]:
# Display the converted target array.
y

array([127500, 106000, 178100, 140000], dtype=int64)