<h3>Part 1: Load Data</h3>
<p>Objective: Predict 'harga' (house price) for new samples based on their similarity to the known data.</p>

In [1]:
import requests
from contextlib import closing
import csv
# STEP 1:
# Download "harga_rumah.txt" and parse it as CSV (first row is the header).
url = "https://storage.googleapis.com/dqlab-dataset/harga_rumah.txt"
data_harga_rumah = []
# A timeout keeps "Run All" from hanging forever on a stalled connection.
with closing(requests.get(url, stream=True, timeout=30)) as r:
    # Fail loudly on HTTP errors instead of parsing an error page as CSV.
    r.raise_for_status()
    f = (line.decode('utf-8') for line in r.iter_lines())
    # csv.reader yields [] for blank lines; drop them so they do not become
    # empty dictionaries further down.
    data_harga_rumah = [row for row in csv.reader(f) if row]

if not data_harga_rumah:
    raise ValueError("no rows could be read from " + url)

# Make a list of dictionaries (one per data row, keyed by header): harga_rumah
key_harga_rumah = data_harga_rumah[0]
harga_rumah = []
for baris_harga_rumah in data_harga_rumah[1:]:
    # Index into the header explicitly so a row with more fields than the
    # header still raises IndexError rather than being silently truncated.
    dict_harga_rumah = {
        key_harga_rumah[i]: value for i, value in enumerate(baris_harga_rumah)
    }
    harga_rumah.append(dict_harga_rumah)

print(harga_rumah)

[{'tanah': '70', 'bangunan': '50', 'jarak_ke_pusat': '15', 'harga': '500'}, {'tanah': '70', 'bangunan': '60', 'jarak_ke_pusat': '30', 'harga': '400'}, {'tanah': '70', 'bangunan': '60', 'jarak_ke_pusat': '55', 'harga': '300'}, {'tanah': '100', 'bangunan': '50', 'jarak_ke_pusat': '30', 'harga': '700'}, {'tanah': '100', 'bangunan': '70', 'jarak_ke_pusat': '25', 'harga': '1000'}, {'tanah': '100', 'bangunan': '70', 'jarak_ke_pusat': '50', 'harga': '650'}, {'tanah': '120', 'bangunan': '100', 'jarak_ke_pusat': '20', 'harga': '2000'}, {'tanah': '120', 'bangunan': '80', 'jarak_ke_pusat': '50', 'harga': '1200'}, {'tanah': '150', 'bangunan': '100', 'jarak_ke_pusat': '50', 'harga': '1800'}, {'tanah': '150', 'bangunan': '90', 'jarak_ke_pusat': '15', 'harga': '3000'}]


<h3>Part 2: Convert Data into Pandas DataFrame</h3>

In [2]:
import numpy as np
import pandas as pd

col_names = key_harga_rumah

# Pivot the list of row dictionaries into one list of floats per column.
data_dict = {
    col: [float(row[col]) for row in harga_rumah]
    for col in col_names
}

print(data_dict, "\n")

# Convert the column dictionary into a DataFrame: data_df
data_df = pd.DataFrame(data_dict)
print(data_df.head(), data_df.shape, sep='\n')

{'tanah': [70.0, 70.0, 70.0, 100.0, 100.0, 100.0, 120.0, 120.0, 150.0, 150.0], 'bangunan': [50.0, 60.0, 60.0, 50.0, 70.0, 70.0, 100.0, 80.0, 100.0, 90.0], 'jarak_ke_pusat': [15.0, 30.0, 55.0, 30.0, 25.0, 50.0, 20.0, 50.0, 50.0, 15.0], 'harga': [500.0, 400.0, 300.0, 700.0, 1000.0, 650.0, 2000.0, 1200.0, 1800.0, 3000.0]} 

   tanah  bangunan  jarak_ke_pusat   harga
0   70.0      50.0            15.0   500.0
1   70.0      60.0            30.0   400.0
2   70.0      60.0            55.0   300.0
3  100.0      50.0            30.0   700.0
4  100.0      70.0            25.0  1000.0
(10, 4)


<h3>Part 3: Define Functions and Transform Data</h3>

In [3]:
#User-defined function
def min_value(data):
    """Return the smallest element of ``data``.

    Elements are consumed through iteration rather than ``data[0]`` /
    ``data[1:]``, so the function works for any iterable (lists, tuples,
    generators, numpy arrays) and for pandas Series whose index labels do
    not start at 0.

    Parameters
    ----------
    data : iterable
        Non-empty iterable of mutually comparable values.

    Returns
    -------
    The first element that no other element is smaller than.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    iterator = iter(data)
    try:
        temp_min = next(iterator)
    except StopIteration:
        raise ValueError("min_value() arg is an empty sequence") from None

    for each in iterator:
        if each < temp_min:
            temp_min = each
    return temp_min

def max_value(data):
    """Return the largest element of ``data``.

    Elements are consumed through iteration rather than ``data[0]`` /
    ``data[1:]``, so the function works for any iterable (lists, tuples,
    generators, numpy arrays) and for pandas Series whose index labels do
    not start at 0.

    Parameters
    ----------
    data : iterable
        Non-empty iterable of mutually comparable values.

    Returns
    -------
    The first element that no other element is larger than.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    iterator = iter(data)
    try:
        temp_max = next(iterator)
    except StopIteration:
        raise ValueError("max_value() arg is an empty sequence") from None

    for each in iterator:
        if each > temp_max:
            temp_max = each
    return temp_max

def transform_col(data):
    """Min-max scale the values of one column.

    Each value is mapped to ``(value - min) / (max - min)``, where ``min``
    and ``max`` come from ``min_value(data)`` and ``max_value(data)``.

    Parameters
    ----------
    data : sequence of numbers
        The column to scale.

    Returns
    -------
    list
        Scaled values, in the same order as ``data``. If every value in
        ``data`` is identical, all results are 0 instead of dividing by zero.
    """
    min_data = min_value(data)
    max_data = max_value(data)

    span = max_data - min_data
    if span == 0:
        # Constant column: dividing by 1 keeps the result finite
        # (every value becomes 0) instead of raising / producing NaN.
        span = 1

    return [(each - min_data) / span for each in data]

def transform(data, sample, feature_cols=None):
    """Scale ``sample`` using the per-column min and max of ``data``.

    Each sample value is mapped to ``(value - min) / (max - min)``, where
    ``min`` and ``max`` are taken from the same column of ``data``. Samples
    outside the range of ``data`` therefore map outside [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame
        Reference data that supplies the min and max of every column.
    sample : pandas.DataFrame
        Rows to scale; must contain every column being scaled.
    feature_cols : iterable of str, optional
        Columns to scale. Defaults to the first three columns of ``data``.

    Returns
    -------
    pandas.DataFrame
        One column per scaled feature, with a fresh 0..n-1 index. If a
        column of ``data`` is constant, its range is treated as 1 so the
        result stays finite.
    """
    if feature_cols is None:
        feature_cols = data.columns[:3]

    sample_dict = {}
    for col in feature_cols:
        # Hand the helpers a plain array so they see positional data
        # regardless of the index labels on `data`.
        reference = data[col].values
        max_data = max_value(reference)
        min_data = min_value(reference)

        span = max_data - min_data
        if span == 0:
            # Constant reference column: avoid dividing by zero.
            span = 1

        sample_dict[col] = [(each - min_data) / span for each in sample[col]]
    return pd.DataFrame(sample_dict)

def abs_value(num):
    """Return the magnitude of ``num``: negated when below zero, else unchanged."""
    return num * -1 if num < 0 else num


# Transform the data: scale the first three columns and keep the last
# column ('harga') in its original units.
data_t_dict = {
    col: transform_col(data_df[col].values)
    for col in col_names[:3]
}
data_t_dict[col_names[-1]] = data_df.iloc[:,-1].values

data_t_df = pd.DataFrame(data_t_dict)
print("TRANSFORMED DATA: ", data_t_df.head(), data_t_df.shape, sep='\n')


# Samples we want to predict: sample
sample = {
    'tanah': [110,100,90],
    'bangunan': [80,70,75],
    'jarak_ke_pusat': [35,15,55],
}
sample_df = pd.DataFrame(sample)
# Scale the samples with the min/max of the original (unscaled) data.
sample_t_df = transform(data_df, sample_df)
print("\n",sample_t_df, sample_t_df.shape, sep="\n")


TRANSFORMED DATA: 
   tanah  bangunan  jarak_ke_pusat   harga
0  0.000       0.0           0.000   500.0
1  0.000       0.2           0.375   400.0
2  0.000       0.2           1.000   300.0
3  0.375       0.0           0.375   700.0
4  0.375       0.4           0.250  1000.0
(10, 4)


   tanah  bangunan  jarak_ke_pusat
0  0.500       0.6             0.5
1  0.375       0.4             0.0
2  0.250       0.5             1.0
(3, 3)


<h3>Part 4: Predict House Prices</h3>

In [4]:
def pred_similiarity(data_dataframe, sample_dataframe, target_col='harga'):
    """Predict the target of each sample from its most similar data row.

    Similarity is the sum of absolute differences over the feature columns;
    the row of ``data_dataframe`` with the smallest sum supplies the
    prediction (the first such row wins on ties).

    Parameters
    ----------
    data_dataframe : pandas.DataFrame
        Known rows. Must contain every feature column of
        ``sample_dataframe`` plus ``target_col``.
    sample_dataframe : pandas.DataFrame
        Rows to predict. Every column except ``target_col`` is used as a
        feature, so calling this again after the predictions were attached
        to the samples does not feed the target back into the distance.
    target_col : str, optional
        Name of the column to predict. Defaults to ``'harga'``.

    Returns
    -------
    numpy.ndarray
        One predicted target value per sample row, in sample order.
    """
    feature_cols = [col for col in sample_dataframe.columns if col != target_col]
    # Take the targets from the frame that was passed in (not from a
    # notebook-level global) and as a plain array, so the argmin position
    # can be used directly whatever the index labels are.
    targets = data_dataframe[target_col].values

    harga = []
    for row_pos in range(sample_dataframe.shape[0]):
        total_distance = np.zeros(data_dataframe.shape[0])
        for col in feature_cols:
            # Positional access: independent of the sample index labels.
            sample_value = sample_dataframe[col].values[row_pos]
            total_distance += np.abs(data_dataframe[col].values - sample_value)

        harga.append(targets[np.argmin(total_distance)])

    return np.array(harga)

# Predict a price for every scaled sample from the scaled data.
pred_harga = pred_similiarity(data_t_df, sample_t_df)
# Attach the predictions to the scaled samples as a 'harga' column
# (this modifies sample_t_df in place).
sample_t_df['harga'] = pred_harga

print(sample_t_df)

   tanah  bangunan  jarak_ke_pusat   harga
0  0.500       0.6             0.5  1200.0
1  0.375       0.4             0.0  1000.0
2  0.250       0.5             1.0   650.0
