In [1]:
import pandas as pd

In [2]:
def transform_price(price):
    """Convert a price string with a magnitude suffix into a plain float.

    A decimal comma is accepted (``"2,5 M"`` is read as ``2.5``) and the
    recognised suffixes scale the number as follows:

    * ``rb`` -> multiplied by 1,000
    * ``jt`` -> multiplied by 1,000,000
    * ``M``  -> multiplied by 1,000,000,000

    A string without a recognised suffix is parsed as a bare number.

    Parameters
    ----------
    price : str
        Raw price text, e.g. ``"750 jt"`` or ``"2,5 M"``.

    Returns
    -------
    float or None
        The numeric price, or ``None`` when the input is not a string
        (for example NaN for a missing value) or cannot be parsed.
    """
    # Missing prices arrive as NaN (a float) and have no .replace(); treat any
    # non-string input as unparseable instead of raising AttributeError.
    if not isinstance(price, str):
        return None

    # Normalise the decimal comma and drop surrounding whitespace so that
    # "2,5 M", "2,5M" and "2,5 M " are all handled the same way.
    text = price.replace(',', '.').strip()

    multipliers = {
        'jt': 1_000_000,
        'M': 1_000_000_000,
        'rb': 1_000,
    }

    try:
        for suffix, multiplier in multipliers.items():
            if text.endswith(suffix):
                # float() tolerates the space left between number and suffix.
                return float(text[:-len(suffix)]) * multiplier
        return float(text)
    except ValueError:
        return None

In [3]:
def clean(data):
    """Return a cleaned copy of a raw listings DataFrame.

    Steps performed:

    * ``bed`` / ``bath``: missing values are filled with the column median.
    * ``area``: the leading run of digits is extracted from the text form of
      each value (so a trailing unit suffix is ignored) and converted to int.
    * ``price``: converted to a float via ``transform_price``.
    * Rows that still contain any missing value afterwards are dropped.

    The input frame is not modified, so calling this function again on the
    same raw data gives the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``bed``, ``bath``, ``area`` and
        ``price``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns and incomplete rows removed.
    """
    # Work on a copy: mutating the caller's frame makes re-running the cell
    # strip characters from the already-cleaned ``area`` column a second time.
    cleaned = data.copy()

    for col in ('bed', 'bath'):
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    # Pull out the leading digits rather than slicing off a fixed number of
    # characters; values without digits (including missing ones, which become
    # the string "nan") turn into NaN and are removed by dropna() below
    # instead of crashing the int conversion.
    area_digits = cleaned['area'].astype(str).str.extract(r'^\s*(\d+)', expand=False)
    cleaned['area'] = pd.to_numeric(area_digits, errors='coerce')

    cleaned['price'] = cleaned['price'].apply(transform_price)

    # Cast to int only after dropping NaN rows, since NaN cannot be an int.
    return cleaned.dropna().astype({'area': int})

In [4]:
def remove_outliers(data):
    """Filter out rows whose ``area`` or ``price`` is implausibly large or small.

    Upper limits come from the IQR rule with a deliberately wide multiplier
    (5x instead of the usual 1.5x) so that only extreme listings are removed.
    The columns are processed in order, so the ``price`` quartiles are
    computed on the rows that survived the ``area`` filter.

    Lower limits are fixed, hand-picked thresholds because the IQR rule is
    not useful at the low end here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric ``area`` and ``price`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that fall within all bounds.
    """
    iqr_multiplier = 5

    for column in ('area', 'price'):
        first_quartile = data[column].quantile(0.25)
        third_quartile = data[column].quantile(0.75)
        ceiling = third_quartile + iqr_multiplier * (third_quartile - first_quartile)
        data = data.loc[data[column] <= ceiling]

    # Judgment-based minimums, applied in the same column order as above.
    floors = {'area': 15, 'price': 10**7}
    for column, floor in floors.items():
        data = data.loc[data[column] >= floor]

    return data

In [5]:
# Read the raw train and test splits from CSV files in the working directory.
train_path, test_path = 'train.csv', 'test.csv'
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [6]:
# Apply the same two-stage pipeline (clean, then outlier removal) to each split.
train_data = clean(train_data)
train_data = remove_outliers(train_data)

test_data = clean(test_data)
# Filter the test split itself; passing train_data here would overwrite the
# test set with a copy of the training rows.
test_data = remove_outliers(test_data)

In [7]:
# Show the first rows of the training data after cleaning and outlier removal.
train_data.head()

Unnamed: 0,url,location,bed,bath,area,price
0,https://www.rumah.com/listing-properti/dijual-...,"Jalan camar bintaro sektor 3, Bintaro, Tangera...",3.0,2.0,200,4800000000.0
1,https://www.rumah.com/listing-properti/dijual-...,"Ciputat Timur, Tangerang Selatan, Banten",4.0,4.0,200,2700000000.0
2,https://www.rumah.com/listing-properti/dijual-...,"Ciputat, Tangerang Selatan, Banten",4.0,3.0,110,1375000000.0
3,https://www.rumah.com/listing-properti/dijual-...,"Pondok Ranji, Bintaro, Tangerang Selatan, Banten",5.0,5.0,232,4250000000.0
4,https://www.rumah.com/listing-properti/dijual-...,"1 jl.Jombang ,Bintaro, Pondok Aren, Tangerang...",7.0,5.0,540,5500000000.0


In [9]:
# Summary statistics of the numeric columns, used to sanity-check the value ranges.
train_data.describe()

Unnamed: 0,bed,bath,area,price
count,18948.0,18948.0,18948.0,18948.0
mean,3.739814,2.966382,178.152259,3278087000.0
std,1.324113,1.310335,135.614414,2840672000.0
min,1.0,1.0,22.0,13742640.0
25%,3.0,2.0,80.0,1380000000.0
50%,4.0,3.0,140.0,2458940000.0
75%,4.0,4.0,234.0,4250000000.0
max,10.0,10.0,1000.0,19000000000.0


Data is reasonable now (after removing outliers). We can proceed to feature engineering.

In [11]:
# Write both processed splits to CSV without the index column.
for frame, out_path in ((train_data, 'train_cleaned.csv'), (test_data, 'test_cleaned.csv')):
    frame.to_csv(out_path, index=False)