## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## Data Ingestion

In [2]:
# Load the training sample and the test sample from their CSV files
# (relative paths, so they are resolved against the current working directory).
df_train = pd.read_csv(filepath_or_buffer="train_sample.csv")
df_test = pd.read_csv(filepath_or_buffer="test_sample.csv")

### Data Comparison

In [3]:
# Summarize the training frame: column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sold_date   1136 non-null   object 
 1   sold_price  1136 non-null   float64
 2   year_built  1433 non-null   float64
 3   garage      1023 non-null   float64
 4   sqft        1150 non-null   float64
 5   type        1600 non-null   object 
 6   price       1594 non-null   float64
 7   transport   1600 non-null   bool   
 8   services    1600 non-null   int64  
 9   beds        1550 non-null   float64
 10  floors      1303 non-null   float64
 11  baths       1599 non-null   float64
 12  lot_sqft    792 non-null    float64
dtypes: bool(1), float64(9), int64(1), object(2)
memory usage: 151.7+ KB


In [4]:
# Summarize the test frame the same way so its columns can be compared
# against the training frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sold_date   279 non-null    object 
 1   sold_price  279 non-null    float64
 2   year_built  357 non-null    float64
 3   garage      259 non-null    float64
 4   sqft        297 non-null    float64
 5   type        400 non-null    object 
 6   transport   400 non-null    bool   
 7   services    400 non-null    int64  
 8   beds        390 non-null    float64
 9   floors      317 non-null    float64
 10  baths       400 non-null    float64
 11  lot_sqft    200 non-null    float64
dtypes: bool(1), float64(8), int64(1), object(2)
memory usage: 34.9+ KB


# Preprocessing

## Batasan
- **Batas waktu program berjalan**: 10 detik  
- **Batas memori**: 128MB  
- **Input**: input standar atau `input.txt`  
- **Output**: output standar atau `output.txt`  

---

## Deskripsi

Siapkan data untuk pekerjaan selanjutnya dengan model machine learning. Untuk melakukan ini, gunakan metode dari pustaka berikut:

- `sklearn.preprocessing.MinMaxScaler`
- `sklearn.model_selection.train_test_split`

Tulis fungsi bernama:

```python
preprocessing(X: np.ndarray, y: np.ndarray, test_size=0.33)
```

dengan parameter input berikut:

- `X`: Matriks NumPy berisi fitur objek.
- `y`: Vektor NumPy berisi label kebenaran (target).
- `test_size`: Menentukan ukuran dataset pengujian sebagai `test_size * 100%` dari total sampel (default adalah 0.33).

Catatan: data harus diacak sebelum dibagi.
Gunakan argumen `random_state=1234` pada fungsi `train_test_split` agar hasil fungsi dapat divalidasi dengan benar.

Fungsi ini akan mengembalikan data yang telah diskalakan menggunakan metode MinMaxScaler, dan dibagi menjadi dataset pelatihan dan pengujian, di mana panjang dataset pengujian adalah test_size dari total sampel. Output harus berupa tuple dengan 4 item:


In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def preprocessing(X: np.ndarray, y: np.ndarray, test_size=0.33):
    """Split the samples into train/test parts and min-max scale the features.

    The split happens first, and the scaler is fitted on the training rows
    only; the test rows are transformed with those training statistics.

    Args:
        X: Feature matrix, one row per sample.
        y: Target vector aligned with the rows of ``X``.
        test_size: Passed through to ``train_test_split`` as the size of the
            test part (default 0.33).

    Returns:
        A 4-tuple ``(X_train_scaled, y_train, X_test_scaled, y_test)``.
    """
    # random_state is pinned so the split is reproducible; shuffling relies on
    # train_test_split's default behaviour.
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=test_size, random_state=1234
    )

    # Learn the per-feature min/max from the training rows only.
    feature_scaler = MinMaxScaler().fit(features_train)

    return (
        feature_scaler.transform(features_train),
        target_train,
        feature_scaler.transform(features_test),
        target_test,
    )


In [16]:
# Rows without a price cannot serve as supervised examples, so drop them
# before building X and y (a few `price` values are missing in df_train).
train_labeled = df_train.dropna(subset=['price'])

# MinMaxScaler needs purely numeric input. Passing the raw frame failed with
# "could not convert string to float: 'single_family'", so the categorical
# `type` column is one-hot encoded and the raw `sold_date` strings are left out.
# TODO: derive numeric features from `sold_date` if the sale date should be used.
features = pd.get_dummies(
    train_labeled.drop(columns=['price', 'sold_date']),
    columns=['type'],
    dtype=float,
)

# Cast everything (including the boolean `transport` column) to float.
X = features.to_numpy(dtype=float)
y = train_labeled['price'].to_numpy()

X_train, y_train, X_test, y_test = preprocessing(X, y)

# Sanity check: the split must neither lose nor duplicate samples.
assert len(X_train) + len(X_test) == len(X)

In [None]:
import numpy as np

def train_test_split(X: np.ndarray, y: np.ndarray, test_size=0.33):
    """Shuffle the samples with a fixed seed and split them into train and test parts.

    The first ``round(test_size * n_samples)`` shuffled samples form the test
    part; the remaining ones form the training part.

    Args:
        X: Feature array, indexed by sample along the first axis.
        y: Target array with the same number of samples as ``X``.
        test_size: Fraction of the samples assigned to the test part
            (default 0.33).

    Returns:
        A 4-tuple ``(X_train, y_train, X_test, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` contain a different number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )

    # A private generator seeded with 1234 produces the same permutation as
    # seeding the global NumPy RNG, but leaves the global random state alone.
    rng = np.random.RandomState(1234)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    test_count = round(test_size * n_samples)
    test_indices = indices[:test_count]
    train_indices = indices[test_count:]

    return X[train_indices], y[train_indices], X[test_indices], y[test_indices]


In [None]:
import numpy as np

def onehot_encoding(X: np.ndarray) -> np.ndarray:
    """One-hot encode a vector of categorical values.

    Columns follow the sorted order of the distinct values in ``X``; row ``i``
    has a single 1 in the column that corresponds to ``X[i]``.

    Args:
        X: 1-D array of category values. A single-column 2-D array of shape
            ``(n, 1)`` is accepted and treated as a vector.

    Returns:
        Integer array of shape ``(n_samples, n_categories)`` containing 0/1.

    Raises:
        ValueError: If ``X`` is neither 1-D nor a single-column 2-D array.
    """
    values = np.asarray(X)
    if values.ndim == 2 and values.shape[1] == 1:
        values = values[:, 0]
    if values.ndim != 1:
        raise ValueError(
            "onehot_encoding expects a 1-D array or a single-column 2-D array"
        )

    # np.unique already returns the categories sorted; return_inverse gives,
    # for every sample, the column index of its category.
    categories, codes = np.unique(values, return_inverse=True)

    n_samples = values.shape[0]
    onehot = np.zeros((n_samples, categories.shape[0]), dtype=int)
    onehot[np.arange(n_samples), codes] = 1
    return onehot


In [None]:
import numpy as np

def minmax_scale(X: np.ndarray) -> np.ndarray:
    """Rescale every feature (column) linearly to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``. Constant
    columns map to 0. NaN entries are ignored when computing the column
    minimum and maximum and stay NaN in the result.

    Args:
        X: Numeric array with samples along the first axis; 1-D input is
            treated as a single feature.

    Returns:
        Array of the same shape as ``X`` holding the scaled values.
    """
    data = np.asarray(X)
    if data.size == 0:
        # Nothing to scale; min/max of an empty array would raise.
        return data.astype(float)

    col_min = np.nanmin(data, axis=0)
    col_max = np.nanmax(data, axis=0)
    span = col_max - col_min

    # Avoid division by zero for constant columns. np.where also works when
    # `span` is a scalar (1-D input), unlike in-place item assignment.
    safe_span = np.where(span == 0, 1.0, span)
    return (data - col_min) / safe_span
