In [None]:
# Jalankan ini hanya sekali untuk setup
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi

# Posisikan kaggle.json ke folder kerja
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

# Inisialisasi API
api = KaggleApi()
api.authenticate()

# Download file CSV dari dataset (dalam bentuk zip)
api.dataset_download_file(
    dataset='thedevastator/airbnb-prices-in-european-cities',
    file_name='london_weekdays.csv',
    path='.'
)

# ✅ Nama file zip yang diunduh akan menjadi 'london_weekdays.csv.zip'
zip_path = "london_weekdays.csv.zip"

# Cek apakah file ZIP ada sebelum ekstraksi
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(".")
    print("Berhasil diekstrak.")
else:
    print(f"File {zip_path} tidak ditemukan. Cek apakah download berhasil.")

Dataset URL: https://www.kaggle.com/datasets/thedevastator/airbnb-prices-in-european-cities
File london_weekdays.csv.zip tidak ditemukan. Cek apakah download berhasil.


In [2]:
import pandas as pd

df = pd.read_csv("london_weekdays.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
0,0,570.098074,Entire home/apt,False,False,2.0,False,0,0,10.0,98.0,1,5.301018,1.58899,209.632578,14.571793,467.597522,8.372724,-0.16032,51.46531
1,1,297.98443,Private room,False,True,2.0,True,1,0,10.0,99.0,1,2.198946,0.379262,553.891744,38.50163,961.472137,17.215961,-0.09683,51.50343
2,2,336.790611,Private room,False,True,2.0,False,1,0,10.0,96.0,1,2.322958,0.453178,428.287849,29.770764,959.042839,17.172462,-0.10554,51.52407
3,3,226.722171,Private room,False,True,2.0,True,1,0,10.0,99.0,1,5.707825,1.723977,195.694744,13.602959,452.232472,8.0976,-0.16575,51.46292
4,4,256.355982,Private room,False,True,3.0,False,0,0,9.0,98.0,1,3.257945,0.825417,329.477068,22.902317,735.32728,13.166649,-0.12055,51.53728


In [3]:
# Cek missing values
print(df.isnull().sum())

Unnamed: 0                    0
realSum                       0
room_type                     0
room_shared                   0
room_private                  0
person_capacity               0
host_is_superhost             0
multi                         0
biz                           0
cleanliness_rating            0
guest_satisfaction_overall    0
bedrooms                      0
dist                          0
metro_dist                    0
attr_index                    0
attr_index_norm               0
rest_index                    0
rest_index_norm               0
lng                           0
lat                           0
dtype: int64


In [4]:
import pandas as pd

# Load dataset
df = pd.read_csv("london_weekdays.csv")

def kategori_harga(harga):
    if harga < 80:
        return "Murah"
    elif 80 <= harga <= 150:
        return "Sedang"
    else:
        return "Mahal"

df["harga_kategori"] = df["realSum"].apply(kategori_harga)

# Cek hasil
print(df["harga_kategori"].value_counts())
df[["realSum", "harga_kategori"]].head()

harga_kategori
Mahal     3745
Sedang     848
Murah       21
Name: count, dtype: int64


Unnamed: 0,realSum,harga_kategori
0,570.098074,Mahal
1,297.98443,Mahal
2,336.790611,Mahal
3,226.722171,Mahal
4,256.355982,Mahal


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Fitur yang dipakai
X = df[[
    "room_type",
    "person_capacity",
    "cleanliness_rating",
    "bedrooms",
    "dist"
]]
y = df["harga_kategori"]

# Kolom kategorikal
cat_cols = ["room_type"]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"  # kolom numerik langsung diproses
)

# Split data latih dan uji
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Split data dulu
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Hanya lakukan oversampling di training data
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Cek distribusi baru
from collections import Counter
print("Distribusi setelah oversampling:", Counter(y_train_resampled))

Distribusi setelah oversampling: Counter({'Mahal': 2996, 'Sedang': 2996, 'Murah': 2996})


In [7]:
# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

categorical_cols = ["room_type"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ],
    remainder="passthrough"
)

# Pipeline dengan Random Forest
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Latih model dengan data hasil oversampling
model_pipeline.fit(X_train_resampled, y_train_resampled)

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model_pipeline.predict(X_test)

print("Akurasi:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Akurasi: 0.8158179848320694

Confusion Matrix:
 [[663   2  84]
 [  2   0   2]
 [ 75   5  90]]

Classification Report:
               precision    recall  f1-score   support

       Mahal       0.90      0.89      0.89       749
       Murah       0.00      0.00      0.00         4
      Sedang       0.51      0.53      0.52       170

    accuracy                           0.82       923
   macro avg       0.47      0.47      0.47       923
weighted avg       0.82      0.82      0.82       923



In [9]:
import joblib

# Simpan pipeline model ke file
joblib.dump(model_pipeline, "model_kategori_harga.pkl")

['model_kategori_harga.pkl']

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Target regresi: harga asli
y_reg = df["realSum"]

# Gunakan fitur yang sama
X = df[["room_type", "person_capacity", "cleanliness_rating", "bedrooms", "dist"]]

# Bagi data (tanpa oversampling untuk regresi)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

# Preprocessing
preprocessor_reg = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["room_type"])
    ],
    remainder="passthrough"
)

# Pipeline regresi
pipeline_reg = Pipeline(steps=[
    ("preprocessor", preprocessor_reg),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Latih model
pipeline_reg.fit(X_train_reg, y_train_reg)

# Evaluasi model
y_pred_reg = pipeline_reg.predict(X_test_reg)
print("RMSE:", mean_squared_error(y_test_reg, y_pred_reg, squared=False))
print("R² Score:", r2_score(y_test_reg, y_pred_reg))

# Simpan model regresi
joblib.dump(pipeline_reg, "model_prediksi_harga.pkl")

RMSE: 686.1818997850942
R² Score: 0.1402907058926478




['model_prediksi_harga.pkl']