In [None]:
# Colab-only step: open the browser upload dialog so the CSV files read later in
# the notebook are available in the working directory. Outside Colab the
# `google.colab` package does not exist; in that case skip the upload and rely on
# the files already being present locally, so the notebook still runs end to end.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
    print("google.colab not available; skipping upload and using local files.")
else:
    uploaded = files.upload()


In [None]:
import pandas as pd

# Load the training rows and the test features from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "testFeatures.csv")
)


In [None]:
import numpy as np

# NOTE: this cell mutates train_df / test_df in place and is meant to run exactly
# once after loading; re-running it would apply log1p a second time.

# --- Missing-value cleanup ---
# Categorical gaps become an explicit "Eksik" level instead of NaN.
cat_cols = ["ürün", "ürün kategorisi", "ürün üretim yeri", "market", "şehir"]
for col in cat_cols:
    train_df[col] = train_df[col].fillna("Eksik")
    test_df[col] = test_df[col].fillna("Eksik")

# Numeric gaps are filled with the median computed on TRAIN only, and the same
# value is reused for test so both frames share one imputation constant.
num_cols = ["ürün besin değeri"]
for col in num_cols:
    med = train_df[col].median()
    train_df[col] = train_df[col].fillna(med)
    test_df[col] = test_df[col].fillna(med)

# --- Date features and interaction columns ---
for df in [train_df, test_df]:
    df["tarih"] = pd.to_datetime(df["tarih"])
    df["yıl"] = df["tarih"].dt.year
    df["ay"] = df["tarih"].dt.month
    df["hafta"] = df["tarih"].dt.isocalendar().week
    df["haftaici"] = df["tarih"].dt.weekday  # Monday=0 ... Sunday=6
    df["haftasonu"] = df["haftaici"].isin([5, 6]).astype(int)  # 1 for Sat/Sun
    # Maps months to 1..4: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
    df["mevsim"] = df["ay"] % 12 // 3 + 1
    # Pairwise interaction keys, built from the raw (not yet encoded) values.
    df["ürün_şehir"] = df["ürün"].astype(str) + "_" + df["şehir"].astype(str)
    df["ürün_kategori"] = df["ürün"].astype(str) + "_" + df["ürün kategorisi"].astype(str)
    df["şehir_market"] = df["şehir"].astype(str) + "_" + df["market"].astype(str)

# --- Frequency feature ---
# Row count of each product in TRAIN, attached to both frames.
ürün_freq = train_df["ürün"].value_counts().to_dict()
train_df["ürün_freq"] = train_df["ürün"].map(ürün_freq)
# A product that never appears in train has no entry in the dict and would map
# to NaN; it was seen zero times, so use 0 and keep the same integer dtype as train.
test_df["ürün_freq"] = test_df["ürün"].map(ürün_freq).fillna(0).astype(int)

# --- Log transform ---
# log1p is applied after imputation, so the filled median is transformed too.
train_df["ürün besin değeri"] = np.log1p(train_df["ürün besin değeri"])
test_df["ürün besin değeri"] = np.log1p(test_df["ürün besin değeri"])


In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns (raw ones plus the interaction keys) to turn into integer codes.
label_cols = [
    "ürün",
    "ürün kategorisi",
    "ürün üretim yeri",
    "market",
    "şehir",
    "ürün_şehir",
    "ürün_kategori",
    "şehir_market",
]

for col in label_cols:
    # Fit on the union of train and test values so every level present in
    # either frame receives a code.
    all_vals = pd.concat([train_df[col], test_df[col]]).astype(str)
    le = LabelEncoder().fit(all_vals)
    for df in (train_df, test_df):
        df[col] = le.transform(df[col].astype(str))


In [None]:
# Re-derive the date features. These assignments are idempotent, so running this
# cell after the main preprocessing cell is harmless.
for df in [train_df, test_df]:
    df["tarih"] = pd.to_datetime(df["tarih"])
    df["yıl"] = df["tarih"].dt.year
    df["ay"] = df["tarih"].dt.month
    df["hafta"] = df["tarih"].dt.isocalendar().week
    # Maps months to 1..4: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
    df["mevsim"] = df["ay"] % 12 // 3 + 1
    # Only build the interaction key when it does not exist yet. If the column is
    # already present it may have been label-encoded, and by then "ürün" / "şehir"
    # hold integer codes; rebuilding here would replace the encoded column with
    # strings made of those codes.
    if "ürün_şehir" not in df.columns:
        df["ürün_şehir"] = df["ürün"].astype(str) + "_" + df["şehir"].astype(str)



In [None]:
from sklearn.preprocessing import LabelEncoder

label_cols = ["ürün", "ürün kategorisi", "ürün üretim yeri", "market", "şehir", "ürün_şehir"]

for col in label_cols:
    # Columns that are already numeric in both frames have been encoded before.
    # Casting those integer codes to str and re-fitting would only reshuffle them
    # in lexicographic order ("10" sorts before "2"), so leave them untouched.
    if pd.api.types.is_numeric_dtype(train_df[col]) and pd.api.types.is_numeric_dtype(test_df[col]):
        continue
    le = LabelEncoder()
    # Fit on train + test together so every level in either frame gets a code.
    all_vals = pd.concat([train_df[col], test_df[col]]).astype(str)
    le.fit(all_vals)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))


In [None]:
# Split the training frame into target and features; the raw timestamp column
# is excluded from the feature matrix.
y = train_df.loc[:, "ürün fiyatı"]
X = train_df.drop(["tarih", "ürün fiyatı"], axis=1)


In [None]:
!pip install optuna --quiet
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
import lightgbm as lgb

# RMSE helper
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Built once instead of on every trial. greater_is_better=False makes sklearn
# negate the score, so the objective flips the sign back before returning.
rmse_scorer = make_scorer(rmse, greater_is_better=False)


# Optuna objective
def objective(trial):
    """Score one sampled LightGBM configuration.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean RMSE over 3-fold cross-validation on the notebook-level `X` / `y`
        (lower is better).
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "random_state": 42,
        "verbosity": -1,
        # LightGBM only applies `subsample` when bagging is switched on with a
        # non-zero frequency; without this the tuned `subsample` is a no-op.
        "subsample_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }

    model = lgb.LGBMRegressor(**params)
    scores = cross_val_score(model, X, y, cv=3, scoring=rmse_scorer)
    return -np.mean(scores)


# Run the search. The sampler is seeded so the 50 trials are repeatable.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best sampled values, plus the fixed settings the final model must share with
# the search (study.best_params only contains the suggested parameters).
best_params = study.best_params
best_params["random_state"] = 42
best_params["subsample_freq"] = 1


[I 2025-05-05 21:05:24,592] A new study created in memory with name: no-name-c5e3db11-128c-404e-8b35-b818e00f384d
[I 2025-05-05 21:05:31,164] Trial 0 finished with value: 5.600727980241846 and parameters: {'learning_rate': 0.23919345214646917, 'max_depth': 9, 'num_leaves': 70, 'subsample': 0.9958655875114446, 'colsample_bytree': 0.9815298106012842}. Best is trial 0 with value: 5.600727980241846.
[I 2025-05-05 21:05:35,972] Trial 1 finished with value: 5.84059923878602 and parameters: {'learning_rate': 0.17788310538425808, 'max_depth': 11, 'num_leaves': 22, 'subsample': 0.7648285776542907, 'colsample_bytree': 0.6858171744618093}. Best is trial 0 with value: 5.600727980241846.
[I 2025-05-05 21:05:41,291] Trial 2 finished with value: 6.801768311043638 and parameters: {'learning_rate': 0.16194491386431728, 'max_depth': 3, 'num_leaves': 139, 'subsample': 0.6477961020825846, 'colsample_bytree': 0.7840839371564658}. Best is trial 0 with value: 5.600727980241846.
[I 2025-05-05 21:05:49,030] Tr

In [None]:
!pip install optuna --upgrade --quiet


import optuna.visualization as vis
vis.plot_param_importances(study)


In [None]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability).
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model built from the parameters found by Optuna.
optuna_lgb_model = lgb.LGBMRegressor(**best_params)

# Fit on the training part only.
optuna_lgb_model.fit(X_train, y_train)

# Score the hold-out split so it is actually used: without this the validation
# rows are set aside but never tell us how the tuned model generalises.
optuna_val_preds = optuna_lgb_model.predict(X_val)
optuna_val_rmse = float(np.sqrt(np.mean((np.asarray(y_val) - optuna_val_preds) ** 2)))
print(f"Validation RMSE (tuned LightGBM): {optuna_val_rmse:.4f}")

# Predict on the test set, selecting columns in the same order as the training features.
X_test = test_df[X.columns]
optuna_preds = optuna_lgb_model.predict(X_test)


In [None]:
# Pair each test id with the tuned-LightGBM prediction rounded to two decimals,
# then write the submission file without the index column.
submission_df = test_df[["id"]].copy()
submission_df["ürün fiyatı"] = np.round(optuna_preds, 2)
submission_df.to_csv("final_submission_optuna.csv", index=False)


In [None]:
# Upper bound on the number of training rows given to the random forest.
# NOTE(review): 45504 equals the number of rows in the printed submission frame;
# whether that match is intentional is unclear - confirm.
RF_SAMPLE_SIZE = 45504

# Seeded random subsample of the training frame. The size is capped at the
# frame length because DataFrame.sample raises when asked for more rows than
# exist (sampling is without replacement).
rf_sample_df = train_df.sample(n=min(RF_SAMPLE_SIZE, len(train_df)), random_state=42)
X_rf = rf_sample_df.drop(columns=["tarih", "ürün fiyatı"])
y_rf = rf_sample_df["ürün fiyatı"]


In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the target and feature matrix from the full training frame.
drop_cols = ["tarih", "ürün fiyatı"]
y = train_df["ürün fiyatı"]
X = train_df.drop(columns=drop_cols)

# 80/20 hold-out split with a fixed seed; test features use the training column order.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts
X_test = test_df.loc[:, X.columns]


In [None]:
import lightgbm as lgb

# Baseline LightGBM regressor with hand-picked settings.
lgb_model = lgb.LGBMRegressor(
    random_state=42,
    n_estimators=40,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    # Row subsampling is only applied when the bagging frequency is non-zero;
    # with the default of 0 the subsample=0.8 above would be silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8
)
# Fit on the training part of the split and predict the test features.
lgb_model.fit(X_train, y_train)
lgb_preds = lgb_model.predict(X_test)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest settings: 40 trees, depth capped at 10, fixed seed, all CPU cores.
rf_params = {
    "n_estimators": 40,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestRegressor(**rf_params).fit(X_rf, y_rf)
rf_preds = rf_model.predict(X_test)


In [None]:
# Equal-weight blend of the LightGBM and random-forest predictions.
ensemble_preds = 0.5 * (lgb_preds + rf_preds)


In [None]:
# Pair each test id with the blended prediction rounded to two decimals,
# then write the submission file without the index column.
rounded_ensemble = np.round(ensemble_preds, 2)
submission_df = pd.DataFrame({"id": test_df["id"]}).assign(
    **{"ürün fiyatı": rounded_ensemble}
)
submission_df.to_csv("final_submission_rf_lgb.csv", index=False)


In [None]:
# Print a heading, a blank line, then the submission frame.
print("Tahminler virgülden sonra 2 basamakla:\n", submission_df, sep="\n")

Tahminler virgülden sonra 2 basamakla:

          id  ürün fiyatı
0          0        86.65
1          1        33.60
2          2        37.52
3          3        23.81
4          4        37.18
...      ...          ...
45499  45499        56.96
45500  45500        57.63
45501  45501        58.34
45502  45502        58.86
45503  45503        64.31

[45504 rows x 2 columns]
