<a href="https://colab.research.google.com/github/enescicek/YZTA-Datathon-2025/blob/main/DNN9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read later live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost lightgbm --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ✅ DNN V4 + Residual Ridge Boosting (Final Kurşun)

import numpy as np
import pandas as pd
from google.colab import files
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# 1. Load the data
# Both files are read from the academy2025 folder on the mounted Google Drive.
train = pd.read_csv("/content/drive/MyDrive/academy2025/train.csv")
test = pd.read_csv("/content/drive/MyDrive/academy2025/testFeatures.csv")

# 2. Tarih işle + yeni tarih bazlı feature'lar
def process_dates(df):
    """Parse the `tarih` column and derive calendar features from it.

    The frame is modified in place and the same object is returned, so
    callers may either use the return value or keep using their reference.

    Columns added:
        yıl            -- calendar year
        ay             -- month number (1-12)
        gün            -- day of the month
        hafta_günü     -- day of the week as given by `Series.dt.weekday` (Monday is 0)
        hafta_sonu_mu  -- 1 when `hafta_günü` is 5 or 6, otherwise 0
        sezon_index    -- `ay * 100 + gün`, a month-day ordinal such as 1231

    Args:
        df: DataFrame with a `tarih` column that `pd.to_datetime` can parse.

    Returns:
        The same DataFrame, with `tarih` converted to datetime and the
        columns above added.
    """
    df["tarih"] = pd.to_datetime(df["tarih"])
    dates = df["tarih"].dt
    df["yıl"] = dates.year
    df["ay"] = dates.month
    df["gün"] = dates.day
    df["hafta_günü"] = dates.weekday
    # Vectorised comparison instead of a per-row Python lambda.
    df["hafta_sonu_mu"] = (df["hafta_günü"] >= 5).astype(int)
    df["sezon_index"] = df["ay"] * 100 + df["gün"]
    return df

train = process_dates(train)
test = process_dates(test)

# 3. Label-encode the categorical columns
cat_cols = ["ürün", "ürün kategorisi", "ürün üretim yeri", "market", "şehir"]
cat_dims = {}
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    n_classes = len(le.classes_)
    # LabelEncoder.transform raises on labels it did not see during fit, so the
    # test column is mapped by hand and unseen values go to the spare index
    # `n_classes`. The embedding layers are built with cat_dims[col] + 1 rows,
    # so that index is valid for them.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test[col] = test[col].map(code_of).fillna(n_classes).astype(int)
    cat_dims[col] = n_classes

# 4. Feature Engineering V3

def group_mean_feature(df_train, df_test, group_cols, target_col, new_feature):
    """Add a per-group mean of `target_col` as a new column on both frames.

    The means are computed from `df_train` only and left-joined onto both
    frames, so a test row whose group does not occur in train gets NaN.
    Each training row's own target value is part of its group's mean.

    Args:
        df_train: Frame that contains `group_cols` and `target_col`.
        df_test: Frame that contains `group_cols`.
        group_cols: Column name, or list/tuple of column names, to group by.
        target_col: Column whose mean is taken within each group.
        new_feature: Name of the column that will hold the group mean.

    Returns:
        A `(train, test)` tuple of new DataFrames; the inputs are not modified.
    """
    # Accept a bare column name or any sequence, not only a list.
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    group_means = (
        df_train.groupby(keys)[target_col]
        .mean()
        .reset_index()
        .rename(columns={target_col: new_feature})
    )
    merged_train = df_train.merge(group_means, on=keys, how="left")
    merged_test = df_test.merge(group_means, on=keys, how="left")
    return merged_train, merged_test

# Mean price per (product, city), per (product, market) and per product.
mean_price_specs = [
    (["ürün", "şehir"], "ürün_şehir_mean"),
    (["ürün", "market"], "ürün_market_mean"),
    (["ürün"], "ürün_global_mean"),
]
for keys, feature_name in mean_price_specs:
    train, test = group_mean_feature(train, test, keys, "ürün fiyatı", feature_name)

# Gap between the product-wide mean price and the product's mean price in that city.
for frame in (train, test):
    frame["fiyat_diff"] = frame["ürün_global_mean"] - frame["ürün_şehir_mean"]

# 5. Feature Engineering V4
# Each entry is (grouping columns, statistic of "ürün fiyatı", name of the new column).
# Every statistic is computed on train and left-joined onto both train and test.
price_stat_specs = [
    (["ürün", "şehir"], "std", "ürün_şehir_fiyat_std"),
    (["ürün", "ay"], "mean", "ürün_ay_mean"),
    (["ürün kategorisi", "ay"], "mean", "kategori_ay_mean"),
    (["ürün"], "std", "ürün_genel_std"),
    (["şehir"], "mean", "şehir_global_mean"),
]
for keys, stat, feature_name in price_stat_specs:
    stat_table = (
        train.groupby(keys)["ürün fiyatı"]
        .agg(stat)
        .reset_index()
        .rename(columns={"ürün fiyatı": feature_name})
    )
    train = train.merge(stat_table, on=keys, how="left")
    test = test.merge(stat_table, on=keys, how="left")

# Product-in-city mean price relative to the city's overall mean price.
for frame in (train, test):
    frame["ürün_şehir_vs_global"] = frame["ürün_şehir_mean"] - frame["şehir_global_mean"]

# 6. Nutrition level: bucket "ürün besin değeri" into 0 (<= 100), 1 (100-200], 2 (> 200)
# The outer edges are open-ended so that a value of exactly 0, or one above 1000,
# still lands in a bucket instead of becoming NaN and breaking the integer cast.
bins = [-np.inf, 100, 200, np.inf]
labels = [0, 1, 2]
for df in [train, test]:
    nutrition_level = pd.cut(df["ürün besin değeri"], bins=bins, labels=labels)
    # The categorical codes equal the labels 0/1/2; a missing value becomes -1.
    df["besin_seviye"] = nutrition_level.cat.codes.astype(int)

# 7. Fill missing values with 0, then discard the raw date column
train = train.fillna(0).drop(columns=["tarih"])
test = test.fillna(0).drop(columns=["tarih"])

# 8. DNN V4 — numeric feature matrix
target = "ürün fiyatı"
# Every train column that is not an embedded categorical or the target is a
# numeric feature. "id" is also left out: the test frame carries it as the row
# identifier for the submission, and if train.csv has one too it must not be
# fed to the network as a feature.
excluded_cols = set(cat_cols) | {target, "id"}
numerical_cols = [col for col in train.columns if col not in excluded_cols]
X_num = train[numerical_cols]
X_test_num = test[numerical_cols]

# Standardise with statistics fitted on train only, then apply them to test.
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
X_test_num_scaled = scaler.transform(X_test_num)

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling repeat from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42
set_random_seed(SEED)

# One single-integer input and one embedding per categorical column.
cat_inputs = []
cat_embeds = []
for col in cat_cols:
    vocab_size = cat_dims[col] + 1
    input_cat = Input(shape=(1,))
    # Embedding width: half the vocabulary size, capped at 50.
    embed = Embedding(input_dim=vocab_size, output_dim=min(50, vocab_size // 2))(input_cat)
    embed = Flatten()(embed)
    cat_inputs.append(input_cat)
    cat_embeds.append(embed)

# The scaled numeric features are concatenated with the flattened embeddings.
num_input = Input(shape=(X_num_scaled.shape[1],))
x = Concatenate()(cat_embeds + [num_input])

# Hidden stack: Dense -> LeakyReLU -> BatchNorm (-> Dropout), narrowing from 256 to 32.
# The last hidden block has no dropout.
for units, dropout_rate in [(256, 0.3), (128, 0.3), (64, 0.3), (32, None)]:
    x = Dense(units)(x)
    x = LeakyReLU()(x)
    x = BatchNormalization()(x)
    if dropout_rate is not None:
        x = Dropout(dropout_rate)(x)

# Single linear output unit for the regression target.
output = Dense(1)(x)

model = Model(inputs=cat_inputs + [num_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

# Model inputs: one integer array per categorical column, then the scaled numeric matrix.
y_train = train[target].values
X_train_input = [*(train[col].values for col in cat_cols), X_num_scaled]
X_test_input = [*(test[col].values for col in cat_cols), X_test_num_scaled]

# No validation data is passed to fit(), so both callbacks watch the training loss.
lr_scheduler = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, min_lr=1e-5)
early_stop = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

model.fit(
    x=X_train_input,
    y=y_train,
    batch_size=256,
    epochs=150,
    callbacks=[early_stop, lr_scheduler],
    verbose=1,
)

# 9. DNN predictions for the test rows and for the training rows
pred_dnn = model.predict(X_test_input).flatten()
pred_dnn_train = model.predict(X_train_input).flatten()

# 10. Fit a Ridge regression on the scaled numeric features to the DNN's training residuals
residual = y_train - pred_dnn_train
ridge = Ridge().fit(X_num_scaled, residual)
pred_residual = ridge.predict(X_test_num_scaled)

# 11. Final prediction = DNN output + Ridge-predicted residual
final_pred = pred_dnn + pred_residual

# Write the submission file and trigger a browser download from Colab.
submission_path = "submission_dnnv4_residual.csv"
submission = pd.DataFrame({"id": test["id"], "ürün fiyatı": final_pred})
submission.to_csv(submission_path, index=False)
files.download(submission_path)

Epoch 1/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 12ms/step - loss: 226.6623 - learning_rate: 0.0010
Epoch 2/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - loss: 9.6978 - learning_rate: 0.0010
Epoch 3/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - loss: 8.5123 - learning_rate: 0.0010
Epoch 4/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - loss: 7.6164 - learning_rate: 0.0010
Epoch 5/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - loss: 6.6109 - learning_rate: 0.0010
Epoch 6/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - loss: 6.2713 - learning_rate: 0.0010
Epoch 7/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - loss: 6.0272 - learning_rate: 0.0010
Epoch 8/150
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - loss: 5.688

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>