# 03 - Model

Profit hedefiyle kısa bir model karşılaştırması.

In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Project root relative to the current working directory.
project_root = Path('..')

# Make <project_root>/src importable (presumably where the local ``evaluate``
# module imported below lives -- confirm).  The membership guard keeps
# sys.path free of duplicate entries when this cell is executed repeatedly.
src_dir = str(project_root / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from evaluate import evaluate_regression

# Read the cleaned dataset stored under <project_root>/data/processed.
data_path = project_root / 'data' / 'processed' / 'clean.csv'
df = pd.read_csv(data_path)

# Regression target; stop immediately when the column is not in the data.
target = 'Profit'
if target not in df.columns:
    raise ValueError(f'Target column not found: {target}')

y = df[target]
X = df.drop(columns=[target])

# For the profit target, profit_margin is excluded from the features as well
# (NOTE(review): presumably because it is derived from profit -- confirm).
drop_margin = target.lower() == 'profit' and 'profit_margin' in X.columns
if drop_margin:
    X = X.drop(columns=['profit_margin'])

# Columns whose name contains "date" are parsed as datetimes; values that
# cannot be parsed are coerced to NaT.
date_cols = [name for name in X.columns if 'date' in name.lower()]
for col in date_cols:
    X[col] = pd.to_datetime(X[col], errors='coerce')

# Datetime-typed columns are then removed from the feature matrix.
dt_cols = X.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns
if not dt_cols.empty:
    X = X.drop(columns=dt_cols)

# Numeric dtypes form the numeric feature list; every remaining column is
# treated as categorical.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = [name for name in X.columns if name not in num_cols]

# Numeric features: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column list to its matching transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Offset used by the log1p target transform, derived from the training target
# only: when its minimum is <= -0.999 the offset maps that minimum to 1.0,
# otherwise no offset is applied.
min_val = pd.to_numeric(y_train, errors='coerce').min()
if min_val <= -0.999:
    shift = float(-min_val + 1.0)
else:
    shift = 0.0

def _transform(values):
    """Add the module-level ``shift`` to ``values`` and return ``log1p`` of the result."""
    shifted = values + shift
    return np.log1p(shifted)

def _inverse(values):
    """Apply ``expm1`` to ``values`` and subtract the module-level ``shift``."""
    restored = np.expm1(values)
    return restored - shift

# Every candidate is wrapped so that it is fit on the transformed target
# (_transform) and its predictions are mapped back with _inverse.
_target_transform = {'func': _transform, 'inverse_func': _inverse}

_forest = RandomForestRegressor(
    n_estimators=200,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

# Candidate models keyed by the label shown in the results table.
models = {
    'LinearRegression': TransformedTargetRegressor(
        regressor=LinearRegression(), **_target_transform
    ),
    'RandomForestRegressor': TransformedTargetRegressor(
        regressor=_forest, **_target_transform
    ),
}

# Fit every candidate on the training split and score it on the held-out split.
rows = []
for name, model in models.items():
    # Each model gets its own unfitted copy of the shared preprocessor.
    pipeline = Pipeline([('preprocess', clone(preprocessor)), ('model', model)])
    preds = pipeline.fit(X_train, y_train).predict(X_test)
    metrics = evaluate_regression(y_test, preds)

    row = {'model': name}
    row.update(metrics)
    rows.append(row)

# One row per model, sorted ascending by the 'rmse' column
# (presumably one of the metrics returned by evaluate_regression -- confirm).
results = pd.DataFrame(rows).sort_values(by='rmse')
results
