In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


# Presumably a gross-margin ratio; NOTE(review): not referenced by any code
# visible in this file — confirm whether it is still needed.
GROSS_MARGIN = 0.40
# Upper cap applied to ROI values, both during preprocessing and when
# scoring candidate strategies.
MAX_ROI = 3.0
# Default lower/upper bounds used by find_best_strategy to clamp the
# predicted campaign duration.
MIN_DURATION = 3
MAX_DURATION = 60

def load_and_preprocess_data(filepath):
    """Load the campaign CSV and return a fully numeric, model-ready frame.

    Processing steps, in order:
      1. read the CSV at ``filepath``;
      2. fill missing numeric values with the column median and missing text
         values with the column mode;
      3. clip IQR outliers in ``budget``, ``revenue``, ``clicks`` and
         ``conversions``;
      4. derive ``year``/``month``/``day``/``season`` from ``campaign_date``
         when that column exists;
      5. compute ``net_profit`` and ``roi`` (capped at ``MAX_ROI``);
      6. label-encode every object-typed column;
      7. drop ``campaign_id`` and ``campaign_date``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        ``(df, label_encoders)`` where ``label_encoders`` maps each encoded
        column name to its fitted ``LabelEncoder``; ``(None, None)`` when the
        file cannot be read.

    Raises:
        KeyError: If the data has no ``revenue`` or ``budget`` column.
    """
    # The previous implementation ignored ``filepath`` and always read this
    # location. It is kept only as an explicit, announced fallback so that
    # existing Colab runs keep working.
    legacy_colab_path = '/content/kozmetik_dataset_gercekci_dagilim.csv'

    print(f"Veri dosyası yükleniyor: {filepath}")
    try:
        try:
            df = pd.read_csv(filepath)
        except FileNotFoundError:
            print(f"UYARI: '{filepath}' bulunamadı, '{legacy_colab_path}' deneniyor")
            df = pd.read_csv(legacy_colab_path)
        print(f"Veri başarıyla yüklendi - {len(df)} kayıt")
    except Exception as e:
        # Callers rely on the (None, None) sentinel rather than an exception.
        print(f"Veri yükleme hatası: {e}")
        return None, None

    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print(f"Eksik değerler tespit edildi:\n{missing_values[missing_values > 0]}")
        print("Eksik değerler medyan/mod ile doldurulacak")

        for col in df.select_dtypes(include='number').columns:
            df[col] = df[col].fillna(df[col].median())

        for col in df.select_dtypes(include=['object']).columns:
            modes = df[col].mode()
            # An all-NaN column has no mode; leave it alone instead of
            # failing with IndexError on modes[0].
            if not modes.empty:
                df[col] = df[col].fillna(modes[0])

    print(df.describe().T)

    # Clip IQR outliers, visiting the columns in their DataFrame order.
    clip_candidates = {'budget', 'revenue', 'clicks', 'conversions'}
    numeric_cols = df.select_dtypes(include='number').columns
    for col in (c for c in numeric_cols if c in clip_candidates):
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_count = int(((df[col] < lower_bound) | (df[col] > upper_bound)).sum())
        if outlier_count > 0:
            print(f"Aykırı değerler {col} sütununda: {outlier_count} kayıt")
            df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

    if 'campaign_date' in df.columns:
        df['campaign_date'] = pd.to_datetime(df['campaign_date'])
        df['year'] = df['campaign_date'].dt.year
        df['month'] = df['campaign_date'].dt.month
        df['day'] = df['campaign_date'].dt.day
        season_by_month = {
            12: 'Winter', 1: 'Winter', 2: 'Winter',
            3: 'Spring', 4: 'Spring', 5: 'Spring',
            6: 'Summer', 7: 'Summer', 8: 'Summer',
        }
        # Every month not listed above is labelled 'Fall'.
        df['season'] = df['month'].apply(lambda m: season_by_month.get(m, 'Fall'))

    df['net_profit'] = df['revenue'] - df['budget']
    # A zero budget is replaced by 1 to avoid division by zero.
    df['roi'] = df['net_profit'] / df['budget'].replace(0, 1)
    df['roi'] = df['roi'].clip(upper=MAX_ROI)

    categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
    print(f"\nKategorik değişkenler: {categorical_cols}")

    label_encoders = {}
    for col in categorical_cols:
        print(f"{col} için benzersiz değerler: {df[col].nunique()}")
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder

    # Identifier and raw date are not used as model inputs.
    df = df.drop(columns=['campaign_id', 'campaign_date'], errors='ignore')

    return df, label_encoders

def _importance_labels(model, fallback_names, n_features):
    """Return one label per importance value for a fitted pipeline.

    The names reported by the pipeline's preprocessing steps are preferred,
    because the preprocessor may drop columns. ``fallback_names`` is used when
    the pipeline cannot report names, and generic labels are the last resort
    when the counts still do not match.
    """
    labels = None
    try:
        # model[:-1] is the pipeline without its final estimator.
        reported = model[:-1].get_feature_names_out()
        # Strip the "<transformer>__" prefix a ColumnTransformer adds.
        labels = [str(name).split('__', 1)[-1] for name in reported]
    except (AttributeError, TypeError, ValueError):
        labels = None

    if labels is None or len(labels) != n_features:
        labels = list(fallback_names)
    if len(labels) != n_features:
        labels = [f'feature_{k}' for k in range(n_features)]
    return labels


def feature_importance_analysis(df, models, target_names):
    """Plot and print feature importances for every model that exposes them.

    One horizontal bar chart per target is drawn into a single figure, saved
    as ``feature_importance.png``; the five most important features of each
    model are also printed.

    Args:
        df: Preprocessed DataFrame the models were trained from; used only to
            derive fallback feature names.
        models: Mapping of target column name -> fitted pipeline whose last
            step is the estimator.
        target_names: Mapping of target column name -> display name.
    """
    print("\n--- Özellik Önem Analizi ---")

    fig = plt.figure(figsize=(14, len(target_names) * 5))

    # Feature columns as seen by the caller: everything except targets and ROI.
    excluded = set(target_names) | {'roi'}
    fallback_names = [col for col in df.columns if col not in excluded]

    for position, (target, model) in enumerate(models.items(), 1):
        estimator = model[-1]
        if not hasattr(estimator, 'feature_importances_'):
            continue

        importances = estimator.feature_importances_
        labels = _importance_labels(model, fallback_names, len(importances))

        # Most important feature first.
        order = np.argsort(importances)[::-1]

        plt.subplot(len(target_names), 1, position)
        plt.barh(range(len(order)), importances[order], align='center')
        plt.yticks(range(len(order)), [labels[k] for k in order])
        plt.title(f'{target_names[target]} için Özellik Önemliliği')
        plt.xlabel('Önem Skoru')
        plt.tight_layout()

        print(f"\n{target_names[target]} için en önemli 5 özellik:")
        for k in order[:5]:
            print(f"  {labels[k]}: {importances[k]:.4f}")

    fig.savefig('feature_importance.png')
    plt.close(fig)
    print("\nÖzellik önem analizi grafiği 'feature_importance.png' olarak kaydedildi")

def train_models(df, tune_hyperparams=False):
    """Train one regression pipeline per target and report hold-out metrics.

    Gradient boosting is used for ``duration_days`` and ``clicks``; a random
    forest is used for the remaining targets. Each pipeline median-imputes and
    standardises the numeric feature columns before the estimator. For every
    target a predicted-vs-actual scatter plot is saved as
    ``<target>_pred_vs_actual.png``, and a feature-importance report is
    produced at the end.

    Args:
        df: Preprocessed DataFrame containing the feature columns, the five
            target columns and ``roi``.
        tune_hyperparams: When True, run a 3-fold grid search per target and
            keep the best estimator.

    Returns:
        ``(models, performance)`` — both keyed by target column name.
        ``performance[target]`` holds ``mse``, ``rmse``, ``mae`` and ``r2``
        measured on the 20% hold-out split.
    """
    targets = {
        'duration_days': 'Kampanya Süresi',
        'clicks': 'Tıklanma Sayısı',
        'conversions': 'Dönüşüm Sayısı',
        'net_profit': 'Net Kar',
        'revenue': 'Gelir'
    }
    boosted_targets = ('duration_days', 'clicks')

    models = {}
    performance = {}

    # Features are everything that is neither a target nor the derived ROI.
    X = df.drop(list(targets.keys()) + ['roi'], axis=1)

    # Select every numeric dtype. Listing only int64/float64 would make the
    # ColumnTransformer silently drop other integer widths (e.g. int32
    # calendar columns produced by the ``.dt`` accessor on recent pandas).
    numeric_features = X.select_dtypes(include='number').columns

    print("\n--- Model Eğitimi Başlıyor ---")
    for target, target_name in targets.items():
        print(f"\n{target_name} modeli eğitiliyor...")
        y = df[target]

        # Fixed random_state: every target is evaluated on the same row split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        param_grid = None
        if target in boosted_targets:
            base_model = GradientBoostingRegressor(random_state=42)
            if tune_hyperparams:
                param_grid = {
                    'model__n_estimators': [50, 100, 200],
                    'model__learning_rate': [0.01, 0.05, 0.1],
                    'model__max_depth': [3, 5, 7]
                }
        else:
            base_model = RandomForestRegressor(random_state=42)
            if tune_hyperparams:
                param_grid = {
                    'model__n_estimators': [50, 100, 200],
                    'model__max_depth': [None, 10, 20],
                    'model__min_samples_split': [2, 5, 10]
                }

        # Build a fresh preprocessor per target: Pipeline.fit fits its steps
        # in place, so a shared instance would be refit by every pipeline.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num',
                 Pipeline(steps=[
                     ('imputer', SimpleImputer(strategy='median')),
                     ('scaler', StandardScaler())
                 ]),
                 numeric_features)
            ])

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', base_model)
        ])

        if tune_hyperparams:
            print(f"  Hiperparametre optimizasyonu yapılıyor...")
            grid_search = GridSearchCV(pipeline, param_grid, cv=3,
                                       scoring='neg_mean_squared_error', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            pipeline = grid_search.best_estimator_
            print(f"  En iyi parametreler: {grid_search.best_params_}")
        else:
            pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        performance[target] = {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }

        print(f"  {target_name} Modeli - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}")

        # Predicted-vs-actual scatter with the ideal y = x line for reference.
        fig = plt.figure(figsize=(10, 6))
        plt.scatter(y_test, y_pred, alpha=0.5)
        value_range = [y_test.min(), y_test.max()]
        plt.plot(value_range, value_range, 'r--')
        plt.xlabel('Gerçek Değerler')
        plt.ylabel('Tahmin Değerler')
        plt.title(f'{target_name} - Gerçek vs Tahmin')
        fig.savefig(f'{target}_pred_vs_actual.png')
        plt.close(fig)

        models[target] = pipeline

    feature_importance_analysis(df, models, targets)

    return models, performance

def find_best_strategy(df, models, region, budget, brand, product_category, min_duration=MIN_DURATION, max_duration=MAX_DURATION):
    """Score every campaign-type/channel pair and return the best one.

    A candidate row is built for each (campaign_type, channel) combination
    found in ``df``. The caller's inputs and a fixed reference date fill the
    known columns; every other feature column is set to its median (numeric)
    or mode (otherwise). All models are then evaluated on the candidates and
    each candidate is scored as
    ``0.6 * roi + 0.3 * net_profit / max(10000, budget)
    + 0.1 * conversions / budget``.

    Args:
        df: Preprocessed (label-encoded) training DataFrame.
        models: Mapping of target name -> fitted pipeline; must contain at
            least ``revenue`` and ``conversions``.
        region: Encoded region value.
        budget: Campaign budget; must be positive.
        brand: Encoded brand value.
        product_category: Encoded product category value.
        min_duration: Lower clamp for the predicted duration.
        max_duration: Upper clamp for the predicted duration.

    Returns:
        ``((campaign_type, channel), predictions)`` for the highest-scoring
        candidate, or ``(None, None)`` when the budget is not positive or
        there is nothing to evaluate.
    """
    if budget <= 0:
        print("UYARI: Bütçe pozitif bir değer olmalıdır!")
        return None, None

    # Values shared by every candidate. The date fields are a fixed reference
    # date; 'season' is given as an already-encoded integer.
    base_row = {
        'product_category': product_category,
        'brand': brand,
        'budget': budget,
        'region': region,
        'year': 2024,
        'month': 6,
        'day': 15,
        'season': 2
    }

    # Fill the remaining feature columns once; these defaults do not depend
    # on the candidate, so there is no need to recompute them per strategy.
    non_features = {'duration_days', 'clicks', 'conversions',
                    'net_profit', 'revenue', 'roi'}
    varied = {'campaign_type', 'channel'}
    numeric_cols = set(df.select_dtypes(include=['int64', 'float64']).columns)
    for col in df.columns:
        if col in base_row or col in non_features or col in varied:
            continue
        base_row[col] = df[col].median() if col in numeric_cols else df[col].mode()[0]

    campaign_types = sorted(df['campaign_type'].unique())
    channels = sorted(df['channel'].unique())
    candidates = [(ct, ch) for ct in campaign_types for ch in channels]

    print(f"\nToplam {len(candidates)} strateji değerlendiriliyor...")
    if not candidates:
        return None, None

    # One row per candidate, so each model predicts once for all of them.
    input_df = pd.DataFrame(
        [{**base_row, 'campaign_type': ct, 'channel': ch} for ct, ch in candidates])
    raw_predictions = {target: model.predict(input_df)
                       for target, model in models.items()}

    results = []
    for row, (campaign_type, channel) in enumerate(candidates):
        predictions = {}
        for target, values in raw_predictions.items():
            value = float(values[row])
            if target == 'duration_days':
                value = max(min_duration, min(max_duration, value))
            elif target in ('clicks', 'conversions'):
                value = max(0.0, value)
            predictions[target] = value

        # Cap revenue first, then derive profit and ROI from the capped
        # figure so the three reported numbers always agree with each other
        # and with the score.
        predictions['revenue'] = min(predictions['revenue'], budget * (MAX_ROI + 1))
        predictions['net_profit'] = predictions['revenue'] - budget
        predictions['roi'] = min(predictions['net_profit'] / budget, MAX_ROI)

        conversion_factor = predictions['conversions'] / budget
        score = (0.6 * predictions['roi'] +
                 0.3 * (predictions['net_profit'] / max(10000, budget)) +
                 0.1 * conversion_factor)

        results.append({
            'campaign_type': campaign_type,
            'channel': channel,
            'score': score,
            'predictions': predictions
        })

    results.sort(key=lambda item: item['score'], reverse=True)

    print("\n--- En İyi 3 Strateji ---")
    for rank, result in enumerate(results[:3], 1):
        print(f"{rank}. Strateji: Kampanya Tipi - {result['campaign_type']}, Kanal - {result['channel']}")
        print(f"   Skor: {result['score']:.4f}, ROI: {result['predictions']['roi']:.2f}, " +
              f"Net Kar: {result['predictions']['net_profit']:.2f} TL")

    best_result = results[0]
    return (best_result['campaign_type'], best_result['channel']), best_result['predictions']

def _ordered_band(value, low_factor=0.85, high_factor=1.15):
    """Return the (lower, upper) band around ``value``, ordered for negatives too."""
    first = value * low_factor
    second = value * high_factor
    return (first, second) if first <= second else (second, first)


def user_interface(df, label_encoders, models):
    """Run one interactive recommendation round on the console.

    Lists the known regions, brands and product categories, reads the user's
    choices and budget from stdin, asks ``find_best_strategy`` for the best
    campaign type and channel, and prints the recommendation with its
    expected performance figures.

    Args:
        df: Preprocessed (label-encoded) DataFrame.
        label_encoders: Mapping of column name -> fitted ``LabelEncoder``;
            must contain ``region``, ``brand``, ``product_category``,
            ``campaign_type`` and ``channel``.
        models: Mapping of target name -> fitted pipeline.
    """
    print("\n" + "="*50)
    print("*** Kozmetik Kampanya Optimizasyon Sistemi ***")
    print("="*50 + "\n")

    print("Mevcut Bölgeler:")
    for number, region in enumerate(sorted(label_encoders['region'].classes_), 1):
        print(f"  {number}. {region}")

    print("\nMevcut Markalar:")
    for number, brand in enumerate(sorted(label_encoders['brand'].classes_), 1):
        print(f"  {number}. {brand}")

    print("\nMevcut Ürün Kategorileri:")
    for number, category in enumerate(sorted(label_encoders['product_category'].classes_), 1):
        print(f"  {number}. {category}")

    print("\n" + "-"*50)

    # Any invalid answer restarts the whole questionnaire.
    while True:
        try:
            region = input("\nBölge seçiniz: ").strip()
            if region not in label_encoders['region'].classes_:
                print(f"HATA: '{region}' geçerli bir bölge değil. Yukarıdaki listeden bir bölge seçiniz.")
                continue

            budget_input = input("Bütçe giriniz (TL): ").strip()
            budget = float(budget_input)
            # float() also accepts 'nan' and 'inf'; neither is a usable budget.
            if not np.isfinite(budget) or budget <= 0:
                print("HATA: Bütçe pozitif bir sayı olmalıdır.")
                continue

            brand = input("Marka seçiniz: ").strip()
            if brand not in label_encoders['brand'].classes_:
                print(f"HATA: '{brand}' geçerli bir marka değil. Yukarıdaki listeden bir marka seçiniz.")
                continue

            product_category = input("Ürün kategorisi seçiniz: ").strip()
            if product_category not in label_encoders['product_category'].classes_:
                print(f"HATA: '{product_category}' geçerli bir ürün kategorisi değil. Yukarıdaki listeden bir kategori seçiniz.")
                continue

            break
        except ValueError as e:
            print(f"HATA: Geçersiz giriş. {e}")

    print("\nStratejiler değerlendiriliyor...")

    # The models were trained on encoded values, so encode the user's choices.
    region_encoded = label_encoders['region'].transform([region])[0]
    brand_encoded = label_encoders['brand'].transform([brand])[0]
    product_category_encoded = label_encoders['product_category'].transform([product_category])[0]

    best_strategy, predictions = find_best_strategy(
        df, models, region_encoded, budget, brand_encoded, product_category_encoded)

    if best_strategy is None:
        print("Strateji değerlendirmesi başarısız oldu! Lütfen girdilerinizi kontrol ediniz.")
        return

    campaign_type, channel = best_strategy

    # Translate the encoded winners back to their original labels.
    campaign_type_name = label_encoders['campaign_type'].inverse_transform([campaign_type])[0]
    channel_name = label_encoders['channel'].inverse_transform([channel])[0]

    clicks = predictions['clicks']
    conversions = predictions['conversions']

    print("\n" + "="*50)
    print("*** Önerilen Kampanya Stratejisi ***")
    print("="*50)
    print(f"Bölge: {region}")
    print(f"Marka: {brand}")
    print(f"Ürün Kategorisi: {product_category}")
    print(f"Bütçe: {budget:.2f} TL")
    print(f"\nKampanya Tipi: {campaign_type_name}")
    print(f"Kanal: {channel_name}")
    print(f"Önerilen Süre: {int(round(predictions['duration_days']))} gün")

    print("\n" + "-"*50)
    print("*** Beklenen Performans Göstergeleri ***")
    print("-"*50)
    print(f"Tahmini Tıklanma: {int(round(clicks))}")
    print(f"Tahmini Dönüşüm: {int(round(conversions))}")
    # Keep the label when the ratio is undefined, so "N/A" is never printed
    # on a line of its own without context.
    if clicks > 0:
        print(f"Dönüşüm Oranı: {(conversions / clicks * 100):.2f}%")
    else:
        print("Dönüşüm Oranı: N/A")
    print(f"Tahmini Brüt Gelir: {predictions['revenue']:.2f} TL")
    print(f"Tahmini Net Kar: {predictions['net_profit']:.2f} TL")
    print(f"Hesaplanan ROI: {predictions['roi']:.2f}x ({(predictions['roi']*100):.1f}%)")

    # budget is guaranteed positive by the validation loop above.
    print(f"Bütçe Başına Tıklama: {(clicks / budget):.4f} tık/TL")
    print(f"Bütçe Başına Dönüşüm: {(conversions / budget):.4f} dönüşüm/TL")
    if conversions > 0:
        print(f"Dönüşüm Başına Maliyet (CPA): {(budget / conversions):.2f} TL")
    else:
        print("Dönüşüm Başına Maliyet (CPA): N/A")

    # A fixed ±15% band around each estimate. Bounds are ordered explicitly
    # because scaling a negative value by 0.85 / 1.15 would invert them.
    profit_low, profit_high = _ordered_band(predictions['net_profit'])
    roi_low, roi_high = _ordered_band(predictions['roi'])
    conv_low, conv_high = _ordered_band(conversions)

    print("\n" + "-"*50)
    print("*** Güven Aralıkları (±15%) ***")
    print("-"*50)
    print(f"Net Kar: {profit_low:.2f} TL - {profit_high:.2f} TL")
    print(f"ROI: {roi_low:.2f}x - {roi_high:.2f}x")
    print(f"Dönüşüm: {int(conv_low)} - {int(conv_high)}")

def _save_bar_chart(data, title, ylabel, filename, figsize, legend=None):
    """Draw ``data`` as a bar chart on its own figure, save it and close it."""
    fig, ax = plt.subplots(figsize=figsize)
    # Passing ax explicitly matters for DataFrames: without it pandas opens a
    # new figure, ignoring the requested size and leaking the empty one.
    data.plot(kind='bar', ax=ax, rot=45)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    if legend is not None:
        ax.legend(legend)
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


def visualize_data(df):
    """Save four overview charts of the preprocessed campaign data as PNGs.

    Produces ``category_roi.png`` (mean ROI per product category),
    ``budget_roi.png`` (budget vs ROI scatter), ``channel_conversions.png``
    (mean conversions per channel) and ``campaign_metrics.png`` (mean clicks
    and conversions per campaign type) in the current working directory.

    Args:
        df: DataFrame with ``product_category``, ``roi``, ``budget``,
            ``channel``, ``conversions``, ``campaign_type`` and ``clicks``
            columns.
    """
    print("\nVeri görselleştirmeleri hazırlanıyor...")

    category_roi = df.groupby('product_category')['roi'].mean().sort_values(ascending=False)
    _save_bar_chart(
        category_roi,
        title='Ürün Kategorilerine Göre Ortalama ROI',
        ylabel='Ortalama ROI',
        filename='category_roi.png',
        figsize=(12, 6),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df['budget'], df['roi'], alpha=0.5)
    ax.set_title('Bütçe ve ROI İlişkisi')
    ax.set_xlabel('Bütçe')
    ax.set_ylabel('ROI')
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig('budget_roi.png')
    plt.close(fig)

    channel_conv = df.groupby('channel')['conversions'].mean().sort_values(ascending=False)
    _save_bar_chart(
        channel_conv,
        title='Kanallara Göre Ortalama Dönüşüm',
        ylabel='Ortalama Dönüşüm',
        filename='channel_conversions.png',
        figsize=(12, 6),
    )

    campaign_metrics = df.groupby('campaign_type')[['clicks', 'conversions']].mean()
    _save_bar_chart(
        campaign_metrics,
        title='Kampanya Türlerine Göre Ortalama Tıklama ve Dönüşüm',
        ylabel='Ortalama Değer',
        filename='campaign_metrics.png',
        figsize=(14, 7),
        # Legend order follows the column order selected above.
        legend=['Tıklama', 'Dönüşüm'],
    )

    print("Görselleştirmeler kaydedildi: category_roi.png, budget_roi.png, channel_conversions.png, campaign_metrics.png")

def main():
    """Run the full workflow: load data, plot, obtain models, serve queries.

    Previously saved models and label encoders are reused when both pickle
    files can be loaded; otherwise the models are trained (optionally with
    hyperparameter search) and saved. The function then keeps asking for
    campaign inputs until the user declines another round.
    """
    data_file = 'kozmetik_dataset_gercek_dagilimli.csv'
    df, label_encoders = load_and_preprocess_data(data_file)

    if df is None:
        print(f"HATA: '{data_file}' dosyası yüklenemedi!")
        return

    visualize_data(df)

    models = None
    try:
        # NOTE(security): joblib.load unpickles arbitrary objects; these
        # files must only ever come from a trusted source.
        models = joblib.load('kozmetik_models.pkl')
        cached_encoders = joblib.load('label_encoders.pkl')
    except FileNotFoundError:
        # Nothing saved yet: the normal first-run case.
        models = None
    except Exception as exc:
        # Corrupt or incompatible cache: report it and retrain. Unlike a bare
        # except, this lets KeyboardInterrupt / SystemExit propagate.
        print(f"Kayıtlı modeller yüklenemedi ({exc}); modeller yeniden eğitilecek.")
        models = None
    else:
        label_encoders = cached_encoders
        print("\nÖnceden eğitilmiş modeller yüklendi.")

    if models is None:
        answer = input("Hiperparametre optimizasyonu yapmak ister misiniz? (e/h, varsayılan: h): ")
        tune_models = answer.lower() == 'e'
        models, performance = train_models(df, tune_hyperparams=tune_models)

        joblib.dump(models, 'kozmetik_models.pkl')
        joblib.dump(label_encoders, 'label_encoders.pkl')

    while True:
        user_interface(df, label_encoders, models)
        another = input("\nBaşka bir tahmin yapmak ister misiniz? (e/h): ")
        if another.lower() != 'e':
            break

    print("\nProgram sonlandırıldı. Teşekkürler!")

# Start the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Veri dosyası yükleniyor: kozmetik_dataset_gercek_dagilimli.csv
Veri başarıyla yüklendi - 30000 kayıt
                   count          mean           std       min         25%  \
campaign_id      30000.0  15000.500000   8660.398374      1.00   7500.7500   
discount_rate    30000.0     19.221900     12.083479      0.00     10.0000   
budget           30000.0  17178.481333   8034.451168   2000.00  11366.7500   
impressions      30000.0  44678.024867  27485.513238   5000.00  24470.0000   
clicks           30000.0   1433.346867   1222.440232     20.00    533.0000   
conversions      30000.0    142.870967    150.428628      0.00     41.0000   
avg_order_value  30000.0    104.805477     58.068888    -34.93     64.8275   
revenue          30000.0  17340.616137  26856.691739 -13369.86   3119.1350   
roi              30000.0      0.144357      0.804863     -0.80     -0.4100   
duration_days    30000.0     15.924167      7.787959      3.00      9.0000   

                       50%       75%    