<a href="https://colab.research.google.com/github/dilarashnn/online_retail_etl_project/blob/main/online_retail_etl_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -------------------------
# 0️⃣ Kütüphaneler
# -------------------------
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# -------------------------
# 1️⃣ File paths and output folders
# -------------------------
# Rows read per chunk by extract_sales().
CHUNK_SIZE = 500

# Input files.
DATA_PATH = "data/raw_sales.csv"
CUSTOMERS_PATH = "data/customers.csv"
PRODUCTS_PATH = "data/products.csv"

# Output artefacts: processed rows, summary statistics, per-chunk plots.
OUTPUT_PROCESSED = "output/processed_sales.csv"
OUTPUT_STATS = "output/stats.csv"
OUTPUT_PLOTS = "output/plots/"

# Make sure the output folders exist before anything is written to them.
os.makedirs("output", exist_ok=True)
os.makedirs(OUTPUT_PLOTS, exist_ok=True)

# -------------------------
# 2️⃣ Extract
# -------------------------
def extract_sales(path=DATA_PATH):
    """Lazily yield the sales CSV as a sequence of DataFrame chunks.

    Args:
        path: Location of the CSV file to read. Defaults to ``DATA_PATH``.

    Yields:
        One DataFrame per chunk, each holding at most ``CHUNK_SIZE`` rows,
        in file order.
    """
    # With ``chunksize`` set, read_csv returns an iterator instead of a single
    # DataFrame, so the whole file is never held in memory at once.
    yield from pd.read_csv(path, chunksize=CHUNK_SIZE)

# Customer and product lookup tables are loaded in full, once; transform()
# left-joins every sales chunk against them.
df_customers = pd.read_csv(filepath_or_buffer=CUSTOMERS_PATH)
df_products = pd.read_csv(filepath_or_buffer=PRODUCTS_PATH)

# -------------------------
# 3️⃣ Transform + ML + Analysis
# -------------------------
def _fit_and_predict_total(df, features):
    """Fit a random forest on an 80/20 split of ``df`` and predict ``total`` for every row."""
    # Only the training part of the split is used; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        df[features], df['total'], test_size=0.2, random_state=42
    )
    model = RandomForestRegressor(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)
    return model.predict(df[features])


def _save_prediction_plot(df, plot_path):
    """Write a scatter plot of actual vs. predicted ``total`` to ``plot_path``."""
    fig = plt.figure(figsize=(6, 6))
    try:
        sns.scatterplot(x='total', y='predicted_total', data=df)
        plt.title('Gerçek vs Tahmin Edilen Total')
        plt.xlabel('Gerçek Total')
        plt.ylabel('Tahmin Total')
        plt.savefig(plot_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)


def transform(chunk: pd.DataFrame) -> pd.DataFrame:
    """Enrich, clean and score one chunk of sales rows.

    Steps:
      1. Left-join the chunk with the module-level ``df_customers`` (on
         ``customer_id``) and ``df_products`` (on ``product_id``).
      2. Parse ``order_date`` and treat a missing ``discount`` as 0.
      3. Compute ``total = price * quantity * (1 - discount)`` and drop rows
         above the chunk's own 99th percentile of ``total``.
      4. Bucket ``customer_age`` into ``age_group``.
      5. Fit a random forest on ``price``/``quantity``/``discount`` and store
         its prediction for every row in ``predicted_total``.
      6. Save an actual-vs-predicted scatter plot under ``OUTPUT_PLOTS``.

    Args:
        chunk: Raw sales rows. Reads ``customer_id``, ``product_id``,
            ``order_date``, ``discount``, ``price``, ``quantity`` and
            ``customer_age`` from the merged frame.

    Returns:
        A new DataFrame with the added ``total``, ``age_group`` and
        ``predicted_total`` columns. When fewer than two rows survive the
        filtering, no model can be trained: ``predicted_total`` is NaN and no
        plot is written.
    """
    # Name the plot after the chunk's own row range. merge() below replaces
    # the index, so this must be captured first. A random suffix could collide
    # between chunks and left new files behind on every run.
    # NOTE(review): assumes the default integer index that chunked read_csv
    # produces; other index types would end up verbatim in the file name.
    if len(chunk.index):
        plot_tag = f'{chunk.index[0]}_{chunk.index[-1]}'
    else:
        plot_tag = 'empty'

    # Merge customer and product information.
    df = chunk.merge(df_customers, on='customer_id', how='left')
    df = df.merge(df_products, on='product_id', how='left')

    # Cleaning.
    df['order_date'] = pd.to_datetime(df['order_date'])
    df['discount'] = df['discount'].fillna(0)

    # Outlier removal: compute the total, then keep rows at or below this
    # chunk's 99th percentile. Rows whose total is NaN fail the comparison and
    # are dropped too. The threshold is per chunk, not global.
    df['total'] = df['price'] * df['quantity'] * (1 - df['discount'])
    upper_limit = df['total'].quantile(0.99)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered one.
    df = df[df['total'] <= upper_limit].copy()

    # New columns. Ages outside (0, 100] fall outside every bin and become NaN.
    df['age_group'] = pd.cut(df['customer_age'],
                             bins=[0, 18, 30, 45, 60, 100],
                             labels=["teen", "young", "adult", "mid-age", "senior"])

    # ML model: predict total.
    # NOTE: total is an exact function of these features and the predictions
    # cover the training rows as well, so the plot is not an out-of-sample
    # evaluation.
    features = ['price', 'quantity', 'discount']
    if len(df) < 2:
        # train_test_split cannot build a non-empty training set from fewer
        # than two rows (e.g. a one-row final chunk); keep the output schema
        # stable instead of raising.
        df['predicted_total'] = np.nan
        return df
    df['predicted_total'] = _fit_and_predict_total(df, features)

    # Plot: actual vs. predicted.
    _save_prediction_plot(
        df, os.path.join(OUTPUT_PLOTS, f'total_vs_predicted_{plot_tag}.png')
    )

    return df

# -------------------------
# 4️⃣ Load and stats.csv
# -------------------------
# Every transformed chunk is kept in memory as well, because the summary
# statistics below are computed over the complete data set.
all_chunks = []

for chunk_number, chunk in enumerate(extract_sales()):
    transformed = transform(chunk)

    # The first chunk truncates the file and writes the header; later chunks
    # append. Checking os.path.exists() instead would make a second run append
    # to the previous run's output and duplicate every row.
    is_first_chunk = chunk_number == 0
    transformed.to_csv(
        OUTPUT_PROCESSED,
        mode='w' if is_first_chunk else 'a',
        index=False,
        header=is_first_chunk,
    )

    # Collect the chunk for the combined statistics.
    all_chunks.append(transformed)

# pd.concat() rejects an empty list with an opaque message; fail with a clear one.
if not all_chunks:
    raise ValueError(f"No rows were read from {DATA_PATH}; nothing to summarise.")

# Combine all the data and create stats.csv.
all_data = pd.concat(all_chunks, ignore_index=True)
stats = all_data.describe(include='all').transpose()
stats.to_csv(OUTPUT_STATS)

print("✅ Pipeline tamamlandı!")
print(f"Processed data: {OUTPUT_PROCESSED}")
print(f"Stats summary: {OUTPUT_STATS}")
print(f"Plots: {OUTPUT_PLOTS}")


✅ Pipeline tamamlandı!
Processed data: output/processed_sales.csv
Stats summary: output/stats.csv
Plots: output/plots/
