<a href="https://colab.research.google.com/github/dilarashnn/online_retail_etl_project/blob/main/online_retail_etl_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing libraries for data handling, visualization, and machine learning
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


# Number of sales rows read per chunk
CHUNK_SIZE = 500

# Input files
DATA_PATH = 'data/raw_sales.csv'
CUSTOMERS_PATH = 'data/customers.csv'
PRODUCTS_PATH = 'data/products.csv'

# Output artifacts: processed rows, descriptive statistics, plot directory
OUTPUT_PROCESSED = 'output/processed_sales.csv'
OUTPUT_STATS = 'output/stats.csv'
OUTPUT_PLOTS = 'output/plots/'

# Create the output directories up front; existing directories are left alone
os.makedirs('output', exist_ok=True)
os.makedirs(OUTPUT_PLOTS, exist_ok=True)

# Extract
# Stream the sales data in chunks; the customer and product tables are loaded in full below
def extract_sales(path=DATA_PATH, chunksize=None):
    """Yield the sales CSV at *path* as successive DataFrame chunks.

    Args:
        path: CSV file to read; defaults to DATA_PATH.
        chunksize: Maximum number of rows per yielded chunk. ``None`` (the
            default) uses the module-level CHUNK_SIZE, looked up at call time.

    Yields:
        pandas.DataFrame: the next block of rows from the file.
    """
    if chunksize is None:
        chunksize = CHUNK_SIZE
    # Using the reader as a context manager (pandas >= 1.2) closes the file
    # handle as soon as iteration finishes or the generator is closed,
    # instead of leaving it open until garbage collection.
    with pd.read_csv(path, chunksize=chunksize) as reader:
        yield from reader

# Load the customer and product tables in full (no chunking); transform() merges them into each sales chunk
df_customers, df_products = (
    pd.read_csv(lookup_path) for lookup_path in (CUSTOMERS_PATH, PRODUCTS_PATH)
)

# Transform
def _save_prediction_plot(df, plot_id):
    """Save an actual-vs-predicted scatter plot of ``total`` into OUTPUT_PLOTS."""
    fig = plt.figure(figsize=(6,6))
    try:
        sns.scatterplot(x='total', y='predicted_total', data=df)
        plt.title('Gerçek vs Tahmin Edilen Total')
        plt.xlabel('Gerçek Total')
        plt.ylabel('Tahmin Total')
        plt.savefig(os.path.join(OUTPUT_PLOTS, f'total_vs_predicted_{plot_id}.png'))
    finally:
        # Close this specific figure even when drawing or saving fails,
        # so open figures do not accumulate across chunks.
        plt.close(fig)


def transform(chunk, plot_id=None):
    """Enrich one sales chunk, drop outliers, add model predictions and save a plot.

    Steps, in order:
      1. Left-merge the chunk with ``df_customers`` (on ``customer_id``) and
         ``df_products`` (on ``product_id``).
      2. Parse ``order_date`` as datetime and treat a missing ``discount`` as 0.
      3. Compute ``total = price * quantity * (1 - discount)`` and keep only rows
         at or below the 99th percentile of ``total``. Rows whose ``total`` is
         NaN fail that comparison and are dropped as well.
      4. Bucket ``customer_age`` into ``age_group``.
      5. Fit a random forest on 80% of the remaining rows and store its
         predictions for every row in ``predicted_total``.
      6. Save an actual-vs-predicted scatter plot as a PNG.

    NOTE: the percentile cut-off and the model are both computed per chunk, so
    results depend on how the data is chunked.

    Args:
        chunk: DataFrame of sales rows. After the merges the frame must contain
            ``order_date``, ``discount``, ``price``, ``quantity`` and
            ``customer_age``; which of these come from the chunk and which from
            the lookup tables is not visible here.
        plot_id: Suffix for the plot file name. Defaults to the first index
            label of ``chunk``; chunks produced by ``pd.read_csv(chunksize=...)``
            carry a running row index, so that label differs per chunk and the
            file name is stable across reruns.

    Returns:
        The merged and filtered DataFrame with ``total``, ``age_group`` and
        ``predicted_total`` added. When fewer than two rows survive filtering
        no model is fitted, ``predicted_total`` is NaN and no plot is written.
    """
    # Merge customer and product attributes into the sales rows
    df = chunk.merge(df_customers, on='customer_id', how='left')
    df = df.merge(df_products, on='product_id', how='left')

    # Convert 'order_date' to datetime and fill missing 'discount' values with 0
    df['order_date'] = pd.to_datetime(df['order_date'])
    df['discount'] = df['discount'].fillna(0)

    # Calculate total price after discount and remove top 1% outliers
    df['total'] = df['price'] * df['quantity'] * (1 - df['discount'])
    upper_limit = df['total'].quantile(0.99)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice (avoids SettingWithCopyWarning).
    df = df[df['total'] <= upper_limit].copy()

    # Categorize customer ages into groups like teen, young, adult, etc.
    df['age_group'] = pd.cut(df['customer_age'],
                             bins=[0,18,30,45,60,100],
                             labels=["teen","young","adult","mid-age","senior"])

    # train_test_split with test_size=0.2 needs at least two rows to leave a
    # non-empty training set; a tiny final chunk would otherwise crash the run.
    if len(df) < 2:
        df['predicted_total'] = np.nan
        return df

    # Train a Random Forest to predict total and add predictions to the DataFrame.
    # Only the training part of the split is used; the held-out part is discarded.
    features = ['price', 'quantity', 'discount']
    X_train, _, y_train, _ = train_test_split(df[features], df['total'], test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)
    df['predicted_total'] = model.predict(df[features])

    # Visualize actual vs predicted total and save the figure as a PNG file.
    # A deterministic id replaces the former random suffix, which could collide
    # and silently overwrite another chunk's plot.
    if plot_id is None:
        plot_id = chunk.index[0]
    _save_prediction_plot(df, plot_id)

    return df

# Load

all_chunks = []

# Process each sales chunk, write it to the processed CSV, and collect it for the summary.
# NOTE: every transformed chunk is kept in memory, so peak memory grows with the full dataset.
for chunk_number, chunk in enumerate(extract_sales()):
    transformed = transform(chunk)

    # The first chunk truncates the file and writes the header; later chunks append.
    # Appending unconditionally would stack this run's rows onto a file left by a previous run.
    is_first = chunk_number == 0
    transformed.to_csv(OUTPUT_PROCESSED, mode='w' if is_first else 'a', index=False, header=is_first)

    # Add chunk to list for later use
    all_chunks.append(transformed)

# pd.concat raises an opaque ValueError on an empty list; fail with a clear message instead
if not all_chunks:
    raise ValueError("extract_sales() yielded no chunks; nothing to process.")

# Combine all chunks into one DataFrame, generate descriptive stats, and save as CSV
all_data = pd.concat(all_chunks, ignore_index=True)
stats = all_data.describe(include='all').transpose()
stats.to_csv(OUTPUT_STATS)

print("✅ Pipeline tamamlandı!")
print(f"Processed data: {OUTPUT_PROCESSED}")
print(f"Stats summary: {OUTPUT_STATS}")
print(f"Plots: {OUTPUT_PLOTS}")


✅ Pipeline tamamlandı!
Processed data: output/processed_sales.csv
Stats summary: output/stats.csv
Plots: output/plots/
