# In [1]:
import os
import sqlite3
import pandas as pd
import logging
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

# Configure logging: INFO-level records are written to a file under LOGS_PATH
LOGS_PATH = "logs/"
# Create the log directory first so the log file can be opened inside it
os.makedirs(LOGS_PATH, exist_ok=True)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=os.path.join(LOGS_PATH, 'data_transformation.log'),
)

# Define paths
# Inputs: cleaned CSV files expected under PROCESSED_DATA_PATH
PROCESSED_DATA_PATH = "processed_data/"
KAGGLE_CLEANED_DATA = os.path.join(PROCESSED_DATA_PATH, "Kaggle_cleaned.csv")
HUGGINGFACE_CLEANED_DATA = os.path.join(PROCESSED_DATA_PATH, "Hugging Face_cleaned.csv")

# Output: SQLite database file; its directory is created up front
DB_PATH = "database/churn_data.db"
os.makedirs("database", exist_ok=True)

# Today's date as YYYY-MM-DD, captured once at import time
LATEST_DATE = f"{datetime.now():%Y-%m-%d}"

# Load datasets
def load_data(file_path, source):
    """Read a CSV file into a DataFrame, or return None when it is missing.

    Args:
        file_path: Path of the CSV file to read.
        source: Dataset label; used only in the log messages.

    Returns:
        The DataFrame produced by ``pd.read_csv``, or ``None`` (after an
        error is logged) when nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        frame = pd.read_csv(file_path)
        logging.info(f"Loaded {source} dataset with {frame.shape[0]} rows and {frame.shape[1]} columns.")
        return frame

    # Missing input is reported through the log rather than raised
    logging.error(f"{source} dataset not found at {file_path}")
    return None

# Feature engineering
def feature_engineering(df):
    """Add a ``TotalSpend`` column when its two source columns are present.

    ``TotalSpend`` is ``tenure * MonthlyCharges``. The column is written
    onto ``df`` itself and that same object is returned. A frame lacking
    either source column is returned untouched.
    """
    needed = ('tenure', 'MonthlyCharges')
    if not all(name in df.columns for name in needed):
        return df

    df['TotalSpend'] = df['tenure'].mul(df['MonthlyCharges'])  # Example feature
    return df

# Normalize numerical features
def normalize_features(df):
    """Min-max scale the int64/float64 columns of ``df`` in place.

    Args:
        df: DataFrame to scale. Columns of any other dtype are not touched.

    Returns:
        The same DataFrame object, with its int64/float64 columns replaced
        by the output of ``MinMaxScaler.fit_transform``. A frame with no
        such columns, or with no rows, is returned unchanged.
    """
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    # MinMaxScaler raises ValueError on input with zero columns or zero
    # rows; there is nothing to scale in either case, so pass the frame on.
    if len(num_cols) == 0 or df.empty:
        return df

    scaler = MinMaxScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df

# Store transformed data into SQLite
def store_in_database(df, table_name, db_path=DB_PATH):
    """Write ``df`` to a SQLite table, replacing any existing table.

    Args:
        df: DataFrame to persist; its index is not written.
        table_name: Name of the destination table.
        db_path: Path of the SQLite database file. Defaults to ``DB_PATH``.

    Raises:
        Whatever ``sqlite3.connect`` or ``DataFrame.to_sql`` raises; the
        connection is closed before the exception propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        df.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        # Close even when to_sql fails so the connection is never leaked
        conn.close()
    logging.info(f"Stored transformed data in {table_name} table.")

if __name__ == "__main__":
    # Load both cleaned datasets up front; a missing file yields None
    kaggle_df = load_data(KAGGLE_CLEANED_DATA, "Kaggle")
    hf_df = load_data(HUGGINGFACE_CLEANED_DATA, "Hugging Face")

    for source, df in (("Kaggle", kaggle_df), ("Hugging Face", hf_df)):
        if df is None:
            # Dataset was not loaded; skip it and carry on with the rest
            continue
        df = normalize_features(feature_engineering(df))
        # Table names cannot carry the space in "Hugging Face"
        table_name = f"{source.replace(' ', '_')}_transformed"
        store_in_database(df, table_name)

    logging.info("Data transformation completed.")
