# In [None]:
import os
import sqlite3
import logging
import subprocess
from datetime import datetime

# Logging: records at INFO level and above are written to
# data_versioning.log inside LOGS_PATH, which is created on first use.
LOGS_PATH = "logs/"
os.makedirs(LOGS_PATH, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(LOGS_PATH, 'data_versioning.log'),
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Locations of the processed datasets and of the SQLite version registry.
DATA_DIR = "processed_data/"
DB_PATH = "database/data_versions.db"
# The registry's parent directory must exist before sqlite3 can create the file.
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

# Initialize SQLite database for version tracking
def initialize_versioning_db(db_path=None):
    """Create the ``data_versions`` table if it does not already exist.

    The table has an auto-incrementing ``version_id``, the ``dataset_name``
    and ``version_tag`` text columns, and a ``timestamp`` column that
    defaults to the insertion time.

    Args:
        db_path: Path of the SQLite file to initialize. When omitted, the
            module-level ``DB_PATH`` is used.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS data_versions (
                version_id INTEGER PRIMARY KEY AUTOINCREMENT,
                dataset_name TEXT,
                version_tag TEXT,
                timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Close even when table creation fails so the handle is not leaked.
        conn.close()
    logging.info("Initialized data versioning database.")

# Commit dataset version using DVC
def commit_dataset_version(dataset_name, version_tag):
    """Version a dataset with DVC and git, then record it in the database.

    The dataset is looked up under ``DATA_DIR``. If it exists, it is added to
    DVC, its pointer file is committed and tagged in git, both remotes are
    pushed, and only then is a row inserted into ``data_versions``.

    Args:
        dataset_name: File name of the dataset, relative to ``DATA_DIR``.
        version_tag: Name of the git tag to create; also stored in the
            database row.

    Returns:
        ``True`` if every step succeeded and the version was recorded,
        ``False`` if the dataset is missing or any step failed. Failures are
        logged rather than raised.
    """
    try:
        dataset_path = os.path.join(DATA_DIR, dataset_name)
        if not os.path.exists(dataset_path):
            logging.error(f"Dataset {dataset_name} not found.")
            return False

        _run_versioning_commands(dataset_path, dataset_name, version_tag)
        _record_version(dataset_name, version_tag)

        logging.info(f"Successfully versioned {dataset_name} as {version_tag}.")
        return True
    except Exception as e:
        # Best-effort boundary: a failed command or database write must not
        # crash the caller, but the traceback is kept in the log.
        logging.exception(f"Data versioning failed: {e}")
        return False


def _run_versioning_commands(dataset_path, dataset_name, version_tag):
    """Track the dataset with DVC, then commit, tag and push its pointer file."""
    subprocess.run(["dvc", "add", dataset_path], check=True)

    # `dvc add` also writes a .gitignore entry next to the dataset; stage it
    # together with the .dvc pointer so the working tree is left clean.
    paths_to_stage = [f"{dataset_path}.dvc"]
    gitignore_path = os.path.join(os.path.dirname(dataset_path), ".gitignore")
    if os.path.exists(gitignore_path):
        paths_to_stage.append(gitignore_path)
    subprocess.run(["git", "add", *paths_to_stage], check=True)

    subprocess.run(["git", "commit", "-m", f"Version {version_tag} of {dataset_name}"], check=True)
    subprocess.run(["git", "tag", version_tag], check=True)
    # NOTE(review): a plain `git push` does not transfer tags, so the tag
    # created above stays local unless it is pushed separately - confirm
    # whether that is intended.
    subprocess.run(["git", "push"], check=True)
    subprocess.run(["dvc", "push"], check=True)


def _record_version(dataset_name, version_tag):
    """Insert one row for the new version into the ``data_versions`` table."""
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("INSERT INTO data_versions (dataset_name, version_tag) VALUES (?, ?)",
                     (dataset_name, version_tag))
        conn.commit()
    finally:
        # Close even when the insert fails so the handle is not leaked.
        conn.close()

if __name__ == "__main__":
    # The tracking table must exist before any version can be recorded.
    initialize_versioning_db()

    # Example usage: version the cleaned Kaggle dataset under a tag derived
    # from the current local time, e.g. v20240131_235959.
    timestamp_tag = datetime.now().strftime("v%Y%m%d_%H%M%S")
    commit_dataset_version("Kaggle_cleaned.csv", timestamp_tag)
