## Combined multiple data set into single dataset

In [3]:
import pandas as pd
import os

# --- Folder containing your CSVs ---
folder_path = r"C:\Users\desik\Desktop\cleaned output"

# --- List of years to combine ---
years = range(2015, 2026)

# --- Build full file paths ---
csv_files = [os.path.join(folder_path, f"amazon_india_{year}_cleaned.csv") for year in years]

# --- Read and combine all CSVs ---
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# --- Save to a single CSV file ---
output_path = os.path.join(folder_path, "amazon_india_combined_2015_2025.csv")
combined_df.to_csv(output_path, index=False)

print(f"✅ Combined CSV saved to: {output_path}")

✅ Combined CSV saved to: C:\Users\desik\Desktop\cleaned output\amazon_india_combined_2015_2025.csv


In [7]:
import pandas as pd
import mysql.connector
import os

# ✅ Load Combined Dataset
file_path = r"C:\Users\desik\Desktop\cleaned output\amazon_india_combined_2015_2025.csv"
df = pd.read_csv(file_path)
print("✅ Combined dataset loaded:", df.shape)

# ✅ Clean column names for MySQL compatibility
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
    .str.replace("(", "")
    .str.replace(")", "")
)

# ✅ Connect to MySQL
conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password="Revdesi@2302",
    database="amazon_db",
    allow_local_infile=True
)
cursor = conn.cursor()
cursor.execute("SET GLOBAL local_infile = 1;")

# ✅ Create Table Automatically
table_name = "amazon_transactions"

# Generate CREATE TABLE dynamically
columns_with_types = []
for col in df.columns:
    # Simple inference
    dtype = df[col].dtype
    if pd.api.types.is_integer_dtype(dtype):
        mysql_type = "INT"
    elif pd.api.types.is_float_dtype(dtype):
        mysql_type = "FLOAT"
    elif pd.api.types.is_bool_dtype(dtype):
        mysql_type = "BOOLEAN"
    else:
        mysql_type = "VARCHAR(255)"
    columns_with_types.append(f"`{col}` {mysql_type}")

create_table_query = f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    {', '.join(columns_with_types)}
);
"""

cursor.execute(create_table_query)
print(f"✅ MySQL table `{table_name}` created or verified.")

# ✅ Save temp CSV for fast MySQL import
temp_csv_path = os.path.join(os.getcwd(), "temp_combined.csv")
df.to_csv(temp_csv_path, index=False)

# ✅ Load data using LOCAL INFILE
load_query = f"""
LOAD DATA LOCAL INFILE '{temp_csv_path.replace("\\", "/")}'
INTO TABLE {table_name}
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;
"""

cursor.execute(load_query)
conn.commit()
print(f"🎉 Data successfully loaded into `{table_name}` ({len(df):,} rows).")

# ✅ Create essential indexes for faster queries
indexes = [
    ("idx_order_date", "order_date"),
    ("idx_customer_id", "customer_id"),
    ("idx_product_id", "product_id"),
    ("idx_payment_method", "payment_method"),
    ("idx_category", "category")
]

for idx_name, col_name in indexes:
    try:
        cursor.execute(f"CREATE INDEX {idx_name} ON {table_name}({col_name});")
        print(f"✅ Index created on {col_name}")
    except:
        print(f"⚠️ Index already exists for {col_name}")

# ✅ Cleanup
os.remove(temp_csv_path)
cursor.close()
conn.close()
print("✅ MySQL connection closed.")


✅ Combined dataset loaded: (1127567, 38)
✅ MySQL table `amazon_transactions` created or verified.
🎉 Data successfully loaded into `amazon_transactions` (1,127,567 rows).
✅ Index created on order_date
✅ Index created on customer_id
✅ Index created on product_id
✅ Index created on payment_method
✅ Index created on category
✅ MySQL connection closed.


In [17]:
import mysql.connector
import pandas as pd
import os

# --- CONFIGURATION ---
DB_CONFIG = {
    "host": "localhost",
    "user": "root",
    "password": "Revdesi@2302",
    "database": "amazonsales_db",
    "allow_local_infile": True,
    "ssl_disabled": True  # ✅ Disable SSL for local connections
}

CSV_PATH = r"C:\Users\desik\Desktop\cleaned output\amazon_india_combined_2015_2025.csv"

# --- CONNECT TO MYSQL ---
print("🔗 Connecting to MySQL...")
conn = mysql.connector.connect(**DB_CONFIG)
cursor = conn.cursor()

# Enable local file import and adjust server settings
cursor.execute("SET GLOBAL local_infile = 1;")
cursor.execute("SET SESSION sql_mode = '';")
cursor.execute("SET GLOBAL max_allowed_packet = 512*1024*1024;")
cursor.execute("SET GLOBAL net_buffer_length = 1048576;")

# --- READ CSV ---
print("📥 Reading CSV data...")
df = pd.read_csv(CSV_PATH, encoding="utf-8")
df.columns = df.columns.str.strip()
df["order_date"] = pd.to_datetime(df["order_date"], errors="coerce")
df = df.dropna(subset=["order_date"])
print(f"✅ Loaded {len(df):,} valid rows from CSV.")

# --- DROP OLD TABLES ---
print("🧹 Dropping existing tables if any...")
cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
for tbl in ["transactions", "customers", "products", "time_dimension"]:
    cursor.execute(f"DROP TABLE IF EXISTS {tbl};")
cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")
conn.commit()

# --- CREATE TABLES ---
print("🧱 Creating tables...")

cursor.execute("""
    CREATE TABLE customers (
        customer_id VARCHAR(100) PRIMARY KEY,
        customer_city VARCHAR(100),
        customer_state VARCHAR(100),
        customer_tier VARCHAR(50),
        customer_spending_tier VARCHAR(50),
        customer_age_group VARCHAR(50)
    ) ENGINE=InnoDB;
""")

cursor.execute("""
    CREATE TABLE products (
        product_id VARCHAR(100) PRIMARY KEY,
        product_name TEXT,
        category VARCHAR(255),
        subcategory VARCHAR(255),
        brand VARCHAR(255),
        product_weight_kg DECIMAL(10,3),
        is_prime_eligible TINYINT(1),
        product_rating DECIMAL(3,2)
    ) ENGINE=InnoDB;
""")

cursor.execute("""
    CREATE TABLE time_dimension (
        date DATE PRIMARY KEY,
        year INT,
        month INT,
        day INT,
        quarter INT,
        week INT,
        day_of_week VARCHAR(20),
        is_weekend TINYINT(1),
        is_holiday TINYINT(1),
        holiday_name VARCHAR(255)
    ) ENGINE=InnoDB;
""")

cursor.execute("""
    CREATE TABLE transactions (
        transaction_id VARCHAR(100) PRIMARY KEY,
        order_date DATE,
        customer_id VARCHAR(100),
        product_id VARCHAR(100),
        original_price_inr DECIMAL(10,2),
        discount_percent DECIMAL(5,2),
        discounted_price_inr DECIMAL(10,2),
        quantity INT,
        subtotal_inr DECIMAL(10,2),
        delivery_charges DECIMAL(10,2),
        final_amount_inr DECIMAL(10,2),
        payment_method VARCHAR(100),
        delivery_days INT,
        delivery_type VARCHAR(100),
        is_prime_member TINYINT(1),
        is_festival_sale TINYINT(1),
        festival_name VARCHAR(255),
        customer_rating DECIMAL(3,2),
        return_status VARCHAR(100),
        order_month INT,
        order_year INT,
        order_quarter INT,
        dup_key VARCHAR(255),
        dup_count INT,
        dup_status VARCHAR(100),
        flag_for_review TINYINT(1),
        FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
            ON DELETE SET NULL ON UPDATE CASCADE,
        FOREIGN KEY (product_id) REFERENCES products(product_id)
            ON DELETE SET NULL ON UPDATE CASCADE,
        FOREIGN KEY (order_date) REFERENCES time_dimension(date)
            ON DELETE SET NULL ON UPDATE CASCADE
    ) ENGINE=InnoDB;
""")

conn.commit()
print("✅ Tables created successfully.")

# --- PREPARE DIMENSIONS ---
print("🧩 Preparing dimension tables...")

customers = df[[
    "customer_id", "customer_city", "customer_state", "customer_tier",
    "customer_spending_tier", "customer_age_group"
]].drop_duplicates()

products = df[[
    "product_id", "product_name", "category", "subcategory", "brand",
    "product_weight_kg", "is_prime_eligible", "product_rating"
]].drop_duplicates()

time_dimension = (
    df[["order_date", "is_festival_sale", "festival_name"]]
    .drop_duplicates()
    .rename(columns={"order_date": "date"})
)
time_dimension["year"] = time_dimension["date"].dt.year
time_dimension["month"] = time_dimension["date"].dt.month
time_dimension["day"] = time_dimension["date"].dt.day
time_dimension["quarter"] = time_dimension["date"].dt.quarter
time_dimension["week"] = time_dimension["date"].dt.isocalendar().week
time_dimension["day_of_week"] = time_dimension["date"].dt.day_name()
time_dimension["is_weekend"] = time_dimension["day_of_week"].isin(["Saturday", "Sunday"]).astype(int)
time_dimension["is_holiday"] = time_dimension["is_festival_sale"].fillna(False).astype(int)
time_dimension["holiday_name"] = time_dimension["festival_name"].fillna("")
time_dimension = time_dimension.drop(columns=["is_festival_sale", "festival_name"])

transactions = df[[
    "transaction_id", "order_date", "customer_id", "product_id",
    "original_price_inr", "discount_percent", "discounted_price_inr",
    "quantity", "subtotal_inr", "delivery_charges", "final_amount_inr",
    "payment_method", "delivery_days", "delivery_type", "is_prime_member",
    "is_festival_sale", "festival_name", "customer_rating", "return_status",
    "order_month", "order_year", "order_quarter", "dup_key", "dup_count",
    "dup_status", "flag_for_review"
]].drop_duplicates()

# --- LOAD FUNCTION (optimized) ---
def load_table(df, table):
    temp_path = os.path.join(os.getcwd(), f"{table}_temp.csv")
    df.to_csv(temp_path, index=False, encoding="utf-8", lineterminator="\n")  # ✅ FIXED HERE
    path = os.path.abspath(temp_path).replace("\\", "/")
    try:
        query = f"""
        LOAD DATA LOCAL INFILE '{path}'
        INTO TABLE {table}
        FIELDS TERMINATED BY ',' 
        ENCLOSED BY '"'
        LINES TERMINATED BY '\n'
        IGNORE 1 ROWS;
        """
        cursor.execute(query)
        conn.commit()
        print(f"✅ Loaded {table}: {len(df):,} rows")
    except mysql.connector.Error as err:
        print(f"❌ Error loading {table}: {err}")
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

# --- LOAD ALL TABLES ---
print("🚀 Loading data into MySQL...")
load_table(customers, "customers")
load_table(products, "products")
load_table(time_dimension, "time_dimension")
load_table(transactions, "transactions")

# --- INDEXES ---
print("⚙️ Creating indexes...")
index_list = [
    ("transactions", "idx_order_date", "CREATE INDEX idx_order_date ON transactions(order_date)"),
    ("transactions", "idx_customer_id", "CREATE INDEX idx_customer_id ON transactions(customer_id)"),
    ("transactions", "idx_product_id", "CREATE INDEX idx_product_id ON transactions(product_id)"),
    ("products", "idx_category_subcategory", "CREATE INDEX idx_category_subcategory ON products(category, subcategory)"),
    ("customers", "idx_customer_state", "CREATE INDEX idx_customer_state ON customers(customer_state)"),
    ("time_dimension", "idx_year_month", "CREATE INDEX idx_year_month ON time_dimension(year, month)")
]

for table, name, query in index_list:
    try:
        cursor.execute(query)
        print(f"✅ Created index: {name}")
    except mysql.connector.Error as err:
        print(f"⚠️ {name} skipped ({err})")

conn.commit()
cursor.close()
conn.close()
print("🎉 Data successfully loaded into amazonsales_db with optimized schema and indexes.")




🔗 Connecting to MySQL...
📥 Reading CSV data...
✅ Loaded 1,127,567 valid rows from CSV.
🧹 Dropping existing tables if any...
🧱 Creating tables...
✅ Tables created successfully.
🧩 Preparing dimension tables...
🚀 Loading data into MySQL...
✅ Loaded customers: 459,885 rows
✅ Loaded products: 2,579 rows
✅ Loaded time_dimension: 4,828 rows
✅ Loaded transactions: 1,127,567 rows
⚙️ Creating indexes...
✅ Created index: idx_order_date
✅ Created index: idx_customer_id
✅ Created index: idx_product_id
✅ Created index: idx_category_subcategory
✅ Created index: idx_customer_state
✅ Created index: idx_year_month
🎉 Data successfully loaded into amazonsales_db with optimized schema and indexes.


In [1]:
import pandas as pd
import mysql.connector
import os

# --- MySQL Connection Configuration ---
# 🚨 IMPORTANT: Use the same credentials as your dashboard 🚨
MYSQL_CONFIG = {
    "host": "localhost",
    "user": "root", 
    "password": "Revdesi@2302", 
    "database": "amazonsales_db"
}

# --- Tables to Export ---
TABLES = [
    "transactions", 
    "products", 
    "customers", 
    "time_dimension"
]

OUTPUT_DIR = "data1"

def export_tables_to_csv():
    """Connects to MySQL, fetches data1 for specified tables, and saves them as CSV files."""
    
    print(f"Starting export process...")
    
    # 1. Create output directory if it doesn't exist
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"Created output directory: {OUTPUT_DIR}/")
    
    conn = None
    try:
        # 2. Establish the connection
        conn = mysql.connector.connect(**MYSQL_CONFIG)
        
        for table_name in TABLES:
            print(f"-> Exporting table: '{table_name}'...")
            query = f"SELECT * FROM `{table_name}`;"
            
            # 3. Read data into a pandas DataFrame
            df = pd.read_sql(query, conn)
            
            # 4. Save the DataFrame to a CSV file
            filepath = os.path.join(OUTPUT_DIR, f"{table_name}.csv")
            df.to_csv(filepath, index=False)
            
            print(f"   Successfully saved {len(df):,} rows to {filepath}")
        
        print("\n✅ Export complete!")
        
    except mysql.connector.Error as err:
        print(f"\n❌ Database Error: {err}. Please check your MySQL configuration.")
    except Exception as e:
        print(f"\n❌ An unexpected error occurred: {e}")
    finally:
        if conn and conn.is_connected():
            conn.close()
            print("MySQL connection closed.")

if __name__ == "__main__":
    export_tables_to_csv()

Starting export process...
Created output directory: data1/
-> Exporting table: 'transactions'...


  df = pd.read_sql(query, conn)


   Successfully saved 1,127,567 rows to data1\transactions.csv
-> Exporting table: 'products'...
   Successfully saved 2,004 rows to data1\products.csv
-> Exporting table: 'customers'...
   Successfully saved 354,968 rows to data1\customers.csv
-> Exporting table: 'time_dimension'...
   Successfully saved 4,015 rows to data1\time_dimension.csv

✅ Export complete!
MySQL connection closed.
