In [5]:
from pathlib import Path
import pandas as pd
import numpy as np


# Location of the cleaned Amazon India transactions export.
file_path = Path("D:\\Code\\Project\\Amazon\\data\\amazon_india_cleaned_data.csv")

# Read the CSV and, in the same chain, swap every missing cell for None
# (the mask is True where a value is present, so only the gaps are replaced).
amazon_2015_2025 = pd.read_csv(file_path).pipe(
    lambda frame: frame.where(frame.notna(), None)
)

# Preview the first rows as the cell output.
amazon_2015_2025.head()


Unnamed: 0,transaction_id,order_date,customer_id,product_id,product_name,category,subcategory,brand,original_price_inr,discount_percent,...,return_status,order_month,order_year,order_quarter,product_weight_kg,is_prime_eligible,product_rating,is_duplicate,duplicate_type,price_outlier_IQR
0,TXN_2023_00063013,2023-07-23,CUST_2023_00018393,PROD_000454,Vivo Y95 64GB Black,electronics,Smartphones,Vivo,27340.84,21.57,...,Delivered,7,2023,3,0.2,True,3.5,False,,False
1,TXN_2021_00064486,2021-07-20,CUST_2015_00002865,PROD_000579,Realme Realme 3 128GB Black,electronics,Smartphones,Realme,32907.49,0.0,...,Delivered,7,2021,3,0.21,False,4.5,False,,False
2,TXN_2017_00065617,2017-11-16,CUST_2016_00004057,PROD_000295,Vivo V7 32GB Blue,electronics,Smartphones,Vivo,47052.18,21.91,...,Delivered,11,2017,4,0.24,True,4.3,False,,False
3,TXN_2020_00054393,2020-05-04,CUST_2020_00014574,PROD_001654,Alienware Pavilion 4GB RAM Silver,electronics,Laptops,Alienware,238725.44,59.6,...,Delivered,5,2020,2,1.85,True,3.6,False,,False
4,TXN_2018_00071646,2018-10-09,CUST_2018_00006275,PROD_000095,Motorola Moto X Play 16GB White,electronics,Smartphones,Motorola,25970.76,0.0,...,Delivered,10,2018,4,0.16,False,3.7,False,,False


In [6]:
import os
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
from urllib.parse import quote_plus

# Load environment variables (reads a .env file if one is present).
load_dotenv()

MYSQL_USER = os.getenv("MYSQL_USER")
MYSQL_PWD = os.getenv("MYSQL_PWD")
MYSQL_HOST = os.getenv("MYSQL_HOST")
MYSQL_PORT = os.getenv("MYSQL_PORT", "3306")
MYSQL_DB = os.getenv("MYSQL_DB")

# Fail fast with a readable message: an unset password would otherwise make
# quote_plus() raise a TypeError, and an unset user/host/database would be
# interpolated into the URL as the literal text "None".
_missing_settings = [
    name
    for name, value in (
        ("MYSQL_USER", MYSQL_USER),
        ("MYSQL_HOST", MYSQL_HOST),
        ("MYSQL_DB", MYSQL_DB),
    )
    if not value
]
if MYSQL_PWD is None:  # an empty password is allowed, a missing one is not
    _missing_settings.append("MYSQL_PWD")
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Engine WITHOUT database (used only to create the schema).
# Both user and password are URL-escaped so characters such as '@' or ':'
# cannot corrupt the connection URL.
engine_no_db = create_engine(
    f"mysql+pymysql://{quote_plus(MYSQL_USER)}:{quote_plus(MYSQL_PWD)}@"
    f"{MYSQL_HOST}:{MYSQL_PORT}/"
)

# Engine WITH database (used for the actual load). create_engine() is lazy,
# so building it before the database exists does not open a connection.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(MYSQL_USER)}:{quote_plus(MYSQL_PWD)}@"
    f"{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DB}"
)

try:
    # Create database if it does not exist. The name is backtick-quoted (with
    # embedded backticks doubled) so names containing hyphens or reserved
    # words are accepted by MySQL.
    quoted_db_name = "`" + MYSQL_DB.replace("`", "``") + "`"
    with engine_no_db.connect() as conn:
        conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {quoted_db_name}"))
        conn.commit()

    # Push DataFrame to MySQL, replacing any existing table of the same name.
    # Rows are sent in batches of 1000 using multi-row INSERT statements.
    amazon_2015_2025.to_sql(
        name="amazon_india_data_cleaned_2015_2025",
        con=engine,
        if_exists="replace",
        index=False,
        chunksize=1000,
        method="multi"
    )
finally:
    # Close engines even when the load fails, so pooled connections are released.
    engine_no_db.dispose()
    engine.dispose()

print("‚úÖ Data successfully written to MySQL")


‚úÖ Data successfully written to MySQL


In [8]:
from sqlalchemy import text

# One ALTER TABLE statement that redefines each listed column of the loaded
# table with an explicit MySQL type and an explicit NULL / NOT NULL setting.
# Columns not named here are left as they are.
alter_table_sql = """
ALTER TABLE amazon_india_data_cleaned_2015_2025

-- core identifiers (keep NOT NULL)
MODIFY transaction_id VARCHAR(64) NOT NULL,
MODIFY order_date DATETIME NOT NULL,
MODIFY customer_id VARCHAR(64) NOT NULL,
MODIFY product_id VARCHAR(64) NOT NULL,

-- product info
MODIFY product_name VARCHAR(255) NOT NULL,
MODIFY category VARCHAR(100) NOT NULL,
MODIFY subcategory VARCHAR(100) NOT NULL,
MODIFY brand VARCHAR(100) NULL,

-- pricing
MODIFY original_price_inr DECIMAL(10,2) NOT NULL,
MODIFY discount_percent DECIMAL(5,2) NULL,
MODIFY discounted_price_inr DECIMAL(10,2) NOT NULL,
MODIFY quantity INT NOT NULL,
MODIFY subtotal_inr DECIMAL(12,2) NOT NULL,
MODIFY delivery_charges DECIMAL(8,2) NULL,
MODIFY final_amount_inr DECIMAL(12,2) NOT NULL,

-- customer info
MODIFY customer_city VARCHAR(100) NOT NULL,
MODIFY customer_state VARCHAR(100) NOT NULL,
MODIFY customer_tier VARCHAR(50) NOT NULL,
MODIFY customer_spending_tier VARCHAR(50) NOT NULL,
MODIFY customer_age_group VARCHAR(50) NULL,

-- logistics
MODIFY payment_method VARCHAR(50) NOT NULL,
MODIFY delivery_days VARCHAR(50) NULL,
MODIFY delivery_type VARCHAR(50) NOT NULL,

-- flags
MODIFY is_prime_member TINYINT(1) NOT NULL,
MODIFY is_festival_sale TINYINT(1) NOT NULL,
MODIFY festival_name VARCHAR(100) NULL,

-- ratings & returns
MODIFY customer_rating DECIMAL(3,2) NULL,
MODIFY return_status VARCHAR(50) NULL,

-- date parts
MODIFY order_month TINYINT NOT NULL,
MODIFY order_year SMALLINT NOT NULL,
MODIFY order_quarter TINYINT NOT NULL,

-- product attributes
MODIFY product_weight_kg DECIMAL(8,3) NULL,
MODIFY is_prime_eligible TINYINT(1) NOT NULL,
MODIFY product_rating DECIMAL(3,2) NULL;
"""

# Send the statement through a transaction block opened on the shared engine.
with engine.begin() as schema_conn:
    schema_conn.execute(text(alter_table_sql))

print("‚úÖ NOT NULL constraints safely removed where appropriate")


‚úÖ NOT NULL constraints safely removed where appropriate


In [10]:
from sqlalchemy import text

# Make transaction_id the table's primary key.
#
# MySQL rejects ADD PRIMARY KEY when the table already has one, which made
# this cell fail on a second run. The current primary-key columns are looked
# up first and the ALTER is skipped only when transaction_id is already the
# (sole) key. If a different key exists the ALTER still runs, so MySQL's error
# surfaces exactly as it did before.
with engine.begin() as conn:
    existing_pk_columns = [
        row[0]
        for row in conn.execute(text("""
            SELECT COLUMN_NAME
            FROM information_schema.KEY_COLUMN_USAGE
            WHERE TABLE_SCHEMA = DATABASE()
              AND TABLE_NAME = 'amazon_india_data_cleaned_2015_2025'
              AND CONSTRAINT_NAME = 'PRIMARY'
            ORDER BY ORDINAL_POSITION
        """))
    ]

    if existing_pk_columns != ["transaction_id"]:
        conn.execute(text("""
        ALTER TABLE amazon_india_data_cleaned_2015_2025
        ADD PRIMARY KEY (transaction_id);
    """))

print("‚úÖ transaction_id set as PRIMARY KEY")


‚úÖ transaction_id set as PRIMARY KEY


In [17]:
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import os

# ===============================
# 1. MySQL connection
# ===============================
MYSQL_USER = os.getenv("MYSQL_USER")
MYSQL_PWD = os.getenv("MYSQL_PWD")
MYSQL_HOST = os.getenv("MYSQL_HOST")
MYSQL_PORT = os.getenv("MYSQL_PORT", "3306")
MYSQL_DB = os.getenv("MYSQL_DB")

# Report unset settings clearly instead of letting quote_plus(None) raise a
# TypeError or letting "None" leak into the connection URL.
_missing_settings = [
    name
    for name, value in (
        ("MYSQL_USER", MYSQL_USER),
        ("MYSQL_HOST", MYSQL_HOST),
        ("MYSQL_DB", MYSQL_DB),
    )
    if not value
]
if MYSQL_PWD is None:  # an empty password is allowed, a missing one is not
    _missing_settings.append("MYSQL_PWD")
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# User and password are both URL-escaped so special characters are safe.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(MYSQL_USER)}:{quote_plus(MYSQL_PWD)}@{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DB}"
)

# ===============================
# 2. CSV file path
# ===============================
csv_file_path = "D:\\Code\\Project\\Amazon\\data\\amazon_india_cleaned_data.csv"  # <-- change path

# ===============================
# 3. Load transaction_id column from CSV
# ===============================
df_csv_ids = pd.read_csv(csv_file_path, usecols=["transaction_id"])
csv_ids_set = set(df_csv_ids["transaction_id"].dropna())

# ===============================
# 4. Load transaction_id column from MySQL
# ===============================
table_name = "amazon_india_data_cleaned_2015_2025"
try:
    df_mysql_ids = pd.read_sql(f"SELECT transaction_id FROM {table_name}", con=engine)
finally:
    # Release pooled connections whether or not the query succeeded.
    engine.dispose()
mysql_ids_set = set(df_mysql_ids["transaction_id"].dropna())

# ===============================
# 5. Compare
# ===============================
missing_in_mysql = csv_ids_set - mysql_ids_set
missing_in_csv = mysql_ids_set - csv_ids_set

# Sets collapse duplicates and dropna() hides null ids, so equal sets alone do
# not prove that every row was loaded; the raw row counts must agree as well.
csv_row_count = len(df_csv_ids)
mysql_row_count = len(df_mysql_ids)
row_counts_match = csv_row_count == mysql_row_count

if not missing_in_mysql and not missing_in_csv and row_counts_match:
    print("‚úÖ All transaction_id values match between CSV and MySQL!")
else:
    print("‚ùå Mismatch found in transaction_id values!")

    # Samples are sorted so the same ten ids are shown on every run.
    if missing_in_mysql:
        print(f"üö® transaction_id present in CSV but missing in MySQL: {sorted(missing_in_mysql, key=str)[:10]} ...")
    if missing_in_csv:
        print(f"üö® transaction_id present in MySQL but missing in CSV: {sorted(missing_in_csv, key=str)[:10]} ...")
    if not row_counts_match:
        print(f"Row counts differ: CSV has {csv_row_count} rows, MySQL has {mysql_row_count} rows")



‚úÖ All transaction_id values match between CSV and MySQL!


In [None]:
# Disabled cell: everything below is wrapped in a single triple-quoted string,
# so none of it executes; running the cell would only evaluate the string.
# The text inside is a parameterised variant of a CSV-vs-MySQL comparison:
# CSV_FILE_PATH, TABLE_NAME and COLUMN_NAME select the file, table and column,
# the column is read from both sources into sets, and the sets are compared.
# NOTE(review): looks like leftover scratch code — confirm whether it should
# be deleted or promoted to a reusable function.
'''import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import os

# ===============================
# 1Ô∏è‚É£ User options (change these)
# ===============================
CSV_FILE_PATH = "D:\\Code\\Project\\Amazon\\data\\amazon_india_cleaned_data.csv"  # <-- change CSV path
TABLE_NAME = "amazon_india_data_cleaned_2015_2025"  # <-- MySQL table name
COLUMN_NAME = "transaction_id"  # <-- column to check

# ===============================
# 2Ô∏è‚É£ MySQL connection
# ===============================
MYSQL_USER = os.getenv("MYSQL_USER")
MYSQL_PWD = os.getenv("MYSQL_PWD")
MYSQL_HOST = os.getenv("MYSQL_HOST")
MYSQL_PORT = os.getenv("MYSQL_PORT", "3306")
MYSQL_DB = os.getenv("MYSQL_DB")

engine = create_engine(
    f"mysql+pymysql://{MYSQL_USER}:{quote_plus(MYSQL_PWD)}@{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DB}"
)

# ===============================
# 3Ô∏è‚É£ Load column from CSV
# ===============================
df_csv = pd.read_csv(CSV_FILE_PATH, usecols=[COLUMN_NAME])
csv_values_set = set(df_csv[COLUMN_NAME].dropna())

# ===============================
# 4Ô∏è‚É£ Load column from MySQL
# ===============================
df_mysql = pd.read_sql(f"SELECT {COLUMN_NAME} FROM {TABLE_NAME}", con=engine)
mysql_values_set = set(df_mysql[COLUMN_NAME].dropna())

# ===============================
# 5Ô∏è‚É£ Compare sets
# ===============================
if csv_values_set == mysql_values_set:
    print(f"‚úÖ All {COLUMN_NAME} values match between CSV and MySQL!")
else:
    print(f"‚ùå Mismatch found in {COLUMN_NAME} values!")

    missing_in_mysql = csv_values_set - mysql_values_set
    missing_in_csv = mysql_values_set - csv_values_set

    if missing_in_mysql:
        print(f"üö® Present in CSV but missing in MySQL: {list(missing_in_mysql)[:10]} ...")
    if missing_in_csv:
        print(f"üö® Present in MySQL but missing in CSV: {list(missing_in_csv)[:10]} ...")'''
