In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ---------- REPRODUCIBILITY ----------
# Seed every random source this notebook draws from (the stdlib `random`
# module, NumPy's global generator and Faker's shared generator) so that a
# Restart Kernel -> Run All regenerates the same dataset.
# Relative dates such as "-2y"/"today" used further down still depend on the
# day the notebook is executed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

faker = Faker('en_IN')                   # fake-data provider using the en_IN locale
analyzer = SentimentIntensityAnalyzer()  # VADER scorer applied to review text

# ---------- PARAMETERS ----------
NUM_PRODUCTS = 2000            # product IDs run from 1 to NUM_PRODUCTS
NUM_CUSTOMERS = 50000          # customer IDs run from 1 to NUM_CUSTOMERS
NUM_SALES_PER_PRODUCT = 365    # size of the day-offset range that sale days are sampled from
CATEGORIES = ["Electronics", "Fashion", "Home Appliances", "Books", "Beauty", "Sports"]
BRANDS = ["BrandX", "BrandY", "BrandZ", "Alpha", "Beta", "Gamma"]
CHANNELS = ["Organic", "Paid", "Referral", "Social Media", "Email Marketing"]
# NOTE(review): DAILY_SALE_PROBABILITY is not referenced by any cell visible
# in this notebook - confirm whether it is still needed.
DAILY_SALE_PROBABILITY = 0.4  # 40% of days have sales

# ---------- 1. PRODUCTS ----------
# One record per product ID in 1..NUM_PRODUCTS. Each record gets a Faker
# catch phrase as its name, a random category and brand, a base price drawn
# uniformly from [200, 50000] (rounded to 2 decimals) and a launch date
# somewhere in the two years before the run date.
products = [
    {
        "ProductID": product_id,
        "ProductName": faker.catch_phrase(),
        "Category": random.choice(CATEGORIES),
        "Brand": random.choice(BRANDS),
        "BasePrice": round(random.uniform(200, 50000), 2),
        "LaunchDate": faker.date_between(start_date="-2y", end_date="today"),
    }
    for product_id in range(1, NUM_PRODUCTS + 1)
]

# Faker returns plain date objects; convert the column to pandas datetimes.
df_products = pd.DataFrame(products)
df_products = df_products.assign(LaunchDate=pd.to_datetime(df_products['LaunchDate']))

In [3]:
# ---------- 2. CUSTOMERS ----------
# One record per customer ID in 1..NUM_CUSTOMERS. City and state are drawn
# independently from Faker, so they are not guaranteed to belong together.
customers = [
    {
        "CustomerID": customer_id,
        "Name": faker.name(),
        "Gender": random.choice(["Male", "Female"]),
        "City": faker.city(),
        "State": faker.state(),
        "Email": faker.email(),
        "RegistrationDate": faker.date_between(start_date="-3y", end_date="today"),
        "LoyaltyTier": random.choice(["Gold", "Silver", "Bronze"]),
    }
    for customer_id in range(1, NUM_CUSTOMERS + 1)
]

# Faker returns plain date objects; convert the column to pandas datetimes.
df_customers = pd.DataFrame(customers)
df_customers = df_customers.assign(
    RegistrationDate=pd.to_datetime(df_customers['RegistrationDate'])
)

In [5]:
# ---------- 3. SALES (Optimized & Capped at ~5 lakh) ----------
# Builds `sales`, a list of dict rows (one row per product per sampled sale
# day), and converts it to `df_sales` at the end. Generation stops as soon as
# MAX_SALES_ROWS rows exist.
# The order of the random draws below determines the generated data, so keep
# statement order intact when editing.
sales = []
# Sale dates are this date plus a day offset in range(NUM_SALES_PER_PRODUCT).
start_date = datetime(2024, 1, 1)

# Target size
MAX_SALES_ROWS = 500_000
rows_generated = 0

for _, product in df_products.iterrows():
    if rows_generated >= MAX_SALES_ROWS:
        break  # stop once we hit 5 lakh rows

    # Skip roughly 1% of products entirely so that they have no sales rows.
    if random.random() < 0.01:
        continue

    # Customers whose RegistrationDate is on or before this product's LaunchDate.
    # NOTE(review): sale dates come from the fixed `start_date` above and are
    # never compared with LaunchDate or RegistrationDate, so a row can be dated
    # before the product launched or before the buyer registered - confirm
    # whether that is intended.
    eligible_ids = df_customers.loc[
        df_customers['RegistrationDate'] <= product['LaunchDate'], 'CustomerID'
    ].values
    if eligible_ids.size == 0:
        continue

    # Randomly pick some sale days for this product
    # (between 50 and 200 distinct day offsets, processed in ascending order)
    sale_days = sorted(random.sample(range(NUM_SALES_PER_PRODUCT), k=random.randint(50, 200)))

    for day in sale_days:
        if rows_generated >= MAX_SALES_ROWS:
            break

        sale_date = start_date + timedelta(days=day)
        # October-December sales get a 1.5x volume multiplier.
        seasonality_factor = 1.5 if sale_date.month in [10, 11, 12] else 1.0
        # Discount is a fraction in [0, 0.3], rounded to 2 decimals.
        discount = round(random.uniform(0, 0.3), 2)
        # Competitor price is within +/-10% of the product's base price.
        competitor_price = product["BasePrice"] * random.uniform(0.9, 1.1)
        # Poisson(5) demand scaled by seasonality and discount, truncated to int.
        units_sold = int(np.random.poisson(5) * seasonality_factor * (1 + discount))
        # Up to 5% of units are returned; int() truncates, so this is 0 unless
        # units_sold times the drawn fraction reaches 1.
        returns = int(units_sold * random.uniform(0, 0.05))
        net_units = units_sold - returns

        # Outlier injection
        # Three independent draws, each firing with probability 0.0005:
        # a 50x volume spike, a negative unit count, and a zeroed revenue.
        if random.random() < 0.0005:
            net_units *= 50
        if random.random() < 0.0005:
            net_units = -abs(net_units)
        revenue = round(net_units * product["BasePrice"] * (1 - discount), 2)
        if random.random() < 0.0005:
            revenue = 0

        # "UnitsSold" stores the net figure (after returns and outlier
        # injection); one buyer is picked at random from the eligible IDs.
        sales.append({
            "ProductID": product["ProductID"],
            "CustomerID": random.choice(eligible_ids),
            "Date": sale_date,
            "UnitsSold": net_units,
            "Returns": returns,
            "Discount": discount,
            "OurPrice": round(product["BasePrice"] * (1 - discount), 2),
            "CompetitorPrice": round(competitor_price, 2),
            "Revenue": revenue,
            "AcquisitionChannel": random.choice(CHANNELS)
        })
        rows_generated += 1

df_sales = pd.DataFrame(sales)


In [6]:
# ---------- 4. REVIEWS ----------
# Builds `df_reviews`: for every product with sales, between 5% and 30% of its
# distinct purchasers leave exactly one review each, dated between that
# customer's first purchase of the product and the run date.
reviews = []
if not df_sales.empty:
    # ProductID -> array of distinct CustomerIDs that bought it
    product_purchasers = df_sales.groupby('ProductID')['CustomerID'].unique().to_dict()
    # (ProductID, CustomerID) -> earliest sale date for that pair
    purchase_dates = df_sales.groupby(['ProductID', 'CustomerID'])['Date'].min().to_dict()
else:
    # An empty sales frame has no columns to group by; fall back to no reviews.
    product_purchasers = {}
    purchase_dates = {}

for product_id, purchasers in product_purchasers.items():
    if len(purchasers) == 0:
        continue
    num_reviews = int(len(purchasers) * random.uniform(0.05, 0.3))

    # Sample reviewers WITHOUT replacement so a customer cannot review the same
    # product twice. num_reviews is at most 30% of the purchasers, so the
    # sample size never exceeds the population. list() is required because
    # random.sample only accepts sequences, not NumPy arrays.
    reviewers = random.sample(list(purchasers), num_reviews)

    for customer_id in reviewers:
        purchase_date = purchase_dates.get((product_id, customer_id), datetime(2024, 1, 1))
        review_date = faker.date_between(start_date=purchase_date, end_date="today")

        # NOTE(review): faker.sentence() presumably yields placeholder wording,
        # so VADER may score most reviews close to 0 - confirm.
        review_text = faker.sentence(nb_words=12)
        sentiment_score = analyzer.polarity_scores(review_text)["compound"]

        # Map the compound score linearly from [-1, 1] onto the 1-5 star scale
        # (-1 -> 1, 0 -> 3, +1 -> 5), add up to half a star of noise, round to
        # the nearest star and clamp. The previous formula scaled onto [0, 5]
        # and truncated with int(), which rated neutral text 2 stars and made
        # 5 stars reachable only for compound scores of 0.8 or more.
        noisy_stars = 3 + 2 * sentiment_score + random.uniform(-0.5, 0.5)
        rating = min(max(round(noisy_stars), 1), 5)

        reviews.append({
            "ProductID": product_id,
            "CustomerID": customer_id,
            "ReviewDate": review_date,
            "Rating": rating,
            "ReviewText": review_text,
            "SentimentScore": sentiment_score
        })
df_reviews = pd.DataFrame(reviews)

# ---------- 5. MISSING DATA ----------
# Blank out 1% of the rows in each listed column. Every column draws its own
# independent sample, so the missing cells do not line up across columns.
# Nothing is touched when there are no sales rows.
if not df_sales.empty:
    for col in ("Discount", "OurPrice", "CompetitorPrice", "AcquisitionChannel"):
        rows_to_blank = df_sales.sample(frac=0.01).index
        df_sales.loc[rows_to_blank, col] = np.nan



In [7]:
# ---------- SAVE ----------
# All four tables are written to the same folder as <label in lowercase>.csv,
# without the DataFrame index.
output_dir = "D:\\powerbi\\projects\\Upcoming Project\\MarketPulse 360  Advanced ECommerce Competitor, Pricing & Sentiment Analytics"
tables = {
    "Products": df_products,
    "Customers": df_customers,
    "Sales": df_sales,
    "Reviews": df_reviews,
}
for label, frame in tables.items():
    frame.to_csv(output_dir + "\\" + label.lower() + ".csv", index=False)

# ---------- SUMMARY ----------
# Row count per table, in the same order they were saved.
for label, frame in tables.items():
    print(f"{label}: {len(frame)} rows")

Products: 2000 rows
Customers: 50000 rows
Sales: 248471 rows
Reviews: 42719 rows


In [3]:
# ---------- 2. CUSTOMERS (State-City Mapping) ----------
# Rebuilds the customer table so that every customer's city belongs to their
# state: the state is drawn first, then a city from that state's list.
# The result replaces `df_customers` and overwrites customers.csv.
state_city_map = {
    "Andhra Pradesh": ["Visakhapatnam", "Vijayawada", "Guntur"],
    "Bihar": ["Patna", "Gaya", "Bhagalpur"],
    "Delhi": ["New Delhi", "Dwarka", "Rohini"],
    "Gujarat": ["Ahmedabad", "Surat", "Vadodara"],
    "Karnataka": ["Bengaluru", "Mysuru", "Mangaluru"],
    "Kerala": ["Kochi", "Thiruvananthapuram", "Kozhikode"],
    "Maharashtra": ["Mumbai", "Pune", "Nagpur"],
    "Punjab": ["Ludhiana", "Amritsar", "Jalandhar"],
    "Rajasthan": ["Jaipur", "Jodhpur", "Udaipur"],
    "Tamil Nadu": ["Chennai", "Coimbatore", "Madurai"],
    "Telangana": ["Hyderabad", "Warangal", "Nizamabad"],
    "Uttar Pradesh": ["Lucknow", "Gorakhpur", "Kanpur", "Varanasi"],
    "West Bengal": ["Kolkata", "Howrah", "Durgapur"],
}

# Iterating a dict yields its keys in insertion order.
states = list(state_city_map)

customers = []
for cid in range(1, NUM_CUSTOMERS + 1):
    state = random.choice(states)
    city = random.choice(state_city_map[state])
    record = {
        "CustomerID": cid,
        "Name": faker.name(),
        "Gender": random.choice(["Male", "Female"]),
        "City": city,
        "State": state,
        "Email": faker.email(),
        "RegistrationDate": faker.date_between(start_date="-3y", end_date="today"),
        "LoyaltyTier": random.choice(["Gold", "Silver", "Bronze"]),
    }
    customers.append(record)

# Faker returns plain date objects; convert the column to pandas datetimes.
df_customers = pd.DataFrame(customers)
df_customers = df_customers.assign(
    RegistrationDate=pd.to_datetime(df_customers['RegistrationDate'])
)

customers_csv_path = "D:\\powerbi\\projects\\Upcoming Project\\MarketPulse 360  Advanced ECommerce Competitor, Pricing & Sentiment Analytics\\customers.csv"
df_customers.to_csv(customers_csv_path, index=False)

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Seed both random sources used by this cell and the sentiment step that
# follows it (NumPy's global generator for the ratings, the stdlib `random`
# module for the sentiment scores) so that reviews_updated.csv can be
# regenerated identically from the same reviews.csv.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load your dataset
df = pd.read_csv("D:\\powerbi\\projects\\Upcoming Project\\MarketPulse 360  Advanced ECommerce Competitor, Pricing & Sentiment Analytics\\reviews.csv")

# NOTE(review): this Faker instance is not used by the code visible in this
# cell - confirm whether it is still needed.
faker = Faker()

# --- Update Ratings ---
# More realistic distribution (skewed towards positive but still some negatives)
# Every existing Rating is replaced by an independent draw from these weights;
# the weights are probabilities for 1..5 stars and must sum to 1.
rating_choices = [1, 2, 3, 4, 5]
rating_weights = [0.10, 0.15, 0.20, 0.30, 0.25]  # adjust to your needs
df['Rating'] = np.random.choice(rating_choices, size=len(df), p=rating_weights)

# --- Update Sentiment Score ---
# Simple mapping: generate sentiment aligned with rating
def generate_sentiment(rating):
    """Draw a random sentiment score from the band matching a star rating.

    Args:
        rating: Star rating; values equal to 1, 2, 3, 4 or 5 are recognised.

    Returns:
        A float rounded to 3 decimals, drawn uniformly from the band that
        belongs to `rating`, or None when `rating` matches none of the five
        star values.
    """
    # (stars, lower bound, upper bound), checked in this order
    bands = (
        (1, -0.8, -0.4),   # strong negative
        (2, -0.4, -0.1),   # mild negative
        (3, -0.1, 0.1),    # neutral / mixed
        (4, 0.1, 0.4),     # positive
        (5, 0.4, 0.8),     # strong positive
    )
    for stars, lower, upper in bands:
        if rating == stars:
            return round(random.uniform(lower, upper), 3)
    return None

# Replace SentimentScore with a value drawn from the band of each row's Rating.
sentiment_scores = df['Rating'].apply(generate_sentiment)
df['SentimentScore'] = sentiment_scores

# Save updated dataset
# (written next to the input as a new file; reviews.csv itself is not modified here)
updated_reviews_path = "D:\\powerbi\\projects\\Upcoming Project\\MarketPulse 360  Advanced ECommerce Competitor, Pricing & Sentiment Analytics\\reviews_updated.csv"
df.to_csv(updated_reviews_path, index=False)

completion_message = "✅ Updated reviews.csv with new Rating & SentimentScore columns saved as reviews_updated.csv"
print(completion_message)


✅ Updated reviews.csv with new Rating & SentimentScore columns saved as reviews_updated.csv
