In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load datasets
def load_data(
    customers_path="Customers.csv",
    products_path="Products.csv",
    transactions_path="Transactions.csv",
):
    """Read the customer, product and transaction CSV files into DataFrames.

    Parameters
    ----------
    customers_path, products_path, transactions_path : str or path-like
        Locations of the three CSV files. The defaults are the file names
        looked up relative to the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(customers, products, transactions)``, in that order.
    """
    customers = pd.read_csv(customers_path)
    products = pd.read_csv(products_path)
    transactions = pd.read_csv(transactions_path)
    return customers, products, transactions

In [None]:
# Data Cleaning
def clean_data(customers, products, transactions):
    """Parse the date columns and fill missing values, leaving the inputs untouched.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain a ``SignupDate`` column.
    products : pandas.DataFrame
        Product table; every missing value is replaced with ``"Unknown"``.
    transactions : pandas.DataFrame
        Must contain a ``TransactionDate`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        Cleaned copies ``(customers, products, transactions)``.
    """
    # Work on copies so re-running the cell always starts from the same raw frames.
    customers = customers.copy()
    products = products.copy()
    transactions = transactions.copy()

    # Convert date columns to datetime
    customers["SignupDate"] = pd.to_datetime(customers["SignupDate"])
    transactions["TransactionDate"] = pd.to_datetime(transactions["TransactionDate"])

    # Fill missing values everywhere except the date columns: replacing NaT
    # with "Unknown" or 0 would put a non-date value in the column and can turn
    # it into object dtype, which breaks later `.dt` access.
    customers = customers.fillna(
        {col: "Unknown" for col in customers.columns if col != "SignupDate"}
    )
    products = products.fillna("Unknown")
    transactions = transactions.fillna(
        {col: 0 for col in transactions.columns if col != "TransactionDate"}
    )

    return customers, products, transactions

# Load and clean the data here so `customers`, `products` and `transactions`
# exist before the following cells use them on a fresh kernel.
customers, products, transactions = clean_data(*load_data())

print(customers)

#print(f"\033[1mcustomers\033[0m \n{customers}, \n \033[1mproducts\033[0m \n{products}, \n \033[1mtransactions\033[0m \n{transactions}")


In [None]:
# EDA Functions
def explore_customers(customers):
    """Print an overview of the customer table and plot the customer count per region.

    Parameters
    ----------
    customers : pandas.DataFrame
        Customer table; must contain a ``Region`` column.
    """
    print("\033[1mCustomer Data Overview:\033[0m")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    customers.info()
    print(customers.describe(include='all'))

    region_counts = customers["Region"].value_counts()
    print(region_counts)

    plt.figure(figsize=(8, 5))
    # Bars are ordered from the most to the least frequent region.
    sns.countplot(data=customers, x="Region", order=region_counts.index, palette="viridis")
    plt.title("Customer Distribution by Region")
    plt.xticks(rotation=45)
    plt.show()

# Silence every warning category from this point on in the session.
import warnings
warnings.simplefilter('ignore')

# Show the customer overview and the region plot.
explore_customers(customers)

In [None]:
def explore_products(products):
    """Print an overview of the product table and plot the product count per category.

    Parameters
    ----------
    products : pandas.DataFrame
        Product table; must contain a ``Category`` column.
    """
    print("\033[1mProduct Data Overview:\033[0m")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    products.info()
    print(products.describe(include='all'))

    category_counts = products["Category"].value_counts()
    print(category_counts)

    plt.figure(figsize=(8, 5))
    # Bars are ordered from the most to the least frequent category.
    sns.countplot(data=products, x="Category", order=category_counts.index, palette="coolwarm")
    plt.title("Product Distribution by Category")
    plt.xticks(rotation=45)
    plt.show()

# Show the product overview and category plot. `products` must already be
# loaded (load_data / clean_data) before this cell runs.
explore_products(products)

In [None]:
def explore_transactions(transactions):
    """Print an overview of the transactions, plot monthly sales and list the top products.

    The monthly trend figure is also written to ``monthly_sales_trend.png``.
    The input frame is not modified.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain ``TransactionDate`` (datetime dtype, since ``.dt`` is
        used on it), ``TotalValue`` and ``ProductID`` columns.
    """
    print("\033[1mTransaction Data Overview:\033[0m")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    transactions.info()
    print(transactions.describe())

    # Temporal Analysis
    # The month label is attached to a temporary copy (assign) so the caller's
    # frame does not silently gain a "MonthYear" column.
    month_labels = transactions["TransactionDate"].dt.to_period("M").astype(str)
    monthly_sales = (
        transactions.assign(MonthYear=month_labels)
        .groupby("MonthYear")["TotalValue"]
        .sum()
        .reset_index()
    )

    plt.figure(figsize=(12, 6))
    sns.lineplot(data=monthly_sales, x="MonthYear", y="TotalValue", color="green")
    plt.title("Monthly Sales Trend")
    plt.xticks(rotation=45)
    plt.savefig("monthly_sales_trend.png")
    plt.show()

    # Top Products
    top_products = (
        transactions.groupby("ProductID")["TotalValue"]
        .sum()
        .sort_values(ascending=False)
        .head(10)
    )
    print("Top 10 Products by Sales:")
    print(top_products)

    
# Show the transaction overview, monthly trend and top products. `transactions`
# must already be loaded and cleaned (load_data / clean_data) before this cell runs.
explore_transactions(transactions)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import numpy as np

# Load datasets
def load_data(
    customers_path="Customers.csv",
    products_path="Products.csv",
    transactions_path="Transactions.csv",
):
    """Read the customer, product and transaction CSV files into DataFrames.

    Parameters
    ----------
    customers_path, products_path, transactions_path : str or path-like
        Locations of the three CSV files. The defaults are the file names
        looked up relative to the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(customers, products, transactions)``, in that order.
    """
    customers = pd.read_csv(customers_path)
    products = pd.read_csv(products_path)
    transactions = pd.read_csv(transactions_path)
    return customers, products, transactions

# Data Cleaning
def clean_data(customers, products, transactions):
    """Parse the date columns and fill missing values, leaving the inputs untouched.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain a ``SignupDate`` column.
    products : pandas.DataFrame
        Product table; every missing value is replaced with ``"Unknown"``.
    transactions : pandas.DataFrame
        Must contain a ``TransactionDate`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        Cleaned copies ``(customers, products, transactions)``.
    """
    # Work on copies so re-running the cell always starts from the same raw frames.
    customers = customers.copy()
    products = products.copy()
    transactions = transactions.copy()

    # Convert date columns to datetime
    customers["SignupDate"] = pd.to_datetime(customers["SignupDate"])
    transactions["TransactionDate"] = pd.to_datetime(transactions["TransactionDate"])

    # Fill missing values everywhere except the date columns: replacing NaT
    # with "Unknown" or 0 would put a non-date value in the column and can turn
    # it into object dtype, which breaks later `.dt` access.
    customers = customers.fillna(
        {col: "Unknown" for col in customers.columns if col != "SignupDate"}
    )
    products = products.fillna("Unknown")
    transactions = transactions.fillna(
        {col: 0 for col in transactions.columns if col != "TransactionDate"}
    )

    return customers, products, transactions

# EDA Functions
def explore_customers(customers):
    """Print an overview of the customer table and plot the customer count per region.

    The figure is also written to ``customer_distribution_by_region.png``.

    Parameters
    ----------
    customers : pandas.DataFrame
        Customer table; must contain a ``Region`` column.
    """
    print("Customer Data Overview:")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    customers.info()
    print(customers.describe(include='all'))

    region_counts = customers["Region"].value_counts()
    print(region_counts)

    plt.figure(figsize=(8, 5))
    # Bars are ordered from the most to the least frequent region.
    sns.countplot(data=customers, x="Region", order=region_counts.index, palette="viridis")
    plt.title("Customer Distribution by Region")
    plt.xticks(rotation=45)
    plt.savefig("customer_distribution_by_region.png")
    plt.show()

def explore_products(products):
    """Print an overview of the product table and plot the product count per category.

    The figure is also written to ``product_distribution_by_category.png``.

    Parameters
    ----------
    products : pandas.DataFrame
        Product table; must contain a ``Category`` column.
    """
    print("Product Data Overview:")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    products.info()
    print(products.describe(include='all'))

    category_counts = products["Category"].value_counts()
    print(category_counts)

    plt.figure(figsize=(8, 5))
    # Bars are ordered from the most to the least frequent category.
    sns.countplot(data=products, x="Category", order=category_counts.index, palette="coolwarm")
    plt.title("Product Distribution by Category")
    plt.xticks(rotation=45)
    plt.savefig("product_distribution_by_category.png")
    plt.show()

def explore_transactions(transactions):
    """Print an overview of the transactions, plot monthly sales and list the top products.

    The monthly trend figure is also written to ``monthly_sales_trend.png``.
    The input frame is not modified.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain ``TransactionDate`` (anything ``pd.to_datetime`` accepts),
        ``TotalValue`` and ``ProductID`` columns.
    """
    print("Transaction Data Overview:")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    transactions.info()
    print(transactions.describe())

    # Temporal Analysis
    # Month labels are plain strings (e.g. "2024-01") so the plotting call gets
    # ordinary labels rather than Period objects.
    # NOTE(review): Period values on a seaborn/matplotlib axis are presumed to be
    # unsupported here — confirm against the installed versions.
    month_labels = (
        pd.to_datetime(transactions["TransactionDate"]).dt.to_period("M").astype(str)
    )
    # The label is attached to a temporary copy (assign) so the caller's frame
    # does not silently gain a "MonthYear" column.
    monthly_sales = (
        transactions.assign(MonthYear=month_labels)
        .groupby("MonthYear")["TotalValue"]
        .sum()
        .reset_index()
    )

    plt.figure(figsize=(12, 6))
    sns.lineplot(data=monthly_sales, x="MonthYear", y="TotalValue", color="green")
    plt.title("Monthly Sales Trend")
    plt.xticks(rotation=45)
    plt.savefig("monthly_sales_trend.png")
    plt.show()

    # Top Products
    top_products = (
        transactions.groupby("ProductID")["TotalValue"]
        .sum()
        .sort_values(ascending=False)
        .head(10)
    )
    print("Top 10 Products by Sales:")
    print(top_products)

# Lookalike Model
def build_lookalike_model(customers, transactions, top_n=3, n_customers=20,
                          output_path="Lookalike.csv"):
    """Write the most similar customers for the first ``n_customers`` customers to a CSV file.

    Each customer is described by total spend (``TotalValue``) and total
    ``Quantity`` summed over their transactions; customers without
    transactions get zeros. Similarity is the cosine similarity of those
    two-feature vectors.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain a ``CustomerID`` column.
    transactions : pandas.DataFrame
        Must contain ``CustomerID``, ``TotalValue`` and ``Quantity`` columns.
    top_n : int, default 3
        Number of lookalikes listed per customer.
    n_customers : int, default 20
        Number of customers (taken from the top of ``customers``) to report on.
    output_path : str or path-like, default "Lookalike.csv"
        Destination CSV with columns ``CustomerID``, ``LookalikeID``, ``Score``.
    """
    # Combine customer and transaction data
    customer_transactions = (
        transactions.groupby("CustomerID")
        .agg({"TotalValue": "sum", "Quantity": "sum"})
        .reset_index()
    )
    customer_profiles = pd.merge(
        customers, customer_transactions, on="CustomerID", how="left"
    ).fillna(0)

    # Calculate similarity using cosine similarity
    features = ["TotalValue", "Quantity"]
    similarity_matrix = cosine_similarity(customer_profiles[features])
    customer_ids = customer_profiles["CustomerID"].to_numpy()

    # Only the customers that are written out need to be ranked.
    output = []
    for i, customer_id in enumerate(customer_ids[:n_customers]):
        scores = similarity_matrix[i]
        # Descending order; the stable sort keeps tied scores in row order so
        # the result is deterministic.
        ranked = np.argsort(-scores, kind="stable")
        # Drop the customer's own row explicitly. Assuming it sits at rank 0 is
        # wrong when other customers tie with it, or when the customer has a
        # zero vector (self-similarity 0).
        ranked = ranked[ranked != i][:top_n]
        for j in ranked:
            output.append({
                "CustomerID": customer_id,
                "LookalikeID": customer_ids[j],
                "Score": round(scores[j], 12),
            })

    # Convert to DataFrame (explicit columns keep the header even with no rows)
    lookalike_df = pd.DataFrame(output, columns=["CustomerID", "LookalikeID", "Score"])

    # Save to CSV in a clean format
    lookalike_df.to_csv(output_path, index=False)
    print(f"Lookalike recommendations saved to {output_path}.")

# Customer Segmentation
def perform_clustering(customers, transactions, n_clusters=4, random_state=42):
    """Cluster customers with KMeans, print the Davies-Bouldin index and plot the clusters.

    Customers are described by total spend (``TotalValue``) and total
    ``Quantity`` summed over their transactions; customers without
    transactions get zeros. The scatter plot is also written to
    ``customer_clusters.png``.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain a ``CustomerID`` column.
    transactions : pandas.DataFrame
        Must contain ``CustomerID``, ``TotalValue`` and ``Quantity`` columns.
    n_clusters : int, default 4
        Number of KMeans clusters.
    random_state : int, default 42
        Seed passed to KMeans.
    """
    # Combine customer and transaction data
    customer_transactions = (
        transactions.groupby("CustomerID")
        .agg({"TotalValue": "sum", "Quantity": "sum"})
        .reset_index()
    )
    customer_profiles = pd.merge(
        customers, customer_transactions, on="CustomerID", how="left"
    ).fillna(0)

    # Use KMeans for clustering
    features = ["TotalValue", "Quantity"]
    # n_init is set explicitly: scikit-learn changed its default between
    # releases, so relying on it makes the clusters (and the index below)
    # depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    customer_profiles["Cluster"] = kmeans.fit_predict(customer_profiles[features])

    # Calculate Davies-Bouldin Index
    db_index = davies_bouldin_score(customer_profiles[features], customer_profiles["Cluster"])
    print(f"Davies-Bouldin Index: {db_index}")

    # Visualize clusters
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=customer_profiles, x="TotalValue", y="Quantity", hue="Cluster", palette="tab10")
    plt.title("Customer Clusters")
    plt.savefig("customer_clusters.png")
    plt.show()

# ---- Pipeline driver ----
# Read the three CSV files and clean them in one step.
customers, products, transactions = clean_data(*load_data())

# Task 1: exploratory overview of each table
print("\n--- Exploring Customers ---\n")
explore_customers(customers)
print("\n--- Exploring Products ---\n")
explore_products(products)
print("\n--- Exploring Transactions ---\n")
explore_transactions(transactions)

# Task 2: lookalike recommendations
print("\n--- Building Lookalike Model ---\n")
build_lookalike_model(customers, transactions)

# Task 3: customer segmentation
print("\n--- Performing Customer Segmentation ---\n")
perform_clustering(customers, transactions)
