## Read Data Frame From RDS DB

In [None]:
!pip install pymysql
import pymysql
import boto3
import json
import pandas as pd
from IPython.display import display, HTML

def get_secret_name_from_ssm(ssm_param_name: str):
    """Read an SSM parameter and return its stored value.

    This notebook stores the identifier of the database secret in an SSM
    parameter; the returned value is passed on to ``get_secret``.

    Args:
        ssm_param_name: Name of the SSM parameter to read.

    Returns:
        The ``Value`` field of the parameter as returned by SSM.
    """
    ssm = boto3.client('ssm')
    parameter = ssm.get_parameter(Name=ssm_param_name)['Parameter']
    return parameter['Value']

def get_secret(secret_name, region_name):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier of the secret (passed as ``SecretId``).
        region_name: AWS region used to create the Secrets Manager client.

    Returns:
        The parsed JSON content of the secret's ``SecretString``.

    Raises:
        Exception: Any error raised while fetching or parsing the secret is
            printed and then re-raised unchanged.
    """
    secrets_manager = boto3.client('secretsmanager', region_name=region_name)

    try:
        # The secret payload is a JSON document stored as a string
        payload = secrets_manager.get_secret_value(SecretId=secret_name)
        return json.loads(payload['SecretString'])
    except Exception as e:
        print(f"Error retrieving secret: {e}")
        raise

def get_rds_endpoint(rds_instance_identifier):
    """Return the endpoint of the matching RDS DB cluster.

    Despite the parameter name, the lookup goes through
    ``describe_db_clusters``, so the value is used as a DB *cluster*
    identifier.

    Args:
        rds_instance_identifier: Identifier passed as ``DBClusterIdentifier``.

    Returns:
        The ``Endpoint`` of the first cluster in the response.
    """
    clusters = boto3.client('rds').describe_db_clusters(
        DBClusterIdentifier=rds_instance_identifier
    )['DBClusters']
    return clusters[0]['Endpoint']

def read_tables_from_database(rds_instance_identifier, db_name, username, password):
    """Connect to the RDS MySQL database and list all of its tables.

    Args:
        rds_instance_identifier: Identifier handed to ``get_rds_endpoint`` to
            resolve the database host.
        db_name: Name of the database to connect to.
        username: Database user name.
        password: Database password.

    Returns:
        A pandas DataFrame holding the rows returned by ``SHOW TABLES``.
    """
    # Reuse the shared endpoint lookup rather than repeating the RDS call here
    endpoint = get_rds_endpoint(rds_instance_identifier)

    # Connect to the RDS MySQL instance
    connection = pymysql.connect(
        host=endpoint,
        user=username,
        password=password,
        database=db_name,
        port=3306,
        cursorclass=pymysql.cursors.DictCursor  # Return rows as dictionaries
    )

    try:
        with connection.cursor() as cursor:
            cursor.execute("SHOW TABLES;")
            result = cursor.fetchall()
            # Column names come from the cursor description so the frame keeps
            # its header even when no rows are returned
            columns = [desc[0] for desc in cursor.description]
            return pd.DataFrame(result, columns=columns)
    finally:
        # Always release the connection, including when the query fails
        connection.close()

def read_data_from_rds_table(rds_instance_identifier, db_name, table_name, username, password):
    """Load every row of one table into a pandas DataFrame.

    Args:
        rds_instance_identifier: Identifier handed to ``get_rds_endpoint`` to
            resolve the database host.
        db_name: Name of the database to connect to.
        table_name: Table to read; inserted verbatim into the SQL statement.
        username: Database user name.
        password: Database password.

    Returns:
        A pandas DataFrame with one column per column of the query result.
    """
    host = get_rds_endpoint(rds_instance_identifier)

    conn = pymysql.connect(
        host=host,
        port=3306,
        user=username,
        password=password,
        database=db_name,
        cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
    )

    try:
        with conn.cursor() as cur:
            # NOTE(review): table_name is interpolated directly into the
            # statement, so it must only ever come from trusted code.
            cur.execute(f"SELECT * FROM {table_name};")
            rows = cur.fetchall()
            column_names = [col[0] for col in cur.description]
            return pd.DataFrame(rows, columns=column_names)
    finally:
        conn.close()

# Usage example
rds_instance_identifier = 'survaasdefault-db'
db_name = 'SurvaasDefaultDb'
table_name = 'customer_sample_data'
ssm_param_name = 'SurvaasDefaultDbSecretArn'
region = boto3.session.Session().region_name

# Resolve the secret once and reuse it for both fields, instead of calling
# SSM and Secrets Manager separately for the username and the password.
db_secret = get_secret(get_secret_name_from_ssm(ssm_param_name), region)
username = db_secret["username"]
password = db_secret["password"]


# List the tables available in the database
tables = read_tables_from_database(rds_instance_identifier, db_name, username, password)
display(tables)

# Load the survey table; later cells use `data` as the PCA input
data = read_data_from_rds_table(rds_instance_identifier, db_name, table_name, username, password)
# pd.set_option('display.max_columns', None)
display(data)

In [None]:
!pip install seaborn
!pip install factor_analyzer
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from factor_analyzer import calculate_kmo

In [None]:
# Load the dataset
# `data` normally comes from the RDS table read in the first cell.
# To work from a local CSV file instead, uncomment the two lines below:
# file_path = "extended_survey_dataset.csv"  # Replace with your file path
# data = pd.read_csv(file_path) # uncomment to read .csv. file from local

# Columns excluded from the analysis as non-numerical; any of them that is
# absent from `data` is skipped silently (errors="ignore").
non_numeric_columns = ["RespondentID", "Gender"]
data_numeric = data.drop(columns=non_numeric_columns, errors="ignore")

# Standardize the remaining columns before running PCA
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

In [None]:
# Perform PCA
# Fit a 3-component PCA on the standardized data and keep the per-row scores.
# `pca` is reused later for the explained-variance report.
pca = PCA(n_components=3)
pca_scores = pca.fit_transform(data_scaled)

# Create a DataFrame for PCA scores
# One column per component; the 3D scatter plot reads PC1..PC3 from here.
pca_df = pd.DataFrame(pca_scores, columns=["PC1", "PC2", "PC3"])

In [None]:
# Correlation matrix between the standardized variables
# (rowvar=False: each column of data_scaled is one variable)
correlation_matrix = np.corrcoef(data_scaled, rowvar=False)

# KMO is computed from the unscaled numeric data, not from
# correlation_matrix; the second returned value is reported as the MSA.
kmo_all, kmo_msa = calculate_kmo(data_numeric)

print("KMO Measure of Sampling Adequacy (MSA):", kmo_msa)

In [None]:
# Draw the correlation matrix as an annotated heatmap.
# Both axes are labelled with the analysed column names, and the colour scale
# is pinned to the full correlation range [-1, 1].
axis_labels = data_numeric.columns
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Plot every respondent in the space of the three principal components
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# One blue, black-edged marker per row of pca_df
scatter = ax.scatter(
    pca_df["PC1"],
    pca_df["PC2"],
    pca_df["PC3"],
    c='blue',
    marker='o',
    edgecolor='k',
)

# Label the three axes in a single call
ax.set(xlabel="PC1", ylabel="PC2", zlabel="PC3")
plt.title("3D Scatter Plot of PCA Scores")
plt.show()

In [None]:
# Explained variance ratios
# Report the share of variance captured by each fitted component. Looping over
# the ratios keeps this cell working if the number of PCA components changes.
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratios:")
for component_number, ratio in enumerate(explained_variance, start=1):
    print(f"PC{component_number}: {ratio:.2%}")

## Customer Segments (Based on Principal Components):

By examining the contributions of variables to each principal component, we can interpret the key customer segments:
### 1. PC1: General Preference and Engagement

- **Variables:** Strongly influenced by LikesProductA, OverallSatisfaction, and WillRecommend.
- **Interpretation:** This component identifies customers with a strong positive perception of the product and a high likelihood of engaging in word-of-mouth marketing. Customers with high PC1 scores are brand promoters.

### 2. PC2: Shopping and Discount Preferences

- **Variables:** Correlated with PrefersOnlineShopping, ValuesDiscounts, and BrandLoyalty.
- **Interpretation:** This component highlights customers who prefer online shopping and are motivated by discounts. Customers with high PC2 scores are likely tech-savvy, deal-oriented shoppers.

### 3. PC3: Environmental and Technological Concerns

- **Variables:** Influenced by EnvironmentalConcern and TechSavviness.
- **Interpretation:** This component distinguishes eco-conscious customers from tech-savvy but less environmentally motivated individuals. High PC3 scores indicate environmentally aware shoppers.

## Extended Dataset Results and PCA Interpretation

The dataset has been extended to include 25 respondents, and a Principal Component Analysis (PCA) was performed. Here are the results:

**Variance Explained by Principal Components:**

- PC1: Explains 23.53% of the variance.
- PC2: Explains 21.59% of the variance.
- PC3: Explains 14.38% of the variance.

These three components collectively explain 59.5% of the total variance, which is sufficient to represent the dataset while simplifying it.

**Customer Segments (Based on Principal Components):**

By examining the contributions of variables to each principal component, we can interpret the key customer segments:
### 1. PC1: General Preference and Engagement

- **Variables:** Strongly influenced by LikesProductA, OverallSatisfaction, and WillRecommend.
- **Interpretation:** This component identifies customers with a strong positive perception of the product and a high likelihood of engaging in word-of-mouth marketing. Customers with high PC1 scores are brand promoters.

### 2. PC2: Shopping and Discount Preferences

- **Variables:** Correlated with PrefersOnlineShopping, ValuesDiscounts, and BrandLoyalty.
- **Interpretation:** This component highlights customers who prefer online shopping and are motivated by discounts. Customers with high PC2 scores are likely tech-savvy, deal-oriented shoppers.

### 3. PC3: Environmental and Technological Concerns

- **Variables:** Influenced by EnvironmentalConcern and TechSavviness.
- **Interpretation:** This component distinguishes eco-conscious customers from tech-savvy but less environmentally motivated individuals. High PC3 scores indicate environmentally aware shoppers.

Sample Clusters from PCA:

Using the principal components, we can infer three primary clusters:

### Brand Enthusiasts

- High PC1 scores.
- Loyal customers with positive perceptions, willing to recommend products.

### Deal Seekers

- High PC2 scores.
- Price-sensitive customers who prioritize discounts and online shopping convenience.

### Eco-Conscious Shoppers

- High PC3 scores.
- Customers driven by environmental concerns and sustainability in purchasing decisions.