## Exercise 1 - Synthetic Data Generation

In [14]:
import numpy as np
import time


In [15]:
# Seed NumPy's global random generator from the wall clock, truncated to
# whole seconds. The seed is echoed below so that a given run can be
# repeated later by passing the printed value to np.random.seed.
current_time = int(time.time())
np.random.seed(seed=current_time)

print('Seed set to:', current_time)

Seed set to: 1748841122


In [17]:
# Simulate the purchase amount (in dollars) of 5,000 customers.
# Amounts are drawn from a normal distribution with mean 100 and standard
# deviation 25; a purchase cannot be negative, so any negative draw is
# floored at 0 (no upper limit is applied).
purchase_amounts = np.clip(
    np.random.normal(loc=100, scale=25, size=5000),
    0,
    None,
)

purchase_amounts

array([121.62075368, 121.57269779,  93.86991821, ...,  90.82954863,
        81.13844472,  87.58312066], shape=(5000,))

## Exercise 2 - Basic Data Analysis

In [18]:
# Mean purchase amount across all customers, reported to two decimals.
average_spending = purchase_amounts.mean()
print(f'Average spending: {average_spending:.2f}')

Average spending: 100.32


In [19]:
# Spread of purchase amounts: population standard deviation
# (NumPy's default, i.e. ddof=0), reported to two decimals.
std_spending = purchase_amounts.std()
print(f"Spending standard deviation: {std_spending:.2f}")

Spending standard deviation: 24.93


In [20]:
# High-value customers: purchase amount strictly greater than $100.
is_high_value = purchase_amounts > 100
high_value_customers = purchase_amounts[is_high_value]

# Head count, and its share of the whole customer base as a percentage.
qty_high_value_customers = high_value_customers.size
perc_high_value_customers = qty_high_value_customers / purchase_amounts.size * 100

print(f'Number of high-value customers: {qty_high_value_customers}')
print(f'Percentage of high-value customers: {perc_high_value_customers:.2f} %')

Number of high-value customers: 2534
Percentage of high-value customers: 50.68 %


In [None]:
# Top 5% spenders: everyone whose purchase is strictly above the
# 95th percentile of all purchase amounts.
top_5_perc_threshold = np.percentile(purchase_amounts, q=95)
is_top_spender = purchase_amounts > top_5_perc_threshold
top_5_perc_spenders = purchase_amounts[is_top_spender]
qty_top_5_perc_spenders = top_5_perc_spenders.size

print(f'Number of top spenders: {qty_top_5_perc_spenders}')
print(f'Top 5% spending threshold: ${top_5_perc_threshold:.2f}')

Number of top spenders: 250
Top 5% spending threshold: $141.64


In [23]:
# Total revenue: the sum of every customer's purchase amount.
total_revenue = purchase_amounts.sum()
print(f'Total revenue: ${total_revenue:.2f}')

Total revenue: $501612.14


## Exercise 3 - Advanced Data Analysis

In [26]:
# Outliers are purchases lying more than two standard deviations away from
# the mean, on either side.
mean = purchase_amounts.mean()
std = purchase_amounts.std()

# Lower / upper cut-offs of the two-sigma band around the mean.
outlier_threshold_low = mean - 2 * std
outlier_threshold_high = mean + 2 * std

# Purchases strictly outside the band, split by side.
outliers_low = purchase_amounts[purchase_amounts < outlier_threshold_low]
outliers_high = purchase_amounts[purchase_amounts > outlier_threshold_high]
qty_outliers_low = outliers_low.size
qty_outliers_high = outliers_high.size

print(f'Outlier high spending threshold: ${outlier_threshold_high}')
print(f'Outlier low spending threshold: ${outlier_threshold_low}')

print(f'Number of high outliers: {qty_outliers_high}')
print(f'Number of low outliers: {qty_outliers_low}')

Outlier high spending threshold: $150.1818486405674
Outlier low spending threshold: $50.46300662996218
Number of high outliers: 103
Number of low outliers: 107


In [29]:
# Define bins
# Tier boundaries in dollars. Every tier is closed on its upper edge:
#   1  Very Low Spender  : amount <= 50   (includes purchases of exactly 0)
#   2  Low Spender       : 50  < amount <= 100
#   3  Average Spender   : 100 < amount <= 150
#   4  High Spender      : 150 < amount <= 175
#   5  Very High Spender : amount > 175
bins = [0, 50, 100, 150, 175]
labels = ['Very Low Spender', 'Low Spender', 'Average Spender', 'High Spender', 'Very High Spender']

# Digitize spending
# Only the upper edges (bins[1:]) are handed to np.digitize. With right=True,
# digitizing against the full list returns index 0 for any amount <= 0, and
# because the tier codes counted below start at 1, a purchase of exactly 0
# (negative draws are clipped to 0 when the data is generated) would be left
# out of every tier. Digitizing against the upper edges and adding 1 keeps
# the same 1..5 code for every other amount and places zeros in tier 1.
spending_tiers = np.digitize(purchase_amounts, bins[1:], right=True) + 1

# Count customers in each tier (tier codes are 1-based, hence i + 1)
for i, label in enumerate(labels):
    count = np.sum(spending_tiers == i + 1)
    print(f'{label}: {count} customers')

Very Low Spender: 103 customers
Low Spender: 2363 customers
Average Spender: 2428 customers
High Spender: 99 customers
Very High Spender: 7 customers


In [32]:
# Project revenue after offering the discount: every purchase amount is
# scaled by the same multiplier (1.15, i.e. a 15% uplift per customer).
expected_increase_rate = 1.15
new_purchase_amounts = expected_increase_rate * purchase_amounts

# Projected total, and the gain over the current total revenue.
new_total_revenue = new_purchase_amounts.sum()
revenue_increase = new_total_revenue - total_revenue

print(f'Projected total revenue after campaign: {new_total_revenue:.2f}')
print(f'Projected revenue increase: ${revenue_increase:.2f}')

Projected total revenue after campaign: 576853.96
Projected revenue increase: $75241.82
