# 1.1: Analyze historical sales data to identify patterns and trends

## Stratified Sampling

In [None]:
import random
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import beta
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
purchases = pd.read_csv('data/AmazonPurchases.csv')
survey = pd.read_csv('data/survey.csv')

In [None]:
purchases = purchases.drop(purchases.columns[0], axis=1)

In [None]:
purchases.info()
purchases.isnull().sum().sort_values()

In [None]:
purchases['Order Date'] = pd.to_datetime(purchases['Order Date'], format='%Y-%m-%d')
purchases['year'] = purchases['Order Date'].dt.year
purchases['month'] = purchases['Order Date'].dt.month
purchases['day'] = purchases['Order Date'].dt.day
purchases = purchases[purchases['Order Date'] < '2022-11-01']

In [None]:
survey.head()

In [None]:
survey.info()
survey.isnull().sum().sort_values()

In [None]:
survey = survey['Survey ResponseID'].to_frame()
survey.shape[0]

In [None]:
random.seed(21)
stratified_sample = survey.apply(lambda x: x.sample(frac=0.20,random_state = 21))
stratified_sample.info()


In [None]:
random.seed(247)
from datetime import datetime, timedelta

# Define the start and end dates
start_date = datetime(2018, 1, 1)
end_date = datetime(2022, 11, 1)

def generate_random_date(start, end):
    """Return a random date between ``start`` and ``end``, both inclusive.

    The offset is a whole number of days drawn with ``random.randint``, so
    the result keeps the time-of-day component of ``start``.
    """
    span_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_days))
    return start + offset

stratified_sample['last_purchase_date'] = stratified_sample.apply(lambda x: generate_random_date(start_date, end_date), axis=1)

In [None]:
stratified_sample.head()

In [None]:
merged_df = pd.merge(purchases, stratified_sample, on='Survey ResponseID', how='left')
merged_df.info()
merged_df.isnull().sum()

In [None]:
merged_df['last_purchase_date'] = merged_df['last_purchase_date'].fillna(datetime(2022, 12, 31))
merged_df.isnull().sum()

In [None]:
final_df = merged_df[merged_df['Order Date'] <= merged_df['last_purchase_date']]
final_df.info()

In [None]:
final_df['Survey ResponseID'].nunique()

In [None]:
final_df = final_df.iloc[:, :8]

In [None]:
df_edit = final_df.copy()
df_edit['min_date'] = df_edit.groupby(['Survey ResponseID'])['Order Date'].transform("min")
df_edit['max_date'] = df_edit.groupby(['Survey ResponseID'])['Order Date'].transform("max")
plt.figure(figsize=(5,5))
plt.scatter(df_edit['min_date'],df_edit['max_date'], s = 1)

In [None]:
'''
import csv

from google.colab import drive
drive.mount('/content/drive')

csv.filename = 'final_df.csv'
final_df.to_csv('/content/drive/MyDrive/' + csv.filename, index=False)
'''

## Data Preparation

### Data files

### Simulation

In [None]:
purchases = pd.read_csv('data/dataprep_purchases.csv')
purchases.head()

In [None]:
customers = {
    'Survey ResponseID': purchases['Survey ResponseID'].unique()
}
customers = pd.DataFrame(customers)

### Adding Prime Column

In [None]:
# add prime column
purchases['Prime'] = 0

from collections import Counter

# Amazon Prime Day windows for 2018-2022 plus the 2022 Prime Early Access Sale,
# stored as (first day, last day); both ends are treated as inclusive below.
date_ranges = {
    'July 16-17, 2018': ('2018-07-16', '2018-07-17'),
    'July 15-16, 2019': ('2019-07-15', '2019-07-16'),
    'October 13-14, 2020': ('2020-10-13', '2020-10-14'),
    'June 21-22, 2021': ('2021-06-21', '2021-06-22'),
    'July 12-13, 2022': ('2022-07-12', '2022-07-13'),
    'October 11-12, 2022': ('2022-10-11', '2022-10-12')
}

# Parse the order dates once, for the window test only (the column itself is
# left as it is), so the test compares real dates instead of relying on the
# lexicographic ordering of date strings.
order_dates = pd.to_datetime(purchases['Order Date'])

# getting all response ids in the date ranges: one set of distinct ids per event.
# The loop variables are deliberately not called start_date / end_date so the
# sampling bounds defined earlier in the notebook are not overwritten.
id_sets = []
for window_start, window_end in date_ranges.values():
    in_window = order_dates.between(window_start, window_end)
    id_sets.append(set(purchases.loc[in_window, 'Survey ResponseID']))

# response ids that appear in at least 2 events
# (each id occurs at most once per set, so its count is the number of events)
all_ids = [response_id for id_set in id_sets for response_id in id_set]
id_counts = Counter(all_ids)
common_ids = {response_id for response_id, count in id_counts.items() if count >= 2}

# make response ids in common_ids as Prime members (Prime = 1)
is_common = purchases['Survey ResponseID'].isin(common_ids)
purchases.loc[is_common, 'Prime'] = 1

print(purchases[is_common][['Survey ResponseID', 'Prime']].drop_duplicates())

In [None]:
# total number of unique response ids
total_unique_ids = purchases['Survey ResponseID'].nunique()

# number of unique response ids where Prime = 1
prime_unique_ids = purchases[purchases['Prime'] == 1]['Survey ResponseID'].nunique()

# percentages of unique response ids that are Prime = 1
prime_percentage = (prime_unique_ids / total_unique_ids) * 100
print(f"Percentage of total unique IDs that are prime: {prime_percentage:.2f}%")

In [None]:
# total distinct response ids
total_distinct_responses = purchases['Survey ResponseID'].nunique()
print("Total number of distinct Survey Response IDs:", total_distinct_responses)

In [None]:
# 65% of total survey response ids
TOTALCOUNT = int(0.65 * total_distinct_responses)
print("65% of total Survey Response IDs (TOTALCOUNT):", TOTALCOUNT)

In [None]:
# 39.87% is 1962 people
# We will need:
print(3199-1962)

In [None]:
# number of additional response ids to set as Prime: the gap between the 65%
# target (TOTALCOUNT) and the ids already tagged as Prime, computed from the
# data instead of being typed in by hand.
current_prime_count = purchases[purchases['Prime'] == 1]['Survey ResponseID'].nunique()

# get response ids that are non-Prime
non_prime_ids = purchases[purchases['Prime'] == 0]['Survey ResponseID'].unique().tolist()

# random.sample rejects a negative count or one larger than the population,
# so clamp the gap to what can actually be drawn.
additional_needed = min(max(TOTALCOUNT - current_prime_count, 0), len(non_prime_ids))

# randomly set that many more response ids as prime
additional_prime_ids = set(random.sample(non_prime_ids, additional_needed))
purchases.loc[purchases['Survey ResponseID'].isin(additional_prime_ids), 'Prime'] = 1
print(f"Number of additional IDs set to Prime: {additional_needed}")

In [None]:
# number of unique response ids where Prime = 1
prime_unique_ids = purchases[purchases['Prime'] == 1]['Survey ResponseID'].nunique()

# percentages of unique response ids that are Prime = 1 now
prime_percentage = (prime_unique_ids / total_unique_ids) * 100
print(f"Percentage of total unique IDs that are prime: {prime_percentage:.2f}%")

In [None]:
purchases.head()

In [None]:
prime_users = purchases[purchases['Prime'] == 1]
prime_users = prime_users['Survey ResponseID'].drop_duplicates()

prime_users = pd.DataFrame(prime_users)
prime_users['Prime'] = 1
prime_users.head()

customers = pd.merge(customers, prime_users, on='Survey ResponseID', how='left')
customers['Prime'] = customers['Prime'].fillna(0).astype(int)
customers.head()

### Free trial for Prime

In [None]:
customers['Free Trial'] = customers['Prime'].apply(lambda x: 1 if x == 1 else (1 if np.random.uniform(0, 1) > 0.3 else 0))

print(customers['Free Trial'].describe())

### Discounts Column

In [None]:
purchases['Order Date'] = pd.to_datetime(purchases['Order Date'])

sales_dates = [
    ('2021-06-21', '2021-06-22'),  # Amazon Prime Day 2021
    ('2022-07-12', '2022-07-13'),  # Amazon Prime Day 2022
    ('2018-07-15', '2018-07-17'),  # Amazon Prime Day 2018
    ('2019-07-15', '2019-07-17'),  # Amazon Prime Day 2019
    ('2022-10-11', '2022-10-12'),  # Amazon Prime Early Access Sale 2022
    ('2020-10-13', '2020-10-14'),  # Amazon Prime Day 2020
    ('2018-11-24', '2018-12-02'),  # Black Friday & Cyber Monday
    ('2019-11-24', '2019-12-02'),
    ('2020-11-24', '2020-12-02'),
    ('2021-11-24', '2021-12-02'),
    ('2022-11-24', '2022-12-02')
]

sales_periods = [(pd.to_datetime(start), pd.to_datetime(end)) for start, end in sales_dates]

# Discount for event days
discounts = [0, 0.3, 0.35, 0.4, 0.45, 0.5]
probabilities = [0.2, 0.2, 0.2, 0.2, 0.15, 0.05]

def assign_discount(order_date):
    """Draw an event-day discount for ``order_date``, or NaN outside sale periods.

    A date counts as a sale date when it lies inside any ``(start, end)`` pair
    of ``sales_periods`` (both ends inclusive). For such a date one value is
    drawn from ``discounts`` with the weights in ``probabilities``.
    """
    on_sale = any(start <= order_date <= end for start, end in sales_periods)
    if not on_sale:
        return np.nan  # NaN the non-sales dates
    return np.random.choice(discounts, p=probabilities)

purchases['Discounts'] = purchases['Order Date'].apply(assign_discount)

purchases.head()

In [None]:
# Discount for normal days
normal_discounts = [0, 0.05, 0.1, 0.15, 0.2]
normal_probabilities = [0.6, 0.16, 0.12, 0.08, 0.04]

def assign_normal_discount(discount_value):
    """Fill a missing discount with a normal-day draw; keep existing values.

    A value that is already set is returned unchanged. A missing value (NaN)
    is replaced by one draw from ``normal_discounts`` weighted by
    ``normal_probabilities``.
    """
    if not pd.isna(discount_value):
        return discount_value
    return np.random.choice(normal_discounts, p=normal_probabilities)

purchases['Discounts'] = purchases['Discounts'].apply(assign_normal_discount)

purchases.head()

#### Survey IDs that bought during event periods


In [None]:
purchases['Order Date'] = pd.to_datetime(purchases['Order Date'])

# Sale windows as (first day, last day); both ends are inclusive.
sales_dates = [
    ('2021-06-21', '2021-06-22'),  # Amazon Prime Day 2021
    ('2022-07-12', '2022-07-13'),  # Amazon Prime Day 2022
    ('2018-07-15', '2018-07-17'),  # Amazon Prime Day 2018
    ('2019-07-15', '2019-07-17'),  # Amazon Prime Day 2019
    ('2022-10-11', '2022-10-12'),  # Amazon Prime Early Access Sale 2022
    ('2020-10-13', '2020-10-14'),  # Amazon Prime Day 2020
    ('2018-11-24', '2018-12-02'),  # Black Friday & Cyber Monday
    ('2019-11-24', '2019-12-02'),
    ('2020-11-24', '2020-12-02'),
    ('2021-11-24', '2021-12-02'),
    ('2022-11-24', '2022-12-02')
]

sales_dates = [(pd.to_datetime(start), pd.to_datetime(end)) for start, end in sales_dates]

# purchases in date ranges
# One vectorised `between` per window, OR-ed together, replaces a Python-level
# check of every window for every purchase row.
in_sale_window = pd.Series(False, index=purchases.index)
for window_start, window_end in sales_dates:
    in_sale_window |= purchases['Order Date'].between(window_start, window_end)

event_goers = purchases[in_sale_window]

# keep only survey responseid column
event_goers = event_goers[['Survey ResponseID']].drop_duplicates()

event_goers

### CTC

In [None]:
# Simulating click to conversion rates for all marketing channels
random.seed(3101)  # kept so the state of the stdlib generator is unchanged for later use
# scipy draws from NumPy's random state, which random.seed() does not touch.
# Give beta.rvs its own seeded generator so the simulated rates are reproducible.
ctc_rng = np.random.RandomState(3101)

# Target mean conversion rate for each channel column.
ctc_means = {
    'Email CTC': 0.047,
    'Display Ad CTC': 0.02,
    'Video Ad CTC': 0.01,
    'Search Engine Ad CTC': 0.025,
    'Social Media Ad CTC': 0.015,
}

size = len(customers.index)
beta_param = 10
for ctc_column, desired_mean in ctc_means.items():
    # A Beta(a, b) variable has mean a / (a + b); with b fixed, this choice
    # of a gives the desired mean.
    alpha_param = desired_mean * beta_param / (1 - desired_mean)
    data = beta.rvs(alpha_param, beta_param, size=size, random_state=ctc_rng)
    customers[ctc_column] = data

# Average of the five channel rates
customers['Average CTC'] = customers[list(ctc_means)].sum(axis=1) / len(ctc_means)
print(customers['Average CTC'].describe())
plt.figure(figsize=(10, 6))
sns.histplot(customers['Average CTC'], kde=True)

In [None]:
# Setting some of the CTC rates to 0
# Every rate below its channel's cut-off is replaced by 0; rates at or above
# the cut-off are kept as they are.
# NOTE(review): 'Average CTC' is thresholded on its existing value here; it is
# not recomputed from the five channel columns after they are zeroed - confirm
# that this is intended.

customers['Email CTC'] = customers['Email CTC'].where(customers['Email CTC'] >= 0.02, 0)                                      # cut-off 2%
customers['Display Ad CTC'] = customers['Display Ad CTC'].where(customers['Display Ad CTC'] >= 0.01, 0)                       # cut-off 1%
customers['Video Ad CTC'] = customers['Video Ad CTC'].where(customers['Video Ad CTC'] >= 0.005, 0)                            # cut-off 0.5% (0.005)
customers['Search Engine Ad CTC'] = customers['Search Engine Ad CTC'].where(customers['Search Engine Ad CTC'] >= 0.02, 0)     # cut-off 2%
customers['Social Media Ad CTC'] = customers['Social Media Ad CTC'].where(customers['Social Media Ad CTC'] >= 0.01, 0)        # cut-off 1%
customers['Average CTC'] = customers['Average CTC'].where(customers['Average CTC'] >= 0.02, 0)                                # cut-off 2%

### Paid for express shipping

In [None]:
## All Prime members will have Express Shipping == 1
np.random.seed(42)

p_temp = purchases[purchases['Prime'] == 0]
p_temp = p_temp[['Survey ResponseID', 'Order Date']].drop_duplicates()
p_temp_count = len(p_temp)
temp = np.random.binomial(1, 0.47, size = p_temp_count)
p_temp['Express Shipping'] = temp

purchases = pd.merge(purchases, p_temp, on=['Survey ResponseID', 'Order Date'], how='left')
purchases['Express Shipping'] = purchases['Express Shipping'].fillna(1).astype(int)
purchases.head()

In [None]:
express_cust = (
    purchases[purchases['Express Shipping'] == 1]
    .drop_duplicates(subset=['Survey ResponseID', 'Order Date'])
    .groupby('Survey ResponseID')
    .size()
    .reset_index(name='num_express')
)
express_cust.head()

In [None]:
all_cust = (
    purchases.drop_duplicates(subset=['Survey ResponseID', 'Order Date'])
    .groupby('Survey ResponseID')
    .size()
    .reset_index(name='num_express_all')
)
all_cust.head()

In [None]:
# Share of each customer's orders (distinct order dates) that used express shipping.
exp_ship = pd.merge(all_cust, express_cust, on='Survey ResponseID', how='left')
# express_cust only lists customers with at least one express order, so the
# left merge leaves the count missing for everyone else. A missing count
# means zero express orders, not an unknown share.
exp_ship['num_express'] = exp_ship['num_express'].fillna(0)
exp_ship['Express Shipping'] = exp_ship['num_express'] / exp_ship['num_express_all']
exp_ship.head()

customers = pd.merge(customers, exp_ship[['Survey ResponseID', 'Express Shipping']], on='Survey ResponseID', how='left')
customers.head()

In [None]:
sns.histplot(customers[customers['Prime'] == 0]['Express Shipping'], kde = True)
plt.show()

### Pre-tagging of customers

In [None]:
num_to_sample_25 = int(len(customers) * 0.25)
print(num_to_sample_25)
num_to_sample_45 = int(len(customers) * 0.45)
print(num_to_sample_45)

In [None]:
# Each group is stored as its own copy: the following cells add columns to
# these frames, and writing into a filtered selection of `customers` is what
# pandas flags with SettingWithCopyWarning.

# Getting the impulse customers (45%), drawn from customers who bought during an event
impulse = event_goers.sample(n = num_to_sample_45, random_state=11)
impulse = customers[customers['Survey ResponseID'].isin(impulse['Survey ResponseID'])].copy()
impulse_id = impulse['Survey ResponseID'].tolist()

# Getting the indecisive customers (25%), drawn from everyone not tagged as impulse
non_impulse = customers[~customers['Survey ResponseID'].isin(impulse_id)].sample(n=num_to_sample_25, random_state=11).copy()
non_impulse_id = non_impulse['Survey ResponseID'].tolist()

# Getting the remaining normal customers
remaining_customers = customers[~customers['Survey ResponseID'].isin(impulse_id + non_impulse_id)].copy()

#### Cart abandonment rate

In [None]:
# for indecisive buyers
np.random.seed(42)
desired_mean = 0.85
size = len(non_impulse.index)
beta_param = 10
alpha_param = desired_mean * beta_param / (1 - desired_mean)
data = beta.rvs(alpha_param, beta_param, size=size)
non_impulse['Cart Abandonment Rate'] = data

In [None]:
# for remaining buyers
np.random.seed(42)
desired_mean = 0.7

size = len(impulse.index)
beta_param = 10
alpha_param = desired_mean * beta_param / (1 - desired_mean)
data = beta.rvs(alpha_param, beta_param, size=size)
impulse['Cart Abandonment Rate'] = data

size = len(remaining_customers.index)
beta_param = 10
alpha_param = desired_mean * beta_param / (1 - desired_mean)
data = beta.rvs(alpha_param, beta_param, size=size)
remaining_customers['Cart Abandonment Rate'] = data

In [None]:
sns.histplot(non_impulse['Cart Abandonment Rate'], kde=True)

#### Whether customer checks product reviews before purchasing

In [None]:
# For impulse buyers
np.random.seed(42)
size = len(impulse.index)
p = 0.3      # probability of getting 1
data = np.random.choice([0, 1], size=size, p=[1-p, p])
impulse['Check Review'] = data

In [None]:
# For indecisive buyers
np.random.seed(42)
size = len(non_impulse.index)
p = 0.8      # probability of getting 1
data = np.random.choice([0, 1], size=size, p=[1-p, p])
non_impulse['Check Review'] = data

In [None]:
# For remaining buyers
np.random.seed(42)
size = len(remaining_customers.index)
p = 0.6      # probability of getting 1
data = np.random.choice([0, 1], size=size, p=[1-p, p])
remaining_customers['Check Review'] = data

In [None]:
sns.histplot(remaining_customers['Check Review'], kde=True)

#### Return/refund rate

In [None]:
# For impulse buyers
np.random.seed(42)
desired_mean = 0.2
size = len(impulse.index)
beta_param = 10
alpha_param = desired_mean * beta_param / (1 - desired_mean)
data = beta.rvs(alpha_param, beta_param, size=size)
impulse['Refund rate'] = data

impulse.head()

In [None]:
# For remaining buyers
np.random.seed(42)
desired_mean = 0.05

size = len(non_impulse.index)
beta_param = 10
alpha_param = desired_mean * beta_param / (1 - desired_mean)
data = beta.rvs(alpha_param, beta_param, size=size)
non_impulse['Refund rate'] = data

size = len(remaining_customers.index)
beta_param = 10
alpha_param = desired_mean * beta_param / (1 - desired_mean)
data = beta.rvs(alpha_param, beta_param, size=size)
remaining_customers['Refund rate'] = data

In [None]:
sns.histplot(remaining_customers['Refund rate'], kde=True)

#### Time in Cart

In [None]:
# For impulse buyers
np.random.seed(42)
size = len(impulse.index)
data = np.random.exponential(scale=15, size=size) #average time = 15
impulse['Time in Cart'] = data

In [None]:
# For indecisive buyers
np.random.seed(42)
size = len(non_impulse.index)
data = np.random.exponential(scale=35, size=size) #average time = 35
non_impulse['Time in Cart'] = data

In [None]:
# For remaining buyers
np.random.seed(42)
size = len(remaining_customers.index)
data = np.random.exponential(scale=25, size=size) #average time = 25
remaining_customers['Time in Cart'] = data

In [None]:
sns.histplot(impulse['Time in Cart'], kde=True)

#### Time on Page

In [None]:
# For impulse buyers
np.random.seed(42)
size = len(impulse.index)
data = np.random.exponential(scale=1, size=size) #average time = 1
impulse['Time on Page'] = data

In [None]:
# For indecisive buyers
np.random.seed(42)
size = len(non_impulse.index)
data = np.random.exponential(scale=10, size=size) #average time = 10
non_impulse['Time on Page'] = data

In [None]:
# For remaining buyers
np.random.seed(42)
size = len(remaining_customers.index)
data = np.random.exponential(scale=5, size=size) #average time = 5
remaining_customers['Time on Page'] = data

In [None]:
sns.histplot(impulse['Time on Page'], kde=True)

### Combining columns together

In [None]:
impulse.head()

In [None]:
non_impulse.head()

In [None]:
remaining_customers.head()

In [None]:
customers = pd.concat([impulse, non_impulse, remaining_customers], axis=0, ignore_index=True)
customers.head()

In [None]:
customers.info()

In [None]:
p = purchases.copy()

In [None]:
p.head()

### RFM Scores

In [None]:
#Lifetime RFM
purchases = pd.read_csv('data/purchases.csv')
purchases['Order Date'] = pd.to_datetime(purchases['Order Date'], format='%Y-%m-%d')
purchases = purchases[purchases['Order Date'] < pd.to_datetime('2022-11-01', format='%Y-%m-%d')]
purchases['Year'] = purchases['Order Date'].dt.year
purchases['Month'] = purchases['Order Date'].dt.month
purchases['Recency'] = pd.to_datetime('2022-11-01', format='%Y-%m-%d') - purchases.groupby(['Survey ResponseID'])['Order Date'].transform("max")
purchases['Frequency'] = purchases.groupby(['Survey ResponseID'])['Order Date'].transform('nunique')
purchases['Order value'] = purchases['Quantity'] * purchases['Purchase Price Per Unit']
purchases['Monetary'] = purchases.groupby(['Survey ResponseID'])['Order value'].transform('sum')
purchases['Monetary'] = purchases['Monetary']/purchases['Frequency']
purchases['Frequency'] = 46*30/purchases['Frequency']
purchases = purchases[['Recency','Frequency','Monetary','Survey ResponseID']]
purchases = purchases.drop_duplicates()
#print(purchases['Frequency'].describe())
#print(purchases['Monetary'].describe())
purchases['Recency'] = purchases['Recency'].dt.days
#Converting lifetime rfm to low,medium,high RFM

conditions_recency = [
    purchases['Recency'] <= 30,
    (purchases['Recency'] > 30) & (purchases['Recency'] <= 365),
    purchases['Recency'] > 365
]

conditions_frequency = [
    purchases['Frequency'] <= 7,
    (purchases['Frequency'] > 7) & (purchases['Frequency'] < 30),
    purchases['Frequency'] >= 30
]

conditions_monetary = [
    purchases['Monetary'] <= 35,
    (purchases['Monetary'] > 35) & (purchases['Monetary'] < 65),
    purchases['Monetary'] >= 65
]
# Category labels are strings so that they share a dtype with the 'Unknown'
# fallback: np.select needs one common dtype for the choices and the default,
# and recent NumPy versions refuse to combine integers with a string.
# The rest of this cell already compares these columns against "1"/"2"/"3".
# Recency and Frequency are scored in reverse (lowest band -> 3), Monetary
# in order (lowest band -> 1).
choices_RF = ['3', '2', '1']
choices_M = ['1', '2', '3']
purchases['Recency_category'] = np.select(conditions_recency, choices_RF, default='Unknown')
purchases['Frequency_category'] = np.select(conditions_frequency, choices_RF, default='Unknown')
purchases['Monetary_category'] = np.select(conditions_monetary, choices_M, default='Unknown')
# Three-character code, e.g. "331": recency, frequency and monetary category in that order.
purchases['RFM'] = purchases['Recency_category'].astype(str) + purchases['Frequency_category'].astype(str) + purchases['Monetary_category'].astype(str)
print(purchases['RFM'].value_counts().sort_index(ascending= False))
lifetime_rfm = purchases[['Survey ResponseID','RFM']].drop_duplicates()
#FM matrix for each recency

#recency_3_data = purchases[purchases['Recency'] == 3]
fm_counts = [
    [sum(purchases['RFM'] == "331"), sum(purchases['RFM'] == "332"), sum(purchases['RFM'] == "333")],
    [sum(purchases['RFM'] == "321"), sum(purchases['RFM'] == "322"), sum(purchases['RFM'] == "323")],
    [sum(purchases['RFM'] == "311"), sum(purchases['RFM'] == "312"), sum(purchases['RFM'] == "313")]
]
fm_matrix = np.array(fm_counts)
total_count = fm_matrix.sum()
percentages = (fm_matrix / total_count * 100).round(2)
annot_labels = np.array([f"{count}\n{percent:.2f}%"
                         for count_row, percent_row in zip(fm_matrix, percentages)
                         for count, percent in zip(count_row, percent_row)]).reshape(fm_matrix.shape)

plt.figure(figsize=(6, 6))
sns.heatmap(fm_matrix, annot=annot_labels, fmt="", cmap="viridis",
            xticklabels=['Low M', 'Mid M', 'High M'],
            yticklabels=['High F', 'Mid F', 'Low F'])
plt.title("FM Matrix for High Recency Customers")
plt.xlabel("Monetary (M)")
plt.ylabel("Frequency (F)")
plt.show()

#RM matrix
fm_counts = [
    [sum((purchases['Recency_category'] == "3") & (purchases['Monetary_category'] == "1")),
     sum((purchases['Recency_category'] == "3") & (purchases['Monetary_category'] == "2")),
     sum((purchases['Recency_category'] == "3") & (purchases['Monetary_category'] == "3"))],
    [sum((purchases['Recency_category'] == "2") & (purchases['Monetary_category'] == "1")),
     sum((purchases['Recency_category'] == "2") & (purchases['Monetary_category'] == "2")),
     sum((purchases['Recency_category'] == "2") & (purchases['Monetary_category'] == "3"))],
    [sum((purchases['Recency_category'] == "1") & (purchases['Monetary_category'] == "1")),
     sum((purchases['Recency_category'] == "1") & (purchases['Monetary_category'] == "2")),
     sum((purchases['Recency_category'] == "1") & (purchases['Monetary_category'] == "3"))]
]
fm_matrix = np.array(fm_counts)
total_count = fm_matrix.sum()
percentages = (fm_matrix / total_count * 100).round(2)
annot_labels = np.array([f"{count}\n{percent:.2f}%"
                         for count_row, percent_row in zip(fm_matrix, percentages)
                         for count, percent in zip(count_row, percent_row)]).reshape(fm_matrix.shape)

plt.figure(figsize=(6, 6))
sns.heatmap(fm_matrix, annot=annot_labels, fmt="", cmap="viridis",
            xticklabels=['Low M', 'Mid M', 'High M'],
            yticklabels=['High R', 'Mid R', 'Low R'])
plt.title("RM Matrix")
plt.xlabel("Monetary (M)")
plt.ylabel("Recency (R)")
plt.show()
fig = px.scatter_3d(purchases, x='Recency', y='Frequency', z='Monetary',
                    color='Monetary', size = 'Frequency',
                    labels={'Recency':'Recency', 'Frequency':'Frequency', 'Monetary':'Monetary'},
                    title="RFM 3D Scatter Plot")
fig.show()
#customers = customers.drop(columns=['RFM'])
customers = pd.merge(customers, lifetime_rfm, on='Survey ResponseID', how='inner')
print(customers)

##

## EDA

### Loading libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pandas as pd
import plotly.express as px

In [None]:
customers = pd.read_csv('data/customers.csv')
purchases = pd.read_csv('data/purchases.csv')

In [None]:
p = purchases.copy()
p.head()

In [None]:
p.info()

In [None]:
customers.head()

In [None]:
customers.info()

### EDA

#### Sales graph

In [None]:
#monthly sales graph
purchases['Order value'] = purchases['Quantity'] * purchases['Purchase Price Per Unit']
purchases['Order Date'] = pd.to_datetime(purchases['Order Date'], format='%Y-%m-%d')
purchases['Year'] = purchases['Order Date'].dt.year
purchases['Month'] = purchases['Order Date'].dt.month
daily_sales = purchases.groupby(['Year','Month'])['Order value'].sum().reset_index()
daily_sales['Order Date'] = pd.to_datetime(daily_sales[['Year', 'Month']].assign(DAY=1))

fig = px.line(daily_sales, x='Order Date', y='Order value', title='Daily Sales',
              labels = {'Order Date':'Date',
                        'Order value': 'Monthly Revenue ($)'}) # Corrected x to 'Order Date'
fig.update_layout(title_text="Monthly Revenue from Jan 2018 to Oct 2022")
fig

In [None]:
#Yearly revenue growth comparison
yearly_revenue = purchases.groupby('Year')['Order value'].sum().reset_index()
yearly_revenue['Growth'] = yearly_revenue['Order value'].pct_change() * 100
print(yearly_revenue)

purchase_filtered = purchases[purchases['Month'] <= 10]
yearly_revenue_filtered = purchase_filtered.groupby('Year')['Order value'].sum().reset_index()
yearly_revenue_filtered['Growth'] = yearly_revenue_filtered['Order value'].pct_change() * 100
yearly_revenue_filtered = yearly_revenue_filtered[yearly_revenue_filtered['Year'] >= 2019]
#Bar graph of yearly revenue filtered
fig = px.bar(yearly_revenue_filtered, x='Year', y='Growth', title='Yearly Revenue',
             labels = {'Order Date':'Year',
                       'Growth': 'Change in revenue wrt previous year (%)'},
             text = 'Growth') # Corrected x to 'Order Date'
fig.update_traces(marker_color=np.where(yearly_revenue_filtered['Growth'] < 0, 'red', 'green'))
fig.update_layout(title_text="Revenue growth of 1st 10 months of each year")
fig.update_xaxes(
    dtick="Y1",  # Set tick interval to 1 year
    tickformat="%Y"  # Format tick labels to show only the year
)
fig.update_traces(texttemplate='%{text:.2s}%', textposition='outside')
fig

#### Whether customers are Prime members

In [None]:
prime_users = p[p['Prime'] == 1]
prime_users = prime_users['Survey ResponseID'].drop_duplicates()

non_prime_users = p[p['Prime'] == 0]
non_prime_users = non_prime_users['Survey ResponseID'].drop_duplicates()

num_prime_users = len(prime_users)
num_non_prime_users = len(non_prime_users)
user_types = ['Prime', 'Non-Prime']
user_counts = [num_prime_users, num_non_prime_users]

plt.bar(user_types, user_counts)
plt.xlabel('User Type')
plt.ylabel('Number of Users')
plt.title('Number of Prime vs. Non-Prime Users')
plt.show()

In [None]:
len(prime_users) / len(customers)

65% of customers are Amazon Prime users, whereas 35% are not.

#### Cart Abandonment rate

In [None]:
sns.histplot(customers['Cart Abandonment Rate'], kde=True)
plt.xlabel('Cart Abandonment Rate')
plt.ylabel('Frequency')
plt.title('Distribution of Cart Abandonment Rate')
plt.show()

In [None]:
customers['Cart Abandonment Rate'].median()

The overall cart abandonment rate is quite high: almost every customer abandons their cart more than half of the time, and the median cart abandonment rate is ~74%.

#### Whether customers check Product Reviews

In [None]:
check_users = customers[customers['Check Review'] == 1]
check_users = check_users['Survey ResponseID'].drop_duplicates()

no_check_users = customers[customers['Check Review'] == 0]
no_check_users = no_check_users['Survey ResponseID'].drop_duplicates()

num_check_users = len(check_users)
num_no_check_users = len(no_check_users)
user_types_reviews = ['Check Reviews', 'Doesn\'t Check Reviews']
user_counts_reviews = [num_check_users, num_no_check_users]

plt.bar(user_types_reviews, user_counts_reviews)
plt.xlabel('User Type')
plt.ylabel('Number of Users')
plt.title('Users who Check/Doesn\'t Check Reviews')
plt.show()

In [None]:
num_check_users/len(customers)

51.0% of customers check reviews

#### Trial

In [None]:
free_trial = customers[customers['Free Trial'] == 1]
free_trial = free_trial['Survey ResponseID'].drop_duplicates()

no_free_trial = customers[customers['Free Trial'] == 0]
no_free_trial = no_free_trial['Survey ResponseID'].drop_duplicates()

num_free_trial = len(free_trial)
num_no_free_trial = len(no_free_trial)
user_types_trial = ['Free Trial', 'No Free Trial']
user_counts_trial = [num_free_trial, num_no_free_trial]

plt.bar(user_types_trial, user_counts_trial)
plt.xlabel('User Type')
plt.ylabel('Number of Users')
plt.title('Free Trial VS Not Free Trial Users')
plt.show()

In [None]:
num_free_trial/len(customers)

89.5% of customers are on free trial

#### Percentage of Discounted Items Bought

In [None]:
p['discount_applied'] = p['Discounts'].apply(lambda x: 1 if x > 0 else 0)

discount_counts = p.groupby('Survey ResponseID')['discount_applied'].value_counts().unstack(fill_value=0)
discount_counts.columns = ['no_discount', 'with_discount']
discount_counts['total_items'] = discount_counts['with_discount'] + discount_counts['no_discount']
discount_counts['pct_discount'] = discount_counts['with_discount'] / discount_counts['total_items']
discount_counts.head(20)

In [None]:
keep = discount_counts[discount_counts['total_items'] > 5]

discount_counts['keep discounts'] = discount_counts.index.isin(keep.index).astype(int)
discount_counts.info()

In [None]:
# discount_counts is indexed by 'Survey ResponseID' (it is built by grouping on
# that column), so each customer's values can be looked up directly by id.
# This avoids building the same merge twice and does not depend on the merged
# frame's fresh 0..n-1 index lining up with the index of `customers`.
# Customers without an entry in discount_counts get NaN.
customers['Pct Discount'] = customers['Survey ResponseID'].map(discount_counts['pct_discount'])
customers['Keep Discount'] = customers['Survey ResponseID'].map(discount_counts['keep discounts'])
p = p.drop(columns=['discount_applied'])
customers.head()

In [None]:
sns.histplot(customers['Pct Discount'], kde=True)
plt.xlabel('Pct of Discounted Items Bought')
plt.ylabel('Frequency')
plt.title('Distribution of Pct of Discounted Items Bought')
plt.show()

#### Refund / Return Rate

In [None]:
sns.histplot(customers['Refund rate'], kde=True)
plt.xlabel('Refund rate')
plt.ylabel('Frequency')
plt.title('Distribution of Refund rate')
plt.show()

#### Category

In [None]:
#Most popular category for each customer
# NOTE: the "items bought" columns count distinct order dates, not item quantities.
purchases['Total items bought'] = purchases.groupby(['Survey ResponseID'])['Order Date'].transform('nunique')
purchases['Total items bought per category'] = purchases.groupby(['Survey ResponseID','Category'])['Order Date'].transform('nunique')
purchases['Most items category'] = purchases.groupby(['Survey ResponseID'])['Total items bought per category'].transform('max')
# Keep the filtered rows in their own frame. Reassigning `purchases` here would
# drop every purchase outside a customer's top category for all the analysis
# that runs after this cell.
top_category_rows = purchases[purchases['Total items bought per category'] == purchases['Most items category']]
purchases_filtered = top_category_rows[['Total items bought','Survey ResponseID','Category','Most items category']].drop_duplicates()
# Share of the customer's order dates that include the top category.
purchases_filtered['pct of category'] = purchases_filtered['Most items category']/purchases_filtered['Total items bought']

# Only customers with at least 5 distinct order dates are considered.
purchases_filtered = purchases_filtered[purchases_filtered['Total items bought'] >= 5]

sns.histplot(purchases_filtered['pct of category'], kde=True)

In [None]:
purchases_filtered = purchases_filtered[purchases_filtered['pct of category'] > 0.5]
purchases_filtered = purchases_filtered.drop(columns=['Total items bought','pct of category', 'Most items category'])
customers = pd.merge(customers, purchases_filtered, on='Survey ResponseID', how='left')
customers.head()
customers.info()

In [None]:
# We have 4934 IDs instead of 4933: One dude has a duplicated entry bc he has 2 categories tied for most purchased category
duplicates = customers[customers['Survey ResponseID'].duplicated(keep=False)]
print(duplicates)

In [None]:
# Keep the first row for each customer. A customer whose top category is tied
# gets one row per tied category from the category merge; only one is kept.
customers = customers.drop_duplicates(subset='Survey ResponseID', keep='first')
customers.info()

In [None]:
counts = customers['Category'].value_counts().dropna()

sns.barplot(x = counts.index, y=counts.values)
plt.title('Count of Customers who Mostly Purchase 1 Category')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()

#### Paid for express shipping:

In [None]:
# Distribution of the 'Express Shipping' share among non-Prime customers.
non_prime_customers = customers[customers['Prime'] == 0]
sns.histplot(non_prime_customers['Express Shipping'], kde=True)
plt.show()

#### Click-to-conversion (CTC) Rates

In [None]:
# Side-by-side boxplots of the five per-channel CTC columns.
ctc_channels = ['Email CTC', 'Display Ad CTC', 'Video Ad CTC', 'Search Engine Ad CTC', 'Social Media Ad CTC']
ax = sns.boxplot(data=customers[ctc_channels])

ax.set_title('Boxplots of 5 CTC')
ax.set_xlabel('Channel')
ax.set_ylabel('Values')
plt.xticks(rotation=45)

In [None]:
# KDE of the CTC distribution for each marketing channel, one figure per column.
ctc_colours = {
    'Email CTC': "red",
    'Display Ad CTC': "blue",
    'Video Ad CTC': "green",
    'Search Engine Ad CTC': "purple",
    'Social Media Ad CTC': "orange",
    'Average CTC': "brown",
}
for ctc_column, colour in ctc_colours.items():
    sns.displot(customers[ctc_column], kind='kde', color=colour)
plt.show()

In [None]:
# Histogram (with KDE overlay) of every CTC column, shown one plot at a time.
ctc_hist_colours = [
    ('Email CTC', "red"),
    ('Display Ad CTC', "blue"),
    ('Video Ad CTC', "green"),
    ('Search Engine Ad CTC', "purple"),
    ('Social Media Ad CTC', "orange"),
    ('Average CTC', "brown"),
]
for ctc_column, colour in ctc_hist_colours:
    sns.histplot(customers[ctc_column], kde=True, color=colour)
    plt.show()

#### Time in Cart

In [None]:
# Distribution of 'Time in Cart'.
# stat='density' normalises the bars so the y-axis really is a density, as the
# title and y-label claim; histplot's default would plot raw counts.
sns.histplot(customers['Time in Cart'], kde=True, stat='density')
plt.title('Density Distribution of Time in Cart Values')
plt.xlabel('Values')
plt.ylabel('Density')
plt.show()

#### Time on Page

In [None]:
# Distribution of 'Time on Page'.
# stat='density' normalises the bars so the y-axis really is a density, as the
# title and y-label claim; histplot's default would plot raw counts.
sns.histplot(customers['Time on Page'], kde=True, stat='density')
plt.title('Density Distribution of Time on Page Values')
plt.xlabel('Values')
plt.ylabel('Density')
plt.show()

#### FM Matrix

In [None]:
# Lifetime RFM: per-customer recency, frequency and monetary value, followed
# by a 3x3 Frequency x Monetary heatmap.

# `np` is not among the imports in the notebook's opening cell; importing it
# here keeps this cell runnable on its own.
import numpy as np

cutoff_date = pd.to_datetime('2022-11-01', format='%Y-%m-%d')

purchases['Order Date'] = pd.to_datetime(purchases['Order Date'], format='%Y-%m-%d')
# .copy() so the column assignments below write to an independent frame
# instead of a filtered slice.
purchases = purchases[purchases['Order Date'] < cutoff_date].copy()
purchases['Year'] = purchases['Order Date'].dt.year
purchases['Month'] = purchases['Order Date'].dt.month

# Recency: time between the cutoff date and the customer's latest order date.
purchases['Recency'] = cutoff_date - purchases.groupby(['Survey ResponseID'])['Order Date'].transform("max")
# Frequency: number of distinct order dates per customer.
# TODO (original author's note): change to number of purchases/month.
purchases['Frequency'] = purchases.groupby(['Survey ResponseID'])['Order Date'].transform('nunique')
# Monetary: sum of quantity * unit price over all of the customer's rows.
purchases['Order value'] = purchases['Quantity'] * purchases['Purchase Price Per Unit']
purchases['Monetary'] = purchases.groupby(['Survey ResponseID'])['Order value'].transform('sum')

# Collapse to one row per customer (the three metrics are constant per ID).
purchases = purchases[['Recency','Frequency','Monetary','Survey ResponseID']]
purchases = purchases.drop_duplicates()
purchases['Recency'] = purchases['Recency'].dt.days

# Tier boundaries: median and upper quartile of each metric.
freq_medium = purchases['Frequency'].quantile(0.5)
freq_high = purchases['Frequency'].quantile(0.75)
monetary_medium = purchases['Monetary'].quantile(0.5)
monetary_high = purchases['Monetary'].quantile(0.75)
#print(purchases['Frequency'].describe())
#print(purchases['Monetary'].describe())

# np.select takes the first matching condition: 1 = up to the median,
# 2 = between median and upper quartile, 3 = upper quartile and above.
conditions_frequency = [
    purchases['Frequency'] <= freq_medium,
    (purchases['Frequency'] > freq_medium) & (purchases['Frequency'] < freq_high),
    purchases['Frequency'] >= freq_high
]

conditions_monetary = [
    purchases['Monetary'] <= monetary_medium,
    (purchases['Monetary'] > monetary_medium) & (purchases['Monetary'] < monetary_high),
    purchases['Monetary'] >= monetary_high
]
choices_FM = [1,2,3]
# Pass the tier labels as strings so they share a dtype with the 'Unknown'
# default; integer choices combined with a string default have no common
# dtype and are rejected by recent NumPy releases.
fm_labels = [str(choice) for choice in choices_FM]
purchases['Frequency_category'] = np.select(conditions_frequency, fm_labels, default='Unknown')
purchases['Monetary_category'] = np.select(conditions_monetary, fm_labels, default='Unknown')
purchases['FM'] = purchases['Frequency_category'].astype(str) + purchases['Monetary_category'].astype(str)
print(purchases['FM'].value_counts().sort_index(ascending= False))
lifetime_fm = purchases[['Survey ResponseID','FM']].drop_duplicates()

# Count each FM code once, then lay the counts out with Frequency descending
# down the rows (3, 2, 1) and Monetary ascending across the columns (1, 2, 3).
fm_tally = purchases['FM'].value_counts()
fm_counts = [
    [int(fm_tally.get(f_label + m_label, 0)) for m_label in fm_labels]
    for f_label in reversed(fm_labels)
]
fm_matrix = np.array(fm_counts)
total_count = fm_matrix.sum()
percentages = (fm_matrix / total_count * 100).round(2)
# Each heatmap cell shows the raw count and its share of all customers.
annot_labels = np.array([f"{count}\n{percent:.2f}%"
                         for count_row, percent_row in zip(fm_matrix, percentages)
                         for count, percent in zip(count_row, percent_row)]).reshape(fm_matrix.shape)

plt.figure(figsize=(6, 6))
sns.heatmap(fm_matrix, annot=annot_labels, fmt="", cmap="viridis",
            xticklabels=['Low M', 'Mid M', 'High M'],
            yticklabels=['High F', 'Mid F', 'Low F'])
plt.title("FM Matrix for Customers")
plt.xlabel("Monetary (M)")
plt.ylabel("Frequency (F)")
plt.show()

# 1.2: Develop customer segmentation models based on purchasing behavior

## Importing libraries and data

In [None]:
import csv
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import plotly.express as px
import time
import random
from scipy.stats import beta

In [None]:
# Load the two CSV inputs used throughout the segmentation section.
customers = pd.read_csv('data/segmentation_customers.csv')
purchases = pd.read_csv('data/purchases.csv')

In [None]:
# Preview the first rows of the customers table.
customers.head()

In [None]:
# Preview the first rows of the purchases table.
purchases.head()

## Customer Segmentation

### Saving time/urgency (for non-Prime members)

In [None]:
# Urgency segment: non-Prime customers with 'Express Shipping' >= 0.5 and
# 'Time in Cart' <= 20.
uses_express = customers['Express Shipping'] >= 0.5
short_cart_time = customers['Time in Cart'] <= 20
not_prime = customers['Prime'] == 0
urgency = customers[uses_express & short_cart_time & not_prime]
len(urgency)

In [None]:
# Share of non-Prime customers that fall into the urgency segment.
non_prime_total = len(customers[customers['Prime'] == 0])
len(urgency) / non_prime_total


About 23% of non-Prime users are more urgent (they pay for expedited shipping more often).

In [None]:
# Add a 0/1 'Cat_Urgency' flag to `customers`: 1 for IDs in the urgency
# segment, 0 for everyone else (unmatched rows of the left merge).
urgency = urgency[['Survey ResponseID']].assign(Cat_Urgency=1).astype({'Cat_Urgency': int})
customers = customers.merge(urgency, on='Survey ResponseID', how='left')
customers['Cat_Urgency'] = customers['Cat_Urgency'].fillna(0).astype(int)
customers.head()

### Impulse buyer

In [None]:
# Impulse segment: 'Time in Cart' <= 20, 'Time on Page' <= 2.5, no review
# check, and a 'Refund rate' above 0.05.
impulse_mask = (
    (customers['Time in Cart'] <= 20)
    & (customers['Time on Page'] <= 2.5)
    & (customers['Check Review'] == 0)
    & (customers['Refund rate'] > 0.05)
)
impulse = customers[impulse_mask]
len(impulse)

In [None]:
# Share of all customers that fall into the impulse segment.
impulse.shape[0] / customers.shape[0]

About 36% of customers are impulse buyers.

In [None]:
# Add a 0/1 'Cat_Impulse' flag to `customers`: 1 for IDs in the impulse
# segment, 0 for everyone else (unmatched rows of the left merge).
impulse = impulse[['Survey ResponseID']].assign(Cat_Impulse=1).astype({'Cat_Impulse': int})
customers = customers.merge(impulse, on='Survey ResponseID', how='left')
customers['Cat_Impulse'] = customers['Cat_Impulse'].fillna(0).astype(int)
customers.head()

### Indecisive/cautious

In [None]:
# Indecisive segment: 'Cart Abandonment Rate' >= 0.75, checks reviews,
# 'Time on Page' >= 7 and 'Time in Cart' >= 30.
indecisive_mask = (
    (customers['Cart Abandonment Rate'] >= 0.75)
    & (customers['Check Review'] == 1)
    & (customers['Time on Page'] >= 7)
    & (customers['Time in Cart'] >= 30)
)
indecisive = customers[indecisive_mask]
len(indecisive)

In [None]:
# Share of all customers that fall into the indecisive segment.
indecisive.shape[0] / customers.shape[0]

About 13% of customers are indecisive buyers.

In [None]:
# Add a 0/1 'Cat_Indecisive' flag to `customers`: 1 for IDs in the indecisive
# segment, 0 for everyone else (unmatched rows of the left merge).
indecisive = indecisive[['Survey ResponseID']].assign(Cat_Indecisive=1).astype({'Cat_Indecisive': int})
customers = customers.merge(indecisive, on='Survey ResponseID', how='left')
customers['Cat_Indecisive'] = customers['Cat_Indecisive'].fillna(0).astype(int)
customers.head()

### Category

In [None]:
# Customers that have a non-null 'Category' value.
has_category = customers['Category'].notnull()
cat = customers[has_category]
len(cat)

In [None]:
# Share of all customers that have a dominant category. This must use `cat`
# (built in the previous cell); the earlier version divided len(indecisive)
# and therefore repeated the indecisive share.
len(cat) / len(customers)

about 13% of customers have a frequently purchased category that they stick to when ordering

In [None]:
# Add a 0/1 'Cat_Category' flag to `customers`: 1 for IDs with a dominant
# category, 0 for everyone else (unmatched rows of the left merge).
cat = cat[['Survey ResponseID']].assign(Cat_Category=1).astype({'Cat_Category': int})
customers = customers.merge(cat, on='Survey ResponseID', how='left')
customers['Cat_Category'] = customers['Cat_Category'].fillna(0).astype(int)
customers.head()

### Price sensitive

In [None]:
# Price-sensitive segment: 'Keep Discount' == 1 AND 'Pct Discount' >= 0.4.
# The second filter is chained on `discount`; restarting from `customers`
# would throw away the 'Keep Discount' condition.
discount = customers[customers['Keep Discount'] == 1]
discount = discount[discount['Pct Discount'] >= 0.4]
len(discount)

In [None]:
# Share of all customers that fall into the price-sensitive segment.
discount.shape[0] / customers.shape[0]

In [None]:
# Add a 0/1 'Cat_Discount' flag to `customers`: 1 for IDs in the
# price-sensitive segment, 0 for everyone else (unmatched rows of the left merge).
discount = discount[['Survey ResponseID']].assign(Cat_Discount=1).astype({'Cat_Discount': int})
customers = customers.merge(discount, on='Survey ResponseID', how='left')
customers['Cat_Discount'] = customers['Cat_Discount'].fillna(0).astype(int)
customers.head()

In [None]:
# 'Keep Discount' has been folded into the discount flag; remove the column.
customers = customers.drop('Keep Discount', axis=1)

### Marketing Engagement

In [None]:
# Customers with a CTC greater than 0 on at least one of the five channels.
cols_to_check = ['Email CTC', 'Display Ad CTC', 'Video Ad CTC', 'Search Engine Ad CTC', 'Social Media Ad CTC']
any_positive_ctc = (customers[cols_to_check] > 0).any(axis=1)
filtered_cust = customers[any_positive_ctc]
len(filtered_cust)

In [None]:
# Share of all customers engaged by at least one marketing channel.
filtered_cust.shape[0] / customers.shape[0]

About 87% of customers were engaged by at least one of the marketing channels.

In [None]:
# Add a 0/1 'Cat_Engagement' flag to `customers`: 1 for IDs with any positive
# CTC, 0 for everyone else (unmatched rows of the left merge).
filtered_cust = filtered_cust[['Survey ResponseID']].assign(Cat_Engagement=1).astype({'Cat_Engagement': int})
customers = customers.merge(filtered_cust, on='Survey ResponseID', how='left')
customers['Cat_Engagement'] = customers['Cat_Engagement'].fillna(0).astype(int)
customers.head()

### High Value Customers

In [None]:
# Split the 'RFM' code into one integer column per character: R, F and M.
hvc = customers.copy()
rfm_code = hvc['RFM'].astype(str)
# Fail fast on anything that is not exactly three digits, rather than
# silently mis-slicing it.
if not rfm_code.str.fullmatch(r'\d{3}').all():
    raise ValueError("every 'RFM' value must be a three-digit code")
# Vectorised character slicing; avoids building one pd.Series per row.
for position, component in enumerate(['R', 'F', 'M']):
    hvc[component] = rfm_code.str[position]
hvc[['R', 'F', 'M']] = hvc[['R', 'F', 'M']].astype(int)
hvc.head()

In [None]:
# Customers in the top frequency tier (F == 3) and their share of all customers.
top_frequency = hvc['F'] == 3
hvc_f = hvc.loc[top_frequency]
hvc_f.shape[0] / customers.shape[0]

About 25% of customers are high-value (frequent) customers.

In [None]:
# Add a 0/1 'Cat_High_Value_F' flag to `customers`: 1 for IDs in the top
# frequency tier, 0 for everyone else (unmatched rows of the left merge).
hvc_f = hvc_f[['Survey ResponseID']].assign(Cat_High_Value_F=1).astype({'Cat_High_Value_F': int})
customers = customers.merge(hvc_f, on='Survey ResponseID', how='left')
customers['Cat_High_Value_F'] = customers['Cat_High_Value_F'].fillna(0).astype(int)
customers.head()

In [None]:
# Customers in the top monetary tier (M == 3) and their share of all customers.
top_monetary = hvc['M'] == 3
hvc_m = hvc.loc[top_monetary]
hvc_m.shape[0] / customers.shape[0]

About 25% of customers are high-value (high-spending) customers.

In [None]:
# Add a 0/1 'Cat_High_Value_M' flag to `customers`: 1 for IDs in the top
# monetary tier, 0 for everyone else (unmatched rows of the left merge).
hvc_m = hvc_m[['Survey ResponseID']].assign(Cat_High_Value_M=1).astype({'Cat_High_Value_M': int})
customers = customers.merge(hvc_m, on='Survey ResponseID', how='left')
customers['Cat_High_Value_M'] = customers['Cat_High_Value_M'].fillna(0).astype(int)
customers.head()

In [None]:
# Summarise the columns of `customers` after all segment flags were merged in.
customers.info()

### Analysis

In [None]:
# Preview the flagged customers table before the analysis.
customers.head()

In [None]:
# Column overview (names, non-null counts, dtypes) used by the analysis below.
customers.info()

In [None]:
# Number of customers carrying each segment flag.
# Select the flag columns by their 'Cat_' prefix instead of by position
# (previously iloc[:, 21:29]), so the result does not depend on column order.
# 'Category' has no underscore after 'Cat' and is therefore not picked up.
segment_flag_columns = [column for column in customers.columns if column.startswith('Cat_')]
label_count = customers[segment_flag_columns].sum(axis=0)
print(label_count)

In [None]:
# Express each segment count as a percentage of all customers.
customer_total = len(customers)
label_percentage = label_count.div(customer_total).mul(100)
print(label_percentage)

In [None]:
# Percentage of impulse buyers who were also engaged by marketing.
impulse_engaged = customers[(customers['Cat_Engagement'] == 1) & (customers['Cat_Impulse'] == 1)].shape[0]
# The denominator was hard-coded as 1779, presumably the impulse-buyer count
# from an earlier run (TODO confirm); compute it so it tracks the data.
impulse_total = (customers['Cat_Impulse'] == 1).sum()
impulse_engaged / impulse_total * 100

In [None]:
# Percentage of price-sensitive customers who are also indecisive.
discount_indecisive = customers[(customers['Cat_Indecisive'] == 1) & (customers['Cat_Discount'] == 1)].shape[0]
# The denominator was hard-coded as 3460, presumably the Cat_Discount count
# from an earlier run (TODO confirm); compute it so it tracks the data.
discount_total = (customers['Cat_Discount'] == 1).sum()
discount_indecisive / discount_total * 100

In [None]:
# Percentage of high-value customers (top F or top M tier) who are Prime members.
is_high_value = (customers['Cat_High_Value_F'] == 1) | (customers['Cat_High_Value_M'] == 1)
is_prime_member = customers['Prime'] == 1
total_high_prime = sum(is_high_value & is_prime_member)
total_high_value = sum(is_high_value)
(total_high_prime) / (total_high_value) * 100