
Market Segmentation Analysis

Step 4: Exploring Data

In [None]:
import shutil
import pkg_resources

# Locate the survey CSV bundled inside the MSA package.
vaccsv = pkg_resources.resource_filename("MSA", "csv/mcdonald.csv")
# Copy it under the exact name the later cells read ("mcdonalds.csv").
# Copying to "." kept the packaged name "mcdonald.csv", so the subsequent
# pd.read_csv("mcdonalds.csv") calls could not find the file.
shutil.copy(vaccsv, "mcdonalds.csv")


In [None]:
import pandas as pd

# Load the survey responses from the working directory into `vac`.
vac = pd.read_csv(filepath_or_buffer="mcdonalds.csv")


In [None]:

# Capture the shape and the column labels of the loaded frame.
dim = vac.shape
colnames = vac.columns.tolist()

# One label per line under the heading, then the (rows, columns) tuple.
print("Column names:", *colnames, sep="\n")
print("Dimensions:", dim, sep="\n")


In [None]:

# Descriptive statistics for a handful of columns.
# NOTE(review): later cells refer to 'Like', 'VisitFrequency' and 'Gender';
# confirm which spelling the CSV header actually uses.
subset_cols = ["like", "Age", "frequency_visited", "gender"]
subset = vac.loc[:, subset_cols]

# include="all" also summarises the non-numeric columns.
summary = subset.describe(include="all")
print(summary)


In [None]:

# Pull out the "like" column and list the distinct values it contains.
inc2 = vac["like"]

levels = inc2.unique()
print("Levels:")
print(levels)

# Explicit, ordered category list (first entry was misspelled "ummy").
lev = ["yummy", "convenient", "spicy", "fattening", "greasy", "fast", "cheap", "tasty", "expensive", "healthy", "disgusting"]
# NOTE(review): pd.Categorical turns every value that is not listed in `lev`
# into NaN -- confirm these labels really are the values stored in "like".
inc2 = pd.Categorical(inc2, categories=lev, ordered=True)

print("Updated inc2:")
print(inc2)


In [None]:

import matplotlib.pyplot as plt
# Ten-bin histogram of respondent age (raw counts).
plt.hist(vac["Age"], bins=10)
plt.gca().set(xlabel="Age", ylabel="Frequency", title="Histogram of Age")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reuse the `vac` frame loaded from "mcdonalds.csv" above.  Re-reading a
# different file ("vacation.csv") here replaced `vac`, while the following
# cells still expect the same frame (e.g. they access vac["like"]).

# Finer-grained histogram, normalised so that the bar areas sum to one.
plt.hist(vac["Age"], bins=50, density=True)
plt.xlabel("Age")
plt.ylabel("Density")
plt.title("Histogram of Age")
plt.show()


In [None]:

# Count, mean, spread and quartiles of the Age column.
age_summary = vac["Age"].describe()

print("Summary of Age:", age_summary, sep="\n")


In [None]:

import matplotlib.pyplot as plt

# Horizontal box-and-whisker plot of Age.
plt.boxplot(vac["Age"], vert=False)
plt.gca().set_xlabel("Age")
plt.show()


In [None]:

import matplotlib.pyplot as plt

# Share of "yes" answers (in percent) for each of the first eleven columns.
# The comparison ignores case and surrounding whitespace, so "Yes", "YES"
# and " yes" are all counted as agreement.
is_yes = vac.iloc[:, 0:11].apply(
    lambda column: column.astype(str).str.strip().str.lower() == "yes"
)
yes = 100 * is_yes.mean()
sorted_yes = yes.sort_values()

# Dot chart: one row per variable, ordered by agreement.
positions = range(len(sorted_yes))
plt.scatter(sorted_yes, positions)
# Label each row with its column name; bare row numbers are unreadable.
plt.yticks(positions, sorted_yes.index)
plt.xlabel("Percent 'yes'")
plt.xlim(0, 100)
plt.show()


In [None]:


# Frequency of every distinct "like" value, ordered by the value itself.
like_counts = vac["like"].value_counts().sort_index()

# The heading previously said "Income", but these are counts of "like".
print("Sorted Like Counts:")
print(like_counts)


In [None]:


# Binary (0/1) indicator matrix for the first eleven columns: 1 where the
# answer is "yes".  The match ignores case and surrounding whitespace so
# that "Yes" is not silently encoded as 0.
vacmot = vac.iloc[:, 0:11].apply(
    lambda column: column.astype(str).str.strip().str.lower() == "yes"
).astype(int)

print("vacmot DataFrame:")
print(vacmot)


NUMERICAL VARIABLES

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every indicator column to zero mean and unit variance.
scaler = StandardScaler()
vacmot_scaled = scaler.fit_transform(vacmot)

print("vacmot_scaled array:", vacmot_scaled, sep="\n")


In [None]:
from sklearn.decomposition import PCA

# Principal components of the (unscaled) indicator matrix; all components
# are kept because n_components is left at its default.
pca = PCA()
vacmot_pca = pca.fit_transform(vacmot)

print("vacmot_pca array:", vacmot_pca, sep="\n")


In [None]:
# numpy is needed for sqrt/cumsum and is not imported by any earlier cell.
import numpy as np

print("Summary of vacmot_pca:")
print("Standard deviations:")
# explained_variance_ holds variances; the standard deviation of each
# component is its square root.
print(np.sqrt(pca.explained_variance_))
print("Proportion of variance:")
print(pca.explained_variance_ratio_)
print("Cumulative proportion of variance:")
print(np.cumsum(pca.explained_variance_ratio_))


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Refit with only the first three components and plot the 2nd against the 3rd.
pca = PCA(n_components=3)
vacmot_pca = pca.fit_transform(vacmot)

pc2, pc3 = vacmot_pca[:, 1], vacmot_pca[:, 2]
plt.scatter(pc2, pc3, marker='o', color='grey', edgecolors='black')
plt.gca().set(xlabel="PC2", ylabel="PC3", title="Projection Axes")
plt.show()


Step 5: Extracting Segments
1)	Distance-Based Methods
i)	Distance Measures

In [None]:

from sklearn.datasets import fetch_openml

# Download the "annabill" dataset as pandas objects.
# NOTE(review): confirm that OpenML actually hosts a dataset with this name.
annabill = fetch_openml(name="annabill", as_frame=True)

print("annabill DataFrame:", annabill.data, sep="\n")
print("Target variable:", annabill.target, sep="\n")


In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Assuming annabill.data contains the input features
# Full square matrix of Euclidean distances between the rows of the data.
D1 = euclidean_distances(annabill.data)

# Two decimals are enough for display.
rounded_D1 = D1.round(2)

print("Distance matrix:", rounded_D1, sep="\n")


In [None]:
from sklearn.metrics.pairwise import manhattan_distances

# Assuming annabill.data contains the input features
# Full square matrix of Manhattan (absolute) distances between the rows.
D2 = manhattan_distances(annabill.data)

print("Distance matrix:", D2, sep="\n")


In [None]:
import numpy as np

# Independent ndarray copy of the Manhattan distance matrix.
D2_matrix = np.array(D2)

print("Distance matrix as a matrix:", D2_matrix, sep="\n")


In [None]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.datasets import fetch_openml
import numpy as np

# Fetch the data again and compute the same Euclidean dissimilarities via the
# generic pairwise_distances() entry point.
annabill = fetch_openml(name="annabill", as_frame=True)

dissimilarity_matrix = pairwise_distances(annabill.data, metric="euclidean")

# Rounded copy for display.
rounded_dissimilarity_matrix = np.round(dissimilarity_matrix, decimals=2)

print("Dissimilarity matrix:", rounded_dissimilarity_matrix, sep="\n")


ii)	Hierarchical Methods

Example: Tourist Risk Taking


In [None]:
from sklearn.datasets import fetch_openml

# Download the "risk" dataset as pandas objects.
# NOTE(review): confirm that OpenML actually hosts a dataset with this name.
risk = fetch_openml(name="risk", as_frame=True)

# (rows, columns) of the feature table.
data_shape = risk.data.shape

print("Dimensions of the 'risk' dataset:", data_shape, sep="\n")


In [None]:
import numpy as np

# Assuming risk.data contains the input features
# Mean of every feature column (axis=0 averages over the rows).
col_means = np.mean(risk.data, axis=0)

print("Column means:", col_means, sep="\n")


In [None]:
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Assuming risk.data contains the input features
# Condensed matrix of Manhattan (city-block) distances between the rows of
# risk.data, followed by complete-linkage agglomerative clustering.
risk_dist = pdist(risk.data, metric="cityblock")
risk_hcl = linkage(risk_dist, method="complete")

print("Hierarchical clustering object:", risk_hcl, sep="\n")

# Dendrogram of the merge history.
plt.figure(figsize=(10, 6))
dendrogram(risk_hcl)
plt.gca().set(
    title="Hierarchical Clustering Dendrogram",
    xlabel="Data Points",
    ylabel="Distance",
)
plt.show()


In [None]:
from scipy.cluster.hierarchy import cut_tree

# Assuming risk.hcl contains the hierarchical clustering object
# Cut the dendrogram at distance 20.  scipy's cut_tree() names this argument
# `height`; the R-style `h=` keyword raises a TypeError.
c2 = cut_tree(risk_hcl, height=20)

# cut_tree returns one column per requested cut; flatten it to a 1-D label
# vector and count the members of every cluster.
cluster_counts = np.bincount(c2.ravel())

print("Cluster counts:")
print(cluster_counts)


In [None]:
from scipy.cluster.hierarchy import cut_tree

# Assuming risk.hcl contains the hierarchical clustering object
# Cut the tree so that exactly six clusters remain (one column of labels).
c6 = cut_tree(risk_hcl, n_clusters=[6])

# Drop the singleton column axis, then tally the members per cluster.
flat_labels = np.squeeze(c6)
cluster_counts = np.squeeze(np.bincount(flat_labels))

print("Cluster counts:", cluster_counts, sep="\n")


In [None]:
import pandas as pd

# Assuming risk.data contains the input features
# Mean of every feature within each of the six clusters, to one decimal.
# c6[:, 0] is the single label column returned by cut_tree.
c6_means = pd.DataFrame(risk.data).groupby(c6[:, 0]).mean().round(1)
# The grouped frame already carries the column names of risk.data, so no
# relabelling is needed; the object returned by fetch_openml has no
# `colnames` attribute and the old assignment raised AttributeError.

print("Cluster means:")
print(c6_means)


iii)	Partitioning Method
Example: Artificial Mobile Phone Data


In [None]:
import numpy as np
from sklearn.datasets import make_blobs

# Seed numpy's global generator as well as the blob generator itself.
np.random.seed(1234)

# 500 synthetic points drawn around three centres; the true labels are discarded.
blob_settings = {"n_samples": 500, "centers": 3, "random_state": 1234}
PF3, _ = make_blobs(**blob_settings)


In [None]:
from sklearn.cluster import KMeans

# Perform k-means clustering with k = 3
# Three-segment k-means solution; PF3_km3 holds one label per point.
k = 3
kmeans = KMeans(random_state=1234, n_clusters=k)
PF3_km3 = kmeans.fit_predict(PF3)


In [None]:
# Get cluster centers
# Fitted centres, plus the labels of the first 20 samples only.
first_n = 20
cluster_centers, cluster_assignments = (
    kmeans.cluster_centers_,
    kmeans.labels_[:first_n],
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial import ConvexHull

# Perform k-means clustering with k = 3
# Refit the three-segment solution and keep one label per point.
k = 3
kmeans = KMeans(n_clusters=k, random_state=1234)
cluster_assignments = kmeans.fit_predict(PF3)

# Points coloured by their assigned cluster.
plt.scatter(PF3[:, 0], PF3[:, 1], c=cluster_assignments)

# Outline every cluster with the edges of its convex hull.
for cluster_id in range(k):
    in_cluster = cluster_assignments == cluster_id
    cluster_points = PF3[in_cluster]
    hull = ConvexHull(cluster_points)
    for simplex in hull.simplices:
        # Each simplex is the pair of row indices forming one hull edge.
        edge = cluster_points[simplex]
        plt.plot(edge[:, 0], edge[:, 1], 'k--')

plt.gca().set(
    xlabel='features',
    ylabel='performance / quality price',
    title='Cluster Hulls',
)
plt.show()


Example: Tourist Risk Taking



In [None]:
import numpy as np
from sklearn.cluster import KMeans

PF3 = np.asarray(PF3)  # make sure PF3 is a numpy array

# Numbers of segments to evaluate.
k_range = range(2, 9)

# Per-k results: within-cluster sum of squares, convergence flag, iterations.
dist_sum = []
converged = []
iterations = []

for k in k_range:
    # n_init=10 makes KMeans run ten differently initialised fits and keep
    # the one with the lowest inertia.  The previous hand-written loop
    # repeated the fit ten times with the *same* random_state, so every
    # repetition produced the identical solution.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=1234)
    cluster_assignments = kmeans.fit_predict(PF3)
    best_inertia = kmeans.inertia_
    best_cluster_assignments = cluster_assignments

    dist_sum.append(best_inertia)
    # Report what the fit actually did instead of hard-coded True / None.
    iterations.append(kmeans.n_iter_)
    # Heuristic: a run that stopped before max_iter met the tolerance.
    converged.append(kmeans.n_iter_ < kmeans.max_iter)

# One line per k: "<k> : <inertia> <converged> <iterations>".
for k, inertia, did_converge, n_iter in zip(k_range, dist_sum, converged, iterations):
    print(f"{k} : {inertia} {did_converge} {n_iter}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Retrieve the cluster assignments for k=2 from risk.km28
from sklearn.cluster import KMeans

# `risk` is the object returned by fetch_openml and has no `km28` attribute,
# so the two-segment solution is fitted here on risk.data.
# NOTE(review): the stepwise cell above clusters PF3, not the risk data --
# confirm which dataset this chart is meant to describe.
risk_km2 = KMeans(n_clusters=2, n_init=10, random_state=1234).fit(risk.data)
cluster_assignments = risk_km2.labels_

# Members per cluster; minlength keeps two bars even if one cluster is empty.
cluster_counts = np.bincount(cluster_assignments, minlength=2)

# Create a bar chart
plt.bar(range(2), cluster_counts)
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.title('Cluster Counts for k=2')
plt.xticks(range(2))
plt.show()


Self-Organising Maps



In [None]:
import numpy as np
from minisom import MiniSom
import matplotlib.pyplot as plt

np.random.seed(1234)

# Define the SOM grid size
grid_size = (5, 5)

# `risk` is the object returned by fetch_openml: the feature table lives in
# risk.data.  (`risk.columns` does not exist and `risk.values` is the dict
# method of the container, not an array.)
risk_matrix = np.asarray(risk.data, dtype=float)

# One input weight per feature column.  random_seed makes the map
# reproducible; MiniSom uses its own generator, so np.random.seed() alone
# does not affect it.
som = MiniSom(
    grid_size[0],
    grid_size[1],
    risk_matrix.shape[1],
    sigma=1.0,
    learning_rate=0.5,
    random_seed=1234,
)

# Initialise the weights from randomly picked samples, then train.
som.random_weights_init(risk_matrix)
som.train_random(risk_matrix, 100)

# Distance map (U-matrix) as background.
plt.figure(figsize=(7, 7))
plt.pcolor(som.distance_map().T, cmap='bone_r')
plt.colorbar()

# Mark the winning neuron of every observation (cell centres are offset by 0.5).
for x in risk_matrix:
    w = som.winner(x)
    plt.plot(w[0] + 0.5, w[1] + 0.5, 'o', markerfacecolor='None', markeredgecolor='red', markersize=10, markeredgewidth=2)

plt.title('SOM Visualization')
plt.show()


Two-Step Clustering


In [None]:
import numpy as np
from sklearn.cluster import KMeans

np.random.seed(1234)

# First step of two-step clustering: a deliberately fine 30-cluster k-means.
# fit() returns the estimator itself, so it can be chained.
kmeans = KMeans(n_clusters=30, random_state=1234).fit(PF3)

# One label (0..29) per data point.
labels = kmeans.labels_
print(labels)


In [None]:
import matplotlib.pyplot as plt

# Assuming PF3 is your data
# Assuming labels is the cluster labels obtained from K-means

# Create a scatter plot of the data points colored by cluster labels
# Scatter of the first two features, coloured by the labels from the cell above.
first_feature, second_feature = PF3[:, 0], PF3[:, 1]
plt.scatter(first_feature, second_feature, c=labels, cmap='viridis')

plt.gca().set(xlabel='Feature 1', ylabel='Feature 2', title='Clustering Results')

plt.show()


In [None]:
import numpy as np

# Assuming PF3 is your data
# Assuming PF3.k30 is the clustering result

# Get the cluster centers
# `kmeans` is the 30-cluster model fitted in the two-step clustering cell
# above.  PF3 is a plain ndarray, so `PF3.k30.cent` does not exist, and
# `sizes` was never defined.
centroids = np.array(kmeans.cluster_centers_)

# Number of points assigned to each cluster; minlength keeps one entry per
# cluster even if a cluster ended up empty.
sizes = np.bincount(kmeans.labels_, minlength=kmeans.n_clusters)

# Print the cluster centers
print("Cluster Centers:")
print(centroids)

# Print the cluster sizes
print("Cluster Sizes:")
print(sizes)


In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Assuming PF3.k30.cent is the cluster centers
# Assuming sizes is the cluster sizes

# Compute the distance matrix
from scipy.spatial.distance import pdist

# Second step of two-step clustering: hierarchical clustering of the centres
# of the 30-cluster `kmeans` model fitted above.  pdist() yields the
# condensed Euclidean distance matrix that linkage() expects.
dist_matrix = pdist(kmeans.cluster_centers_)

# scipy's linkage() has no `members` argument (that is an R hclust feature),
# so the centres are clustered unweighted.
hc = linkage(dist_matrix, method='complete')

# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram(hc)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Clusters")
plt.ylabel("Distance")
plt.show()


In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Assuming PF3.hc is the hierarchical clustering result
# Assuming clusters(PF3.k30) is the cluster assignments

# Create a new AgglomerativeClustering object with linkage='complete'
# Complete-linkage clustering of the 30 k-means centres into 3 groups.
# Euclidean distance is the default, so the `affinity` keyword (removed in
# recent scikit-learn releases in favour of `metric`) is not needed.
agglomerative = AgglomerativeClustering(n_clusters=3, linkage='complete')

# Fit on the centres of the `kmeans` model from the two-step clustering cell;
# PF3 is an ndarray and has no `hc` attribute.
agglomerative.fit(kmeans.cluster_centers_)

# One group label per k-means centre.
cluster_labels = agglomerative.labels_

# Propagate the group of each centre to every point assigned to that centre.
PF3_clust = cluster_labels[kmeans.labels_]

# Final segment sizes, counted over the data points.
cluster_sizes = np.bincount(PF3_clust, minlength=3)

# Display the table of cluster sizes (`table()` is R, not Python).
print(cluster_sizes)


Bagged Clustering

In [None]:
from msa import winterActiv

# Column labels of the winterActiv table as a plain Python list.
column_names = winterActiv.columns.to_list()
print(column_names)


In [None]:
import numpy as np
# sklearn.cluster provides no `BinaryKMeans`; importing it raises ImportError.
from sklearn.cluster import KMeans

np.random.seed(1234)
# NOTE(review): scikit-learn ships no bagged-clustering estimator.  Plain
# k-means with the same base settings (10 clusters, at most 50 iterations)
# is used so that the following cells get the `labels_` they rely on.
winter_bc = KMeans(n_clusters=10, max_iter=50, n_init=10, random_state=1234)
winter_bc.fit(winterActiv)



In [None]:
import matplotlib.pyplot as plt

# Distinct cluster ids together with how many observations carry each one.
cluster_labels, cluster_sizes = np.unique(winter_bc.labels_, return_counts=True)

plt.bar(cluster_labels, cluster_sizes)
plt.gca().set(xlabel='Cluster', ylabel='Cluster Size', title='Cluster Sizes')
plt.show()


In [None]:
import seaborn as sns

# Create a DataFrame with the data and cluster labels
import matplotlib.pyplot as plt

# Copy of the data with the cluster label of every row attached.  The frame
# is called `winterActiv` (as imported above); `winter_activ` was undefined.
data_with_labels = winterActiv.copy()
data_with_labels['Cluster'] = winter_bc.labels_

# Plot every cluster that actually occurs, in ascending order.  The former
# fixed range(5) silently dropped clusters 5-9 of the 10-cluster solution.
cluster_order = sorted(data_with_labels['Cluster'].unique().tolist())

# Distribution of the first data column within each cluster.
sns.boxplot(x='Cluster', y=data_with_labels.columns[0], data=data_with_labels, order=cluster_order)

# Set plot labels and title
plt.xlabel('Cluster')
plt.ylabel('Variable')
plt.title('Box Plot of Variables by Cluster')

# Show the plot
plt.show()


Normal Distributions

In [None]:
from sklearn.mixture import GaussianMixture

# Assuming PF3 is your data matrix with shape (n_samples, n_features)
G = range(2, 9)  # Range of number of clusters to consider
# Fixed random_state so that repeated runs give the same mixtures.
models = [GaussianMixture(n_components=k, random_state=1234) for k in G]
# PF3 is a numpy array and cannot take new attributes (`PF3.m28 = ...`
# raised AttributeError), so the fitted models live in their own list.
# fit() returns the estimator itself, so this is the same objects as `models`.
PF3_m28 = [model.fit(PF3) for model in models]

# Mixture weights, component means and covariances of every fitted model.
for k, model in zip(G, PF3_m28):
    print(f"Number of clusters: {k}")
    print(model.weights_)  # Cluster weights
    print(model.means_)  # Cluster means
    print(model.covariances_)  # Cluster covariance matrices
    print()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming PF3.m28 is a list of fitted GaussianMixture models
G = range(2, 9)  # Range of number of clusters

# `models` is the list of GaussianMixture estimators fitted in the cell
# above (fit() works in place).  PF3 is an ndarray with no `m28` attribute.
# BIC of every model on the data; lower values indicate a better trade-off
# between fit and number of parameters.
uncertainty = np.array([model.bic(PF3) for model in models])

# Plot the BIC against the number of mixture components.
plt.plot(G, uncertainty, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('BIC')
plt.title('Model Uncertainty (BIC)')
plt.show()


Step 6: Profiling Segments

In [None]:
import numpy as np
from scipy.cluster.hierarchy import linkage
# Import the distance module from its public location; it is only reachable
# through scipy.cluster.hierarchy as an implementation detail.
from scipy.spatial import distance

# Cluster the *variables*: transposing MD_x turns columns into rows before
# the pairwise distances and the complete-linkage tree are computed.
# Assuming MD_x is a numpy array of your data
MD_vclust = linkage(distance.pdist(np.transpose(MD_x)), method='complete')


In [None]:
import matplotlib.pyplot as plt

# Assuming MD.k4 is a numpy array representing your clustering result
# Assuming MD.vclust$order is a numpy array representing the order of bars
import numpy as np
from scipy.cluster.hierarchy import leaves_list

# linkage() returns a plain ndarray, so MD_vclust['leaves'] cannot work; the
# leaf (variable) order is obtained with leaves_list().  Reversed to match
# the R code.
ordered_clusters = leaves_list(MD_vclust)[::-1]

# Segment profile: mean of every variable within each segment, with the
# variables arranged in the hierarchical-clustering order.
# NOTE(review): assumes MD_k4 holds one segment label per row of MD_x.
profile_data = np.asarray(MD_x, dtype=float)
segment_labels = np.asarray(MD_k4)
segment_ids = np.unique(segment_labels)

# One grey horizontal bar panel per segment, sharing the variable axis.
fig, axes = plt.subplots(1, len(segment_ids), sharey=True, squeeze=False)
positions = range(len(ordered_clusters))
for ax, segment_id in zip(axes[0], segment_ids):
    segment_rows = profile_data[segment_labels == segment_id]
    segment_means = segment_rows[:, ordered_clusters].mean(axis=0)
    ax.barh(positions, segment_means, color='grey')
    ax.set_title(f"Segment {segment_id}")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Assuming MD.k4 is a numpy array representing your clustering result
# Assuming MD.pca is a numpy array representing the PCA result
# Assuming MD.x is a numpy array representing the data points

# Plotting the scatter plot
# Observations in the plane of the first two principal components, coloured
# by segment.  Uses the underscore names (MD_pca, MD_k4) like the
# neighbouring cells; `MD.pca` / `MD.k4` is R notation and `MD` is undefined.
# NOTE(review): assumes MD_pca is the (n_samples, n_components) score array.
plt.scatter(MD_pca[:, 0], MD_pca[:, 1], c=MD_k4)

# Reference arrows along the two component axes.
plt.quiver(0, 0, 1, 0, color='r', scale=5)
plt.quiver(0, 0, 0, 1, color='b', scale=5)

# Setting plot labels
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')

# Displaying the plot
plt.show()


Step 7: Describing Segments 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming k4 is a numpy array representing the cluster assignments
# Assuming mcdonalds is a pandas DataFrame representing the data

# Creating a contingency table
import numpy as np
import pandas as pd

# Contingency table: segments in rows, 'Like' categories in columns.
table = pd.crosstab(k4, mcdonalds['Like'])

# seaborn has no mosaicplot(); the mosaic is drawn with matplotlib instead.
# Bar width = share of respondents in the segment, stacked heights = share
# of each 'Like' category within the segment.  (Residual shading, R's
# shade=TRUE, is not reproduced.)
segment_totals = table.sum(axis=1).to_numpy(dtype=float)
widths = segment_totals / segment_totals.sum()
lefts = np.concatenate(([0.0], np.cumsum(widths)[:-1]))
shares = table.to_numpy(dtype=float) / segment_totals[:, None]

fig, ax = plt.subplots()
bottoms = np.zeros(len(table))
for column_position, category in enumerate(table.columns):
    heights = shares[:, column_position]
    ax.bar(lefts, heights, width=widths, bottom=bottoms, align='edge',
           edgecolor='white', label=str(category))
    bottoms = bottoms + heights

# Segment ids centred under their bars.
ax.set_xticks(lefts + widths / 2)
ax.set_xticklabels(table.index)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.legend(title='Like', bbox_to_anchor=(1.02, 1), loc='upper left')

# Setting plot labels
plt.xlabel('segment number')
plt.ylabel('Like')

# Displaying the plot
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming k4 is a numpy array representing the cluster assignments
# Assuming mcdonalds is a pandas DataFrame representing the data

# Creating a contingency table
import numpy as np
import pandas as pd

# Contingency table: segments in rows, 'Gender' categories in columns.
table = pd.crosstab(k4, mcdonalds['Gender'])

# seaborn has no mosaicplot(); the mosaic is drawn with matplotlib instead.
# Bar width = share of respondents in the segment, stacked heights = share
# of each gender within the segment.  (Residual shading, R's shade=TRUE, is
# not reproduced.)
row_totals = table.sum(axis=1).to_numpy(dtype=float)
bar_widths = row_totals / row_totals.sum()
bar_lefts = np.concatenate(([0.0], np.cumsum(bar_widths)[:-1]))
within_segment = table.to_numpy(dtype=float) / row_totals[:, None]

fig, ax = plt.subplots()
stack_base = np.zeros(len(table))
for column_position, category in enumerate(table.columns):
    heights = within_segment[:, column_position]
    ax.bar(bar_lefts, heights, width=bar_widths, bottom=stack_base,
           align='edge', edgecolor='white', label=str(category))
    stack_base = stack_base + heights

# Segment ids centred under their bars.
ax.set_xticks(bar_lefts + bar_widths / 2)
ax.set_xticklabels(table.index)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.legend(title='Gender', bbox_to_anchor=(1.02, 1), loc='upper left')

# Displaying the plot
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz

# Assuming k4 is a numpy array representing the cluster assignments
# Assuming mcdonalds is a pandas DataFrame representing the data

# Preparing the input features and target variable
import pandas as pd

# Descriptor variables.  'Gender' holds strings (it is compared with
# 'Female' further down), which DecisionTreeClassifier cannot consume, so
# non-numeric columns are one-hot encoded; numeric columns pass through.
X = pd.get_dummies(mcdonalds[['Like.n', 'Age', 'VisitFrequency', 'Gender']])
# Binary target: 1 for members of segment 3, 0 for everyone else.
y = (k4 == 3).astype(int)

# Creating the decision tree model
tree_model = DecisionTreeClassifier()
tree_model.fit(X, y)

# Export with the real column names so the split labels are readable.
dot_data = tree.export_graphviz(
    tree_model,
    out_file=None,
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Render once to "decision_tree" and open the result; calling render() and
# then view() rendered the graph twice under two different file names.
graph.render("decision_tree", view=True)


In [None]:
# Assuming mcdonalds is a pandas DataFrame representing the data

# Making predictions using the decision tree model
# Making predictions using the decision tree model
predictions = tree_model.predict(X)

# The tree was trained on a binary target (1 = member of segment 3,
# 0 = everyone else), so predicted members are the rows labelled 1.
# Comparing with 3 never matched and always produced an empty frame.
target_segment = mcdonalds[predictions == 1]

# Printing the selected segment(s)
print(target_segment)


Step 8: Selecting (the) Target Segment(s)

In [None]:
# Calculate the mean visit frequency for each segment
# Average 'VisitFrequency' within each segment of k4.
visit = mcdonalds['VisitFrequency'].groupby(k4).mean()

print(visit)


In [None]:
# Calculate the mean "Like.n" value for each segment
# Average "Like.n" score within each segment of k4.
like = mcdonalds['Like.n'].groupby(k4).mean()

print(like)


In [None]:
# Calculate the proportion of females in each segment
# Share of respondents whose 'Gender' equals 'Female', per segment of k4:
# the mean of a boolean indicator is the proportion of True values.
is_female = mcdonalds['Gender'] == 'Female'
female = is_female.groupby(k4).mean()

print(female)


In [None]:
import matplotlib.pyplot as plt

# Create a scatter plot
# Segment evaluation plot: mean visit frequency (x) against mean liking (y),
# coloured by ten times the share of female respondents.
plt.scatter(visit, like, s=100, c=10*female)

# Fixed axis window.
plt.gca().set(xlim=(2, 4.5), ylim=(-3, 3))

# Number the points 1, 2, ... in the order of the grouped results.
for segment_number, (x, y) in enumerate(zip(visit, like), start=1):
    plt.text(x, y, segment_number)

plt.show()


In [None]:
# Illustrative settings for the four elements of the marketing mix.
product_quality = 8
price_discount = 0.2
promotion_channel = 'online'
distribution_coverage = 'national'

# Each element maps to exactly one recommendation, chosen by a simple rule.

# Product: quality scores above 7 count as already strong.
print(
    "Enhance product features and quality"
    if product_quality > 7
    else "Focus on improving product quality"
)

# Price: discounts above 10 % are treated as worth extending.
print(
    "Offer more attractive discounts to increase sales"
    if price_discount > 0.1
    else "Consider adjusting pricing strategy"
)

# Promotion: online versus any other channel.
print(
    "Allocate more resources to online marketing channels"
    if promotion_channel == 'online'
    else "Explore additional offline marketing opportunities"
)

# Place: national coverage versus anything else.
print(
    "Expand distribution network to reach more regions"
    if distribution_coverage == 'national'
    else "Optimize existing distribution channels"
)

# Further rules for the marketing mix can be added below.


Step 9: Customising the Marketing Mix

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load your data into a pandas DataFrame
# Load your data into a pandas DataFrame
data = pd.read_csv("mcdonalds.csv")

# Features are every column except the target "Like".  Text columns are
# one-hot encoded because LogisticRegression only accepts numeric input;
# numeric columns pass through unchanged.
X = pd.get_dummies(data.drop("Like", axis=1))
y = data["Like"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (e.g., Logistic Regression).  A higher iteration cap gives
# the solver room to converge on unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance.  The default average="binary" raises for a
# target with more than two classes, so the per-class scores are averaged
# weighted by class frequency; zero_division=0 scores classes that were
# never predicted as 0 instead of emitting warnings.
# NOTE(review): assumes "Like" has more than two distinct values -- confirm.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Perform monitoring tasks (e.g., track key performance indicators)
# You can define relevant metrics, thresholds, and tracking mechanisms based on your requirements.

# Add more evaluation and monitoring tasks based on your specific needs
