# Importing libraries and Inspection

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans  

In [2]:
# Importing the dataset
df = pd.read_csv('marketing_campaign.csv', sep='\t')

In [None]:
# Checking the head of the dataset
df.head()

In [None]:
df.info()

# Data Cleansing

In [None]:
# Checking for null values
df.isnull().any()

In [None]:
# Filling the null value with the average value
df.fillna(df.mean(), inplace=True)

In [None]:
# Checking for null values
df.isnull().any()

# Feature Engineering

In [None]:
# Subtract the year 2014 with the year_birth column to get the age of the customer
df['Age'] = df['Year_Birth'].apply(lambda x: 2014-x)

In [None]:
# Grouping the age
def group_age(data):
    if data > 9 and data <= 24:
        return 'Gen Z / Zoomers (10-24)'
    elif data > 24 and data <= 40:
        return 'Gen Y / Millenials (25-40)'
    elif data > 40 and data <= 56:
        return 'Gen X (40-56)'
    elif data > 56:
        return 'Baby Boomers (57 and above)'
    return 'Unspecified'

df['Generation'] = df['Age'].apply(group_age)

In [None]:
# Sum up all the products bought
df['Amount_Total'] = df['MntWines'] + df['MntFruits'] + df['MntMeatProducts'] + df['MntSweetProducts'] + df['MntGoldProds']

In [None]:
# Sum up all the purchases made
df['Total_Purchases'] = df['NumWebPurchases'] + df['NumCatalogPurchases'] + df['NumStorePurchases'] + df['NumDealsPurchases']

In [None]:
# Sum up all the children the customer have
df['Children'] = df['Kidhome'] + df['Teenhome']

In [None]:
# Checking the marital status
df['Marital_Status'].unique()

In [None]:
# Transformed the marital status to either 'In Relationship' or 'Single'
df['Marital_Status'] = df['Marital_Status'].apply(lambda x: 'In Relationship' if x == 'Married' or x == 'Together' else 'Single')

In [None]:
# Sum up all the family members the customers have
df['Family_Size'] =  df.apply(lambda x: x['Children'] + 2 if x['Marital_Status'] == 'In Relationship' else x['Children'] + 1, axis=1)

In [None]:
# Sum up all the marketing campaigns the customer accept
df['Total_AcceptedCmp'] = df['AcceptedCmp1'] + df['AcceptedCmp2'] + df['AcceptedCmp3'] + df['AcceptedCmp4'] + df['AcceptedCmp5'] + df['Response']

In [None]:
# Convert the Dt_Customer column type to datetime
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])
print(df['Dt_Customer'].min())
print(df['Dt_Customer'].max())

In [None]:
# It looks like the latest customer enrollment in the company was December 6, 2014 so let's assume the last time the data was
# updated was December 7, 2014
df['Date_Collected'] = pd.to_datetime('2014-12-07')

In [None]:
# Subtract the date collected and the date of customer's enrollment with the company
df['Customer_Enrolled'] = (df['Date_Collected'] - df['Dt_Customer']).dt.days

In [None]:
# Subtract Customer_Enrolled with recency to get how many days the customer was active
df['Days_Active'] = df['Customer_Enrolled'] - df['Recency']

In [None]:
# Drop useless columns
df.drop(columns=['ID', 'Year_Birth','Kidhome', 'Teenhome', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue', 'Date_Collected', 'Customer_Enrolled'], axis=1, inplace=True)

In [None]:
# Checking the head of the data after feature engineering
df.head()

In [None]:
# Checking for outliers
df_dist = df[['Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 
       'Age', 'Amount_Total', 'Total_Purchases', 'Children', 'Family_Size', 'Total_AcceptedCmp', 'Days_Active']]

for i in df_dist.columns:
    sns.displot(x=i, bins=30, data=df)

# Removing Outliers

In [None]:
df['Income'] = df['Income'].apply(lambda x: x if x < 120000 else np.nan)

In [None]:
df['MntMeatProducts'] = df['MntMeatProducts'].apply(lambda x: x if x < 1100 else np.nan)

In [None]:
df['NumWebPurchases'] = df['NumWebPurchases'].apply(lambda x: x if x < 15 else np.nan) 

In [None]:
df['NumCatalogPurchases'] = df['NumCatalogPurchases'].apply(lambda x: x if x < 15 else np.nan)

In [None]:
df['Total_Purchases'] = df['Total_Purchases'].apply(lambda x: x if x < 40 else np.nan) 

In [None]:
df['Age'] = df['Age'].apply(lambda x: x if x < 80 else np.nan) 

In [None]:
# Checking for null values
df.isnull().any()

In [None]:
# Checking the shape
df.shape

In [None]:
# Drop null values
df.dropna(inplace=True)

In [None]:
# Checking for null values
df.isnull().any()

In [None]:
# Checking the shape
df.shape

# Data Visualization

Education

In [None]:
sns.countplot(x='Education', data=df)
plt.title('Education level of each customers')

Marital Status

In [None]:
sns.countplot(x='Marital_Status', data=df)
plt.title('Marital Status of each customers')

Place

In [None]:
each_place = ['NumWebPurchases','NumCatalogPurchases', 'NumStorePurchases']
total_place = []
[total_place.append(len(df[df[place] == 1])) for place in each_place]

plt.figure(figsize=(15,5))
sns.barplot(x=each_place, y=total_place)
plt.title('Total purchases of each place')

Products

In [None]:
each_prods = ['MntWines','MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts','MntGoldProds']
total_prods = []
[total_prods.append(len(df[df[prod] == 1])) for prod in each_prods]

plt.figure(figsize=(15,5))
sns.barplot(x=each_prods, y=total_prods)
plt.title('Total amount of each products')

Accepted Campaigns

In [None]:
each_cmp = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response']
total_cmp = []
[total_cmp.append(len(df[df[cmp] == 1])) for cmp in each_cmp]

plt.figure(figsize=(15,5))
sns.barplot(x=each_cmp, y=total_cmp)
plt.title('Total of Accepted Campaign offers on each Campaigns')

Complains

In [None]:
sns.countplot(x='Complain', data=df)
plt.title('Total Complains')

In [None]:
colors = sns.color_palette('pastel')[0:5]
Generation = df['Generation'].value_counts()
plt.figure(figsize=(10,5))
plt.pie(Generation, labels = Generation.index, colors = colors, autopct='%1.2f%%')
plt.title('Generation')

In [None]:
# Data Distribution
df_dist = df[['Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 
       'Age', 'Amount_Total', 'Total_Purchases', 'Children', 'Family_Size', 'Total_AcceptedCmp', 'Days_Active']]

for i in df_dist.columns:
    sns.displot(x=i, bins=30, data=df)


Income based on Education

In [None]:
sns.barplot(x='Education', y='Income', data=df)
plt.title('Income based on Education')

Income based on Marital Status

In [None]:
sns.barplot(x='Marital_Status', y='Income', data=df)
plt.title('Income based on Marital Status')

Income based on how many Children

In [None]:
sns.barplot(x='Children', y='Income', data=df)
plt.title('Income based on how many Children')

Correlation between Total Purchases and Income

In [None]:
sns.scatterplot(x='Total_Purchases', y='Income', data=df)
plt.title('Correlation between Total Purchases and Income')

Correlation between Total Amount of products bought and Income

In [None]:
sns.scatterplot(x='Amount_Total', y='Income', data=df)
plt.title('Correlation between Total Amount of products bought and Income')

Correlation of each columns

In [None]:
sns.heatmap(data=df_dist.corr(), cmap='coolwarm')
plt.title('Correlation of each columns')

# Data Modelling using KMeans

In [None]:
df['Education'] = df['Education'].apply(lambda x: 0 if x == 'Basic' else 1)

In [None]:
df['Marital_Status'] = df['Marital_Status'].apply(lambda x: 1 if x == 'In Relationship' else 0)

In [None]:
model_features = df[['Education', 'Marital_Status', 'Income', 'Age', 'Amount_Total','Total_Purchases', 
                     'Children', 'Total_AcceptedCmp', 'Days_Active']]

In [None]:
scaler = StandardScaler()

In [None]:
scaled_features = scaler.fit_transform(model_features)

In [None]:
df_feat = pd.DataFrame(scaled_features,columns=model_features.columns)
df_feat.head()

In [None]:
X = scaled_features

In [None]:
#Elbow Method - Inertia plot
inertia = []
#looping the inertia calculation for each k
for k in range(1, 10):
    #Assign KMeans as cluster_model
    cluster_model = KMeans(n_clusters = k, random_state = 101)
    #Fit cluster_model to X
    cluster_model.fit(X)
    #Get the inertia value
    inertia_value = cluster_model.inertia_
    #Append the inertia_value to inertia list
    inertia.append(inertia_value)

In [None]:
##Inertia plot
plt.plot(range(1, 10), inertia)
plt.title('The Elbow Method - Inertia plot', fontsize = 15)
plt.xlabel('No. of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
kmeans = KMeans(n_clusters = 4, random_state = 101)
kmeans.fit(X)

In [None]:
kmeans.cluster_centers_

In [None]:
model_features['Cluster'] = kmeans.labels_ + 1

In [None]:
model_features.head()

In [None]:
colors = sns.color_palette('pastel')[0:5]
Cluster = model_features['Cluster'].value_counts()
plt.figure(figsize=(10,5))
plt.pie(Cluster, labels = Cluster.index, colors = colors, autopct='%.0f%%')
plt.title('Clusters')

In [None]:
for i in model_features:
    diag = sns.FacetGrid(model_features, col = 'Cluster', hue = 'Cluster', palette = "Set1", height=4)
    diag.map(plt.hist, i, bins=10, ec="k") 
    diag.set_xticklabels(rotation=30, color = 'black')

# From this prediction, there are four types of customers

# 1. 48% of customers (Majority)
- All of them are Postgraduates
- Higher number of people who are in relationship than single people</font><br>
- Have an approximately 20k-50k of income
- Around 25 to 60 years old with the maximum around 35 to 45 years old
- Bought approximately 50-400 products
- Made a purchase approximately 5-20 times
- Higher number of children than no child
- Most of them reject the marketing campaign
- Highest number of customers who have loyalty with the company for a long period of time

# 2. 38% of customers
- Slightly lower number of Postgraduates than 48% of people
- It's the same circumstances as 48% of people but slightly lower number of people
- Have an approximately 40k-80k of income
- Around 30 to 70 years old with the maximum around 45 to 60 years old
- Bought approximately 500-1700 products
- Made a purchase approximately 15-30 times
- Slightly higher number of children than no child
- Most of them reject the marketing campaign
- Slightly lower number of loyal customers compared to 48% of customers

# 3. 12% of customers
- Significantly lower Postgraduates than the 38% of people
- The number of people who are in relationship are slightly higher than single people
- Have an approximately 70k-90k of income
- Around 20 to 70 years old with the maximum around 30 to 60 years old
- Bought approximately 800-2000 products
- Made a purchase approximately 15-25 times
- Higher number of no child than have children
- None of them reject the marketing campaign
- Significantly lower number of loyal customers compared to 38% of customers

# 4. 2% of customers (Minority)
- All of them are Undergraduates
- It's the same circumstances as 12% of customers but with lower number
- Have an approximately 15k-30k of income
- Around 20 to 65 years old with the maximum around 35 to 40 years old
- Bought approximately 20-100 products
- Made a purchase approximately 5-10 times
- Slightly higher number of children than no child
- Most of them reject the marketing campaign
- Least number of loyal customers