<a href="https://colab.research.google.com/github/ci-cd-binu/skills-introduction-to-github/blob/main/Feature_Engineering_UnsubModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Features to engineer for Unsub Model** from datsets around Email engagement data (opens, clicks, forwards, etc.) | 	Customer demographics (age, gender, location, etc.)| AND Customer purchase history :

1.   Recency of engagement (how recently the customer has opened or clicked on an email)
2.  Frequency of engagement (how often the customer opens or clicks on emails)
1.  Type of engagement (what types of emails the customer typically opens or clicks on)
2.   Customer lifetime value (CLV)






In [5]:
import pandas as pd
import numpy as np
from datetime import datetime

def generate_features(email_engagement_data, customer_demographics, purchase_history):
  """Generates features from synthetic datasets."""

  # Recency of engagement
  email_engagement_data["Recency of Engagement"] = (
      datetime.today() - pd.to_datetime(email_engagement_data["Date and Time Sent"], format="%Y-%m-%d %H:%M:%S")
  ).dt.days

  # Frequency of engagement
  email_engagement_data["Frequency of Engagement"] = email_engagement_data.groupby(
    "Email Address"
  )["Open Status"].sum()

  # Type of engagement
  email_engagement_data["Type of Engagement"] = np.where(
    email_engagement_data["Click Status"] == True, "Clicked", "Opened"
  )

  # Customer lifetime value (CLV)
  customer_clv = purchase_history.groupby("Email Address")["Price"].sum()
  email_engagement_data = email_engagement_data.merge(
    customer_clv.to_frame(name="CLV"), on="Email Address"
  )

  return email_engagement_data

email_engagement_data = pd.read_csv(r"/content/drive/MyDrive/datasets/email_engagement_data.csv")
customer_demographics = pd.read_csv(r"/content/drive/MyDrive/datasets/customer_demographics.csv")
purchase_history = pd.read_csv(r"/content/drive/MyDrive/datasets/purchase_history.csv")

featured_data = generate_features(email_engagement_data, customer_demographics, purchase_history)

#print(featured_data.to_string())
print(featured_data.head(5).to_string())


       Email Address          Date and Time Sent    Subject Line  Open Status  Click Status  Forward Status  Recency of Engagement  Frequency of Engagement Type of Engagement   CLV
0  user0@example.com  2023-10-21 10:26:27.958802  Subject Line 0         True          True           False                      0                      NaN            Clicked  40.0
1  user1@example.com  2023-10-20 10:26:27.958815  Subject Line 1        False         False            True                      1                      NaN             Opened  20.0
2  user2@example.com  2023-10-19 10:26:27.958818  Subject Line 2         True         False           False                      2                      NaN             Opened  40.0
3  user3@example.com  2023-10-18 10:26:27.958820  Subject Line 3         True         False           False                      3                      NaN             Opened  20.0
4  user4@example.com  2023-10-17 10:26:27.958821  Subject Line 4         True         False    

In [8]:
# Calculate CLV based on customer lifetime value models
!pip install lifetimes



In [13]:
from lifetimes import BetaGeoFitter, GammaGammaFitter

In [14]:
def calculate_clv(purchase_history):
    # Calculate basic RFM values
    today = datetime.today()
    rfm = purchase_history.groupby('Email Address').agg({
        'Purchase Date': [lambda date: (today - pd.to_datetime(date).max()).days,
                          lambda date: (today - pd.to_datetime(date).min()).days],
        'Price': ['count', 'sum']
    }).reset_index()
    rfm.columns = ['Email Address', 'Recency', 'T', 'Frequency', 'Monetary_Value']

    # For BG/NBD, the Frequency represents the number of repeat (not total) transactions.
    rfm['Frequency'] = rfm['Frequency'] - 1

    # Fit the BG/NBD model
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(rfm['Recency'], rfm['Frequency'])

    # Predict the customer's future transaction in next 10 days for example
    rfm["predicted_purchases"] = bgf.predict(10, rfm['Recency'], rfm['Frequency'])

    # For monetary value, we use only those customers who had at least one repeat purchase with us.
    returning_customers_summary = rfm[rfm['Frequency'] > 1]

    # Fit the Gamma-Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0.0)
    ggf.fit(returning_customers_summary['Frequency'],
            returning_customers_summary['Monetary_Value'])

    # Predict the customer's average transaction value in the next period
    rfm["predicted_monetary_value"] = ggf.predict(returning_customers_summary['Frequency'],
                                                  returning_customers_summary['Monetary_Value'])

    # Now, combine predicted_purchases and predicted_monetary_value to compute CLV
    rfm["CLV"] = rfm["predicted_purchases"] * rfm["predicted_monetary_value"]

    return rfm[["CLV"]]


# Calculate CLV
clv_data = calculate_clv(purchase_history)

#print(featured_data.to_string())
print(clv_data.head(5).to_string())


KeyError: ignored

In [11]:
import pandas as pd
import numpy as np
from datetime import datetime
from lifetimes import BetaGeoNBD

def generate_features(email_engagement_data, customer_demographics, purchase_history):
  """Generates features from synthetic datasets."""

  # Recency of engagement
  email_engagement_data["Recency of Engagement"] = (
      datetime.today() - pd.to_datetime(email_engagement_data["Date and Time Sent"], format="%Y-%m-%d %H:%M:%S")
  ).dt.days

  # Frequency of engagement
  email_engagement_data["Frequency of Engagement"] = email_engagement_data.groupby(
    "Email Address"
  )["Open Status"].sum()

  # Type of engagement
  email_engagement_data["Type of Engagement"] = np.where(
    email_engagement_data["Click Status"] == True, "Clicked", "Opened"
  )

  # Customer lifetime value (CLV)
  customer_clv = purchase_history.groupby("Email Address")["Price"].sum()
  email_engagement_data = email_engagement_data.merge(
    customer_clv.to_frame(name="CLV"), on="Email Address"
  )

  # Email engagement data

  # Number of emails sent to customer
  email_engagement_data["Number of Emails Sent"] = email_engagement_data.groupby("Email Address")["Date and Time Sent"].transform('size')

  # Average open rate
  email_engagement_data["Average Open Rate"] = email_engagement_data.groupby("Email Address")["Open Status"].mean()

  # Average click rate
  email_engagement_data["Average Click Rate"] = email_engagement_data.groupby("Email Address")["Click Status"].mean()

  # Customer demographics

  # Age group
  email_engagement_data["Age Group"] = pd.cut(customer_demographics["Age Group"], bins=[0, 18, 25, 35, 45, 55, 65, np.inf])

  # Gender
  email_engagement_data["Gender"] = customer_demographics["Gender"]

  # Location
  email_engagement_data["Location"] = customer_demographics["Location"]

  # Purchase history

  # Average order value
  email_engagement_data["Average Order Value"] = purchase_history.groupby("Email Address")["Price"].mean()

  # Number of orders placed
  email_engagement_data["Number of Orders Placed"] = purchase_history.groupby("Email Address")["Purchase Date"].transform('size')

  # Last purchase date
  email_engagement_data["Last Purchase Date"] = purchase_history.groupby("Email Address")["Purchase Date"].transform('max')

  # RFM score
  email_engagement_data = calculate_rfm_score(email_engagement_data)

  # CLV based on BG/NBD model
  email_engagement_data["CLV_BG/NBD"] = calculate_clv_bg_nbd(purchase_history)

  # CLV based on customer engagement
  email_engagement_data["CLV_Engagement"] = calculate_clv_engagement(email_engagement_data, purchase_history)

  return email_engagement_data

def calculate_rfm_score(featured_data):
  """Calculates RFM scores for customers."""

  # Recency
  featured_data["Recency Score"] = pd.qcut(featured_data["Recency of Engagement"], 5, labels=[5, 4, 3, 2, 1])

  # Frequency
  featured_data["Frequency Score"] = pd.qcut(featured_data["Frequency of Engagement"], 5, labels=[5, 4, 3, 2, 1])

  # Monetary value
  featured_data["Monetary Value Score"] = pd.qcut(featured_data["CLV"], 5, labels=[5, 4, 3, 2, 1])

  # RFM score
  featured_data["RFM Score"] = featured_data["Recency Score"] + featured_data["Frequency Score"] + featured_data["Monetary Value Score"]

  return featured_data

def calculate_clv_bg_nbd(purchase_history):
  """Calculates CLV based on the BG/NBD model."""

  bg_nbd = BetaGeoNBD(purchase_history)
  bg_nbd.fit()

  clv = bg_nb


ImportError: ignored

In [12]:
import pandas as pd
import numpy as np
from datetime import datetime
import lifetimes
from lifetimes import BetaGeoNBD

def generate_features(email_engagement_data, customer_demographics, purchase_history):
  """Generates features from synthetic datasets."""

  # Recency of engagement
  email_engagement_data["Recency of Engagement"] = (
      datetime.today() - pd.to_datetime(email_engagement_data["Date and Time Sent"], format="%Y-%m-%d %H:%M:%S")
  ).dt.days

  # Frequency of engagement
  email_engagement_data["Frequency of Engagement"] = email_engagement_data.groupby(
    "Email Address"
  )["Open Status"].sum()

  # Type of engagement
  email_engagement_data["Type of Engagement"] = np.where(
    email_engagement_data["Click Status"] == True, "Clicked", "Opened"
  )

  # Customer lifetime value (CLV)
  customer_clv = purchase_history.groupby("Email Address")["Price"].sum()
  email_engagement_data = email_engagement_data.merge(
    customer_clv.to_frame(name="CLV"), on="Email Address"
  )

  # Email engagement data

  # Number of emails sent to customer
  email_engagement_data["Number of Emails Sent"] = email_engagement_data.groupby("Email Address")["Date and Time Sent"].transform('size')

  # Average open rate
  email_engagement_data["Average Open Rate"] = email_engagement_data.groupby("Email Address")["Open Status"].mean()

  # Average click rate
  email_engagement_data["Average Click Rate"] = email_engagement_data.groupby("Email Address")["Click Status"].mean()

  # Customer demographics

  # Age group
  email_engagement_data["Age Group"] = pd.cut(customer_demographics["Age Group"], bins=[0, 18, 25, 35, 45, 55, 65, np.inf])

  # Gender
  email_engagement_data["Gender"] = customer_demographics["Gender"]

  # Location
  email_engagement_data["Location"] = customer_demographics["Location"]

  # Purchase history

  # Average order value
  email_engagement_data["Average Order Value"] = purchase_history.groupby("Email Address")["Price"].mean()

  # Number of orders placed
  email_engagement_data["Number of Orders Placed"] = purchase_history.groupby("Email Address")["Purchase Date"].transform('size')

  # Last purchase date
  email_engagement_data["Last Purchase Date"] = purchase_history.groupby("Email Address")["Purchase Date"].transform('max')

  # RFM score
  email_engagement_data = calculate_rfm_score(email_engagement_data)

  # CLV based on BG/NBD model
  email_engagement_data["CLV_BG/NBD"] = calculate_clv_bg_nbd(purchase_history)

  # CLV based on customer engagement
  email_engagement_data["CLV_Engagement"] = calculate_clv_engagement(email_engagement_data, purchase_history)

  return email_engagement_data

def calculate_rfm_score(featured_data):
  """Calculates RFM scores for customers."""

  # Recency
  featured_data["Recency Score"] = pd.qcut(featured_data["Recency of Engagement"], 5, labels=[5, 4, 3, 2, 1])

  # Frequency
  featured_data["Frequency Score"] = pd.qcut(featured_data["Frequency of Engagement"], 5, labels=[5, 4, 3, 2, 1])

  # Monetary value
  featured_data["Monetary Value Score"] = pd.qcut(featured_data["CLV"], 5, labels=[5, 4, 3, 2, 1])

  # RFM score
  featured_data["RFM Score"] = featured_data["Recency Score"] + featured_data["Frequency Score"] + featured_data["Monetary Value Score"]

  return featured_data

def calculate_clv_bg_nbd(purchase_history):
  """Calculates CLV based on the BG/NBD model."""

  bg_nbd = BetaGeoNBD(purchase_history)
  bg_nbd.fit()

  clv =


SyntaxError: ignored

GPT-4


In [15]:
import pandas as pd
from lifetimes import BetaGeoFitter, GammaGammaFitter
from datetime import datetime

def generate_features(email_engagement_data, customer_demographics, purchase_history):
    """Generates features for predicting email unsubscriptions."""

    # Recency of engagement
    email_engagement_data["Recency of Engagement"] = (
        datetime.today() - pd.to_datetime(email_engagement_data["Date and Time Sent"], format="%Y-%m-%d %H:%M:%S")
    ).dt.days

    # Frequency of engagement
    email_engagement_data["Frequency of Engagement"] = email_engagement_data.groupby(
        "Email Address"
    )["Open Status"].transform('sum')

    # Type of engagement
    email_engagement_data["Type of Engagement"] = np.where(
        email_engagement_data["Click Status"] == True, "Clicked", "Opened"
    )

    # Merging datasets
    data = email_engagement_data.merge(
        customer_demographics, on="Email Address", how="left"
    ).merge(
        purchase_history.groupby("Email Address").agg(Total_Spent=pd.NamedAgg(column="Price", aggfunc="sum"),
                                                     Purchase_Count=pd.NamedAgg(column="Purchase Date", aggfunc="size"),
                                                     Avg_Purchase_Value=pd.NamedAgg(column="Price", aggfunc="mean")),
        on="Email Address", how="left"
    )

    # Customer lifetime value (CLV)
    data["CLV"] = data["Total_Spent"]

    # Add additional CLV based on BG/NBD model
    rfm = purchase_history.groupby('Email Address').agg({
        'Purchase Date': [lambda date: (datetime.today() - pd.to_datetime(date).max()).days],
        'Price': ['count', 'sum']
    }).reset_index()
    rfm.columns = ['Email Address', 'Recency', 'Frequency', 'Monetary_Value']
    rfm['Frequency'] = rfm['Frequency'] - 1
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(rfm['Recency'], rfm['Frequency'])
    predicted_purchases = bgf.predict(365, rfm['Recency'], rfm['Frequency'])
    ggf = GammaGammaFitter(penalizer_coef=0.0)
    ggf.fit(rfm['Frequency'], rfm['Monetary_Value'])
    predicted_monetary_value = ggf.predict(rfm['Frequency'], rfm['Monetary_Value'])
    rfm["CLV_BG/NBD"] = predicted_purchases * predicted_monetary_value
    data = data.merge(rfm[["Email Address", "CLV_BG/NBD"]], on="Email Address", how="left")

    return data

# Example usage
email_engagement_data = pd.read_csv(r"/content/drive/MyDrive/datasets/email_engagement_data.csv")
customer_demographics = pd.read_csv(r"/content/drive/MyDrive/datasets/customer_demographics.csv")
purchase_history = pd.read_csv(r"/content/drive/MyDrive/datasets/purchase_history.csv")
featured_data = generate_features(email_engagement_data, customer_demographics, purchase_history)

print(featured_data.head(5).to_string())


KeyError: ignored

In [19]:
import pandas as pd
import numpy as np
from datetime import datetime
from lifetimes import BetaGeoFitter, GammaGammaFitter

def generate_features(email_engagement_data, customer_demographics, purchase_history):
    """Generates features from synthetic datasets."""

    # Recency of engagement
    email_engagement_data["Recency of Engagement"] = (
        datetime.today() - pd.to_datetime(email_engagement_data["Date and Time Sent"], format="%Y-%m-%d %H:%M:%S")
    ).dt.days

    # Frequency of engagement
    email_engagement_data["Frequency of Engagement"] = email_engagement_data.groupby(
        "Email Address"
    )["Open Status"].transform('sum')

    # Type of engagement
    email_engagement_data["Type of Engagement"] = np.where(
        email_engagement_data["Click Status"] == True, "Clicked", "Opened"
    )

    # Basic Customer lifetime value (CLV)
    customer_clv = purchase_history.groupby("Email Address")["Price"].sum()
    email_engagement_data = email_engagement_data.merge(
        customer_clv.to_frame(name="CLV"), on="Email Address", how="left"
    )

    # Additional CLV based on BG/NBD model
    rfm = purchase_history.groupby('Email Address').agg({
        'Purchase Date': [lambda date: (datetime.today() - pd.to_datetime(date).max()).days],
        'Price': ['count', 'sum']
    }).reset_index()
    rfm.columns = ['Email Address', 'Recency', 'Frequency', 'Monetary_Value']
    rfm['Frequency'] = rfm['Frequency'] - 1

    # Ensure that Recency is 0 for all customers with a Frequency of 0
    rfm.loc[rfm['Frequency'] == 0, 'Recency'] = 0

    # Compute T for each customer
    rfm['T'] = (datetime.today() - pd.to_datetime(purchase_history.groupby('Email Address')['Purchase Date'].min())).dt.days

    #bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf = BetaGeoFitter(penalizer_coef=0.01, maxiter=1000)
    bgf.fit(rfm['Frequency'], rfm['Recency'], rfm['T'])

    predicted_purchases = bgf.predict(365, rfm['Recency'], rfm['Frequency'])

    ggf = GammaGammaFitter(penalizer_coef=0.0)
    ggf.fit(rfm['Frequency'], rfm['Monetary_Value'])
    predicted_monetary_value = ggf.predict(rfm['Frequency'], rfm['Monetary_Value'])

    rfm["CLV_BG/NBD"] = predicted_purchases * predicted_monetary_value
    email_engagement_data = email_engagement_data.merge(rfm[["Email Address", "CLV_BG/NBD"]], on="Email Address", how="left")

    return email_engagement_data

# Reading datasets
email_engagement_data = pd.read_csv(r"/content/drive/MyDrive/datasets/email_engagement_data.csv")
customer_demographics = pd.read_csv(r"/content/drive/MyDrive/datasets/customer_demographics.csv")
purchase_history = pd.read_csv(r"/content/drive/MyDrive/datasets/purchase_history.csv")

# Generating features
featured_data = generate_features(email_engagement_data, customer_demographics, purchase_history)

# Printing the result
print(featured_data.head(5).to_string())


TypeError: ignored