<a href="https://colab.research.google.com/github/chaitraDev/GenAI-for-marketing/blob/main/Cognizant_RFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import datetime
# Set random seed for reproducibility
np.random.seed(42)
isGenerating = True #flag to choose between imported and generated data

# Data Generation

In [None]:
#@title Customers data
def generate_customers(fake,num_customers=100):
  customer_data = {
      "CustomerID": [f"CUST{i:03d}" for i in range(1, num_customers + 1)],
      "CustGender": np.random.choice(["Male", "Female"], num_customers),
      "Age": np.random.randint(18, 70, num_customers),
      "Income": np.random.randint(30000, 150000, num_customers),
      "Loan": np.random.choice(["Yes", "No"], num_customers),
      "Num_Transactions": np.random.randint(1, 20, num_customers),
      # "balance": np.random.randint(1000, 50000, num_customers),
      "Cibil_Score": np.random.randint(300, 900, num_customers),
      # "cust_joining_date": [fake.date_this_decade() for _ in range(num_customers)],
      "Credit_Card": np.random.choice(["Yes", "No"], num_customers),
      "Debit_Card": np.random.choice(["Yes", "No"], num_customers),
  }
  customers_df = pd.DataFrame(customer_data)
  customers_df.sample(frac=1,ignore_index=True)
  return customers_df

In [None]:
#@title Products data
def generate_products(fake,num_products=9):
  products = ["Education Loan", "Vehicle Loan", "Home Loan", "Current Account", "Savings Account",
              "Credit Card", "Debit Card", "Recurring Deposit", "Fixed Deposit"]
  product_data = {
      "p_id": [f"PROD{i:03d}" for i in range(1, num_products + 1)],
      "p_name": [fake.bs() for _ in range(num_products)],
      "p_type": np.random.choice(products, num_products),
      "p_el_age": np.random.randint(18, 70, num_products),
      "p_el_income": np.random.randint(30000, 150000, num_products),
      "p_el_cibil": np.random.randint(300, 900, num_products)
  }
  products_df = pd.DataFrame(product_data)
  products_df.sample(frac=1,ignore_index=True)
  return products_df

In [None]:
#@title Transactions data
def generate_transactions(fake,customers_df,num_transactions=1000):
  transaction_types = ["Loan Amount Disbursement", "Loan Installment", "Credit", "Debit", "FD", "RD"]
  transaction_data = {
      "TransactionID": [],
      "CustomerID": [],
      "TransactionDate": [],
      "CustAccountBalance": [],
      "TransactionAmount (INR)": [],
      "t_type": [],
      "TransactionTime": [],
      "payment_method": []
  }

  trans_count = 1
  num_customers = customers_df.shape[0]
  cust_ids = customers_df["CustomerID"]
  payment_methods = ["Credit Card", "Debit Card","UPI", "Net Banking", "Cash"]
  for index in range(num_customers):
    cust_id = cust_ids[index]
    customer = customers_df[customers_df["CustomerID"] == cust_id].iloc[0]
    cust_trans = customer["Num_Transactions"]

    for _ in range(1, cust_trans + 1):
      t_id = f"TRANS{trans_count:04d}"
      trans_count += 1
      cust_id = cust_ids[index]

      t_date = fake.date_this_decade()
      cust_acc_balance = np.random.randint(1000, 50000)
      t_amt = round(np.random.uniform(10.0, 5000.0),2)
      t_time = fake.time()

    # ----- check for loan add only those transactions ----- #
      if customer["Loan"] == "Yes":
        t_type = np.random.choice(transaction_types)
      else:
        t_type = np.random.choice(transaction_types[2:])

      # ----- check for cards to append in payment method ----- #
      if customer["Credit_Card"] == "Yes" and customer["Debit_Card"] == "Yes" :
          payment_method = np.random.choice(payment_methods)
      elif customer["Credit_Card"] == "Yes":#no debit card
          payment_method = np.random.choice(["Credit Card", "UPI", "Net Banking", "Cash"])
      elif customer["Debit_Card"] == "Yes":#no credit card
          payment_method = np.random.choice(["Debit Card", "UPI", "Net Banking", "Cash"])
      else: #no debit or credit card
          payment_method = np.random.choice(["UPI", "Net Banking", "Cash"])

      transaction_data["TransactionID"].append(t_id)
      transaction_data["CustomerID"].append(cust_id)
      transaction_data["TransactionDate"].append(t_date)
      transaction_data["CustAccountBalance"].append(cust_acc_balance)
      transaction_data["TransactionAmount (INR)"].append(t_amt)
      transaction_data["t_type"].append(t_type)
      transaction_data["TransactionTime"].append(t_time)
      transaction_data["payment_method"].append(payment_method)


  transactions_df = pd.DataFrame(transaction_data)
  #converting date column to datetime.date object
  transactions_df.sample(frac=1,ignore_index=True)
  return transactions_df

In [None]:
def fake_data():
  !pip install faker
  from faker import Faker
  # Initialize Faker
  fake = Faker()

  customers = generate_customers(fake)
  products = generate_products(fake)
  transactions = generate_transactions(fake,customers)

  transactions["TransactionDate"] = pd.to_datetime(transactions.TransactionDate).dt.date

  customers.to_csv("customers.csv", index=False)
  products.to_csv("products.csv", index=False)
  transactions.to_csv("transactions.csv", index=False)

  return customers,transactions,products

# Data Import

In [None]:
def import_data():
  customers = pd.read_csv("/content/drive/MyDrive/Cognizant/customers.csv")
  transactions = pd.read_csv("/content/drive/MyDrive/Cognizant/transactions.csv")
  transactions["TransactionDate"] = pd.to_datetime(transactions.TransactionDate).dt.date
  return customers, transactions

# EDA

In [None]:
if not isGenerating:
  # to take the imported data
  customers,transactions = import_data()
else:
  # to take the generated data
  customers,transactions,products = fake_data()



In [None]:
customers.head()

Unnamed: 0,CustomerID,CustGender,Age,Income,Loan,Num_Transactions,Cibil_Score,Credit_Card,Debit_Card
0,CUST001,Male,35,111734,Yes,9,405,Yes,No
1,CUST002,Female,43,105450,Yes,4,419,Yes,No
2,CUST003,Male,61,123426,No,1,357,Yes,No
3,CUST004,Male,51,147845,No,4,645,No,No
4,CUST005,Male,27,52299,Yes,1,773,Yes,No


In [None]:
transactions.head()

Unnamed: 0,TransactionID,CustomerID,TransactionDate,CustAccountBalance,TransactionAmount (INR),t_type,TransactionTime,payment_method
0,TRANS0001,CUST001,2022-11-10,30721,4342.82,Loan Amount Disbursement,13:51:08,Net Banking
1,TRANS0002,CUST001,2020-08-04,23195,1054.65,RD,01:12:12,UPI
2,TRANS0003,CUST001,2020-12-29,29699,1850.88,Loan Installment,01:32:52,Credit Card
3,TRANS0004,CUST001,2024-06-20,40577,193.05,Credit,23:38:03,Net Banking
4,TRANS0005,CUST001,2020-11-11,6122,4477.08,RD,01:00:15,Net Banking


In [None]:
# products.head()

In [None]:
customers.isnull().sum()

Unnamed: 0,0
CustomerID,0
CustGender,0
Age,0
Income,0
Loan,0
Num_Transactions,0
Cibil_Score,0
Credit_Card,0
Debit_Card,0


In [None]:
transactions["CustomerID"].value_counts()

Unnamed: 0_level_0,count
CustomerID,Unnamed: 1_level_1
CUST028,19
CUST075,19
CUST020,19
CUST015,19
CUST079,19
...,...
CUST069,1
CUST005,1
CUST057,1
CUST025,1


# Quantifying Customer Relation

## Preparing the RFM columns

In [None]:
transactions.dtypes

Unnamed: 0,0
TransactionID,object
CustomerID,object
TransactionDate,object
CustAccountBalance,int64
TransactionAmount (INR),float64
t_type,object
TransactionTime,object
payment_method,object


In [None]:
recents = transactions.groupby('CustomerID')["TransactionDate"].max().reset_index()
# left join customers table to transactions table
customers = customers.merge(recents,on="CustomerID",how="left")
customers.rename(columns = {"TransactionDate":"most_recent"}, inplace=True)
customers.columns

Index(['CustomerID', 'CustGender', 'Age', 'Income', 'Loan', 'Num_Transactions',
       'Cibil_Score', 'Credit_Card', 'Debit_Card', 'most_recent'],
      dtype='object')

In [None]:
today = datetime.datetime.today().date()
customers["Gap"] = transactions["TransactionDate"].apply(lambda date: (today-date).days)
customers["Num_Transactions"] = transactions.groupby("CustomerID")["TransactionID"].count().values
customers["Sum_Transactions"] = transactions.groupby("CustomerID")["TransactionAmount (INR)"].sum().values

In [None]:
customers.columns

Index(['CustomerID', 'CustGender', 'Age', 'Income', 'Loan', 'Num_Transactions',
       'Cibil_Score', 'Credit_Card', 'Debit_Card', 'most_recent', 'Gap',
       'Sum_Transactions'],
      dtype='object')

## Scoring RFM columns between 1-5, based on quartiles

In [None]:
customers["Recency"] = pd.qcut(customers["Gap"],q=5,labels=[5,4,3,2,1]).astype(int)
customers["Frequency"] = customers["Num_Transactions"]
customers["Monetary_Value"] = pd.qcut(customers["Sum_Transactions"],q=5,labels=[1,2,3,4,5]).astype(int)

In [None]:
customers["Recency"],customers["Frequency"],customers["Monetary_Value"]

(0     3
 1     1
 2     2
 3     5
 4     2
      ..
 95    2
 96    4
 97    2
 98    2
 99    2
 Name: Recency, Length: 100, dtype: int64,
 0      9
 1      4
 2      1
 3      4
 4      1
       ..
 95     3
 96     8
 97    16
 98    13
 99    18
 Name: Frequency, Length: 100, dtype: int64,
 0     2
 1     2
 2     1
 3     2
 4     1
      ..
 95    1
 96    3
 97    5
 98    3
 99    5
 Name: Monetary_Value, Length: 100, dtype: int64)

In [None]:
customers["RFM_Score"] = customers["Recency"]*100 + customers["Frequency"]*10 + customers["Monetary_Value"]

In [None]:
customers.head()

Unnamed: 0,CustomerID,CustGender,Age,Income,Loan,Num_Transactions,Cibil_Score,Credit_Card,Debit_Card,most_recent,Gap,Sum_Transactions,Recency,Frequency,Monetary_Value,RFM_Score
0,CUST001,Male,35,111734,Yes,9,405,Yes,No,2024-06-20,641,16123.03,3,9,2,392
1,CUST002,Female,43,105450,Yes,4,419,Yes,No,2024-04-01,1469,12890.6,1,4,2,142
2,CUST003,Male,61,123426,No,1,357,Yes,No,2020-05-27,1322,1900.08,2,1,1,211
3,CUST004,Male,51,147845,No,4,645,No,No,2024-07-26,53,8160.48,5,4,2,542
4,CUST005,Male,27,52299,Yes,1,773,Yes,No,2020-12-20,1370,875.13,2,1,1,211


# RFM Scoring

In [None]:
# the scores range from 111(lowest) to 565(highest), so we can divide in 3 parts
# as 111-259(low), 260-408(med), 409-565(high)

# customers["Relationship_Value"] = pd.cut(customers["RFM_Score"],bins=[111,260,409,float('inf')],labels=["low","med","high"])
customers["Relationship_Value"] = pd.cut(customers["RFM_Score"], bins=[110, 259, 408, 566], labels=["low", "med", "high"])
customers["Relationship_Value"].head()

Unnamed: 0,Relationship_Value
0,med
1,low
2,low
3,high
4,low


In [None]:
customers.head()

Unnamed: 0,CustomerID,CustGender,Age,Income,Loan,Num_Transactions,Cibil_Score,Credit_Card,Debit_Card,most_recent,Gap,Sum_Transactions,Recency,Frequency,Monetary_Value,RFM_Score,Relationship_Value
0,CUST001,Male,35,111734,Yes,9,405,Yes,No,2024-06-20,641,16123.03,3,9,2,392,med
1,CUST002,Female,43,105450,Yes,4,419,Yes,No,2024-04-01,1469,12890.6,1,4,2,142,low
2,CUST003,Male,61,123426,No,1,357,Yes,No,2020-05-27,1322,1900.08,2,1,1,211,low
3,CUST004,Male,51,147845,No,4,645,No,No,2024-07-26,53,8160.48,5,4,2,542,high
4,CUST005,Male,27,52299,Yes,1,773,Yes,No,2020-12-20,1370,875.13,2,1,1,211,low


In [None]:
customers["Relationship_Value"].value_counts()

Unnamed: 0_level_0,count
Relationship_Value,Unnamed: 1_level_1
high,34
med,29
low,23


In [None]:
# Getting the segmentwise customer ID
lows = customers[customers.Relationship_Value=="low"]["CustomerID"]
meds = customers[customers.Relationship_Value=="med"]["CustomerID"]
high = customers[customers.Relationship_Value=="high"]["CustomerID"]
cust_values = {"low":lows,"medium":meds,"high":high}

In [None]:
cust_values

{'low': 1     CUST002
 2     CUST003
 4     CUST005
 13    CUST014
 16    CUST017
 20    CUST021
 22    CUST023
 24    CUST025
 29    CUST030
 35    CUST036
 40    CUST041
 42    CUST043
 46    CUST047
 47    CUST048
 48    CUST049
 53    CUST054
 57    CUST058
 60    CUST061
 62    CUST063
 65    CUST066
 68    CUST069
 69    CUST070
 95    CUST096
 Name: CustomerID, dtype: object,
 'medium': 0     CUST001
 8     CUST009
 9     CUST010
 10    CUST011
 12    CUST013
 18    CUST019
 21    CUST022
 25    CUST026
 26    CUST027
 27    CUST028
 44    CUST045
 49    CUST050
 52    CUST053
 55    CUST056
 56    CUST057
 59    CUST060
 63    CUST064
 66    CUST067
 67    CUST068
 78    CUST079
 83    CUST084
 85    CUST086
 89    CUST090
 91    CUST092
 92    CUST093
 93    CUST094
 97    CUST098
 98    CUST099
 99    CUST100
 Name: CustomerID, dtype: object,
 'high': 3     CUST004
 5     CUST006
 6     CUST007
 7     CUST008
 11    CUST012
 14    CUST015
 15    CUST016
 17    CUST018
 28    

# Storing

In [None]:
# to save original
if not isGenerating:
  customers.to_csv("cust_segmented.csv",index=False)
else:
  extra_cols = ["Loan","Credit_Card","Debit_Card"]
  customers.drop(columns=extra_cols,inplace=True)
  customers.to_csv("fake_cust.csv",index=False)