In [9]:
import pandas as pd
import numpy as np

# Load the intermediate transaction and offer tables.
# Both paths use forward slashes so the notebook resolves them on any OS; the
# earlier backslash-separated raw string for the transactions file only worked
# on Windows and was inconsistent with the offers path.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that were produced by a trusted pipeline.
transactions = pd.read_pickle(r'data/03_int/int_transactions.pkl')
offers = pd.read_pickle(r'data/03_int/int_transcript_offers.pkl')

## Aggregate Offers
# One row per customer_id with three counts taken from the offers table.
# 'count' only tallies non-null values, so the viewed/completed figures count
# the offer rows where 'offer_viewed' / 'offer_completed' are populated.
offers_per_cust = (
    offers
    .groupby('customer_id')
    .agg(
        num_offers_received=pd.NamedAgg(column='offer_id', aggfunc='count'),
        num_offers_viewed=pd.NamedAgg(column='offer_viewed', aggfunc='count'),
        num_offers_completed=pd.NamedAgg(column='offer_completed', aggfunc='count'),
    )
    .reset_index()
    # Customers who received the most offers come first.
    .sort_values(by='num_offers_received', ascending=False)
)

## Aggregate Transactions
# One row per customer_id: how many transactions they made (non-null amounts)
# and the total amount spent, ordered from most to fewest transactions.
transactions_per_cust = (
    transactions
    .groupby('customer_id')
    .agg(
        total_transactions=pd.NamedAgg(column='transaction_amount', aggfunc='count'),
        total_transaction_amount=pd.NamedAgg(column='transaction_amount', aggfunc='sum'),
    )
    .reset_index()
    .sort_values(by='total_transactions', ascending=False)
)

### Match Transactions to Offers
# Pair every transaction of a customer with every offer row of that same
# customer (inner join on 'customer_id'), ordered by transaction time and then
# by offer receipt time.
# NOTE(review): this is a many-to-many join, so the intermediate frame can be
# much larger than either input — confirm it fits in memory on the full data.
merged = (
    transactions
    .merge(offers, on='customer_id')
    .sort_values(['transaction', 'offer_received'])
)

# Keep only the pairs where the transaction falls inside the window
# [offer_received, offer_completed], both ends inclusive. Comparisons against
# a missing 'offer_completed' evaluate to False, so those pairs are dropped.
on_or_after_receipt = merged['offer_received'] <= merged['transaction']
on_or_before_completion = merged['transaction'] <= merged['offer_completed']
filtered_transactions = merged.loc[on_or_after_receipt & on_or_before_completion]

# Summarise the in-window transactions per (customer, offer): count, total
# amount, and the raw lists of transaction times and amounts. The result is
# ordered from the most to the fewest in-window transactions.
# NOTE(review): if a customer can receive the same offer_id more than once,
# all of those receipts are pooled into a single group here — verify.
aggregated_transactions = (
    filtered_transactions
    .groupby(['customer_id', 'offer_id'])
    .agg(
        total_transactions=('transaction_amount', 'count'),
        total_transaction_amount=('transaction_amount', 'sum'),
        transaction_list=('transaction', list),
        amount_list=('transaction_amount', list),
    )
    .reset_index()
    .sort_values(by='total_transactions', ascending=False)
)

### We want to know how much of the total transaction amount was due to the offer ###
# Step 1: roll the per-(customer, offer) aggregates up to one row per
# customer: total in-window amount and total in-window transaction count.
total_offer_transactions = (
    aggregated_transactions
    .groupby('customer_id')
    .agg(
        total_offer_transaction_amount=pd.NamedAgg(column='total_transaction_amount', aggfunc='sum'),
        total_offer_transactions=pd.NamedAgg(column='total_transactions', aggfunc='sum'),
    )
    .reset_index()
)

# Step 2: put the offer counts, the in-window totals and the overall totals
# side by side. Both joins are inner joins on 'customer_id', so a customer
# missing from any of the three tables does not appear in the result.
transaction_ratio = (
    offers_per_cust
    .merge(
        total_offer_transactions.merge(transactions_per_cust, on='customer_id'),
        on='customer_id',
    )
    .sort_values(by='total_transactions', ascending=False)
)
transaction_ratio.head()

Unnamed: 0,customer_id,num_offers_received,num_offers_viewed,num_offers_completed,total_offer_transaction_amount,total_offer_transactions,total_transactions,total_transaction_amount
1194,79d9d4f86aca4bed9290350fb43817c2,5,3,4,110.66,15,36,173.41
3365,8dbfa485249f409aa223a2130f40634a,5,5,3,44.76,22,36,76.46
114,94de646f7b6041228ca7dec82adb97d2,6,5,5,72.62,23,35,90.23
454,5e60c6aa3b834e44b822ea43a3efea26,6,5,5,85.54,25,32,103.66
4828,b1f4ece7d49342628a9ed77aee2cde58,4,4,4,88.18,23,32,133.02


In [10]:
# Preview the same table re-ranked by the number of offers received
# (descending); this does not modify `transaction_ratio`.
(
    transaction_ratio
    .sort_values('num_offers_received', ascending=False)
    .head(5)
)

Unnamed: 0,customer_id,num_offers_received,num_offers_viewed,num_offers_completed,total_offer_transaction_amount,total_offer_transactions,total_transactions,total_transaction_amount
667,2db574f9885b44fbbb99396ffd60511a,6,6,5,151.21,7,12,243.98
149,9196c2bfb739494f902a13bab46199d2,6,5,6,96.64,6,17,286.83
906,e27d828d67894aa58a1e250ff5feb166,6,5,2,36.99,2,4,66.79
161,991321fedc8e4c46abd2a17137af756d,6,4,2,36.83,7,17,210.54
22,a73cf044395d46ea804f688490ad9227,6,4,5,144.61,5,7,1128.22


In [11]:
# Data Wrangling Checks
# Each check counts the rows of `transaction_ratio` that VIOLATE an expected
# invariant. The violating rows are kept in their own frames and
# `transaction_ratio` itself is never reassigned. Previously it was overwritten
# with the rows failing check 1, so check 2 only ever looked at those rows
# (none, in practice) and could not detect anything, and the table was left
# empty for any later cell.

# 1) Expected: num_offers_completed <= total_offer_transactions on every row.
#    `~(a <= b)` is used rather than `a > b` so that rows with missing values
#    are still reported as violations, as the original `== False` test did.
check1_violations = transaction_ratio[
    ~(transaction_ratio['num_offers_completed'] <= transaction_ratio['total_offer_transactions'])
]
print("Number of rows where number_offers_completed is greater than the total_offer_transactions:", len(check1_violations))

# 2) Expected: total_offer_transactions <= total_transactions on every row.
#    Evaluated on the full table, independently of check 1.
# NOTE(review): a non-zero count here may mean a transaction was matched to
# more than one offer window and summed once per offer — verify upstream.
check3_violations = transaction_ratio[
    ~(transaction_ratio['total_offer_transactions'] <= transaction_ratio['total_transactions'])
]
print("Number of rows where total_offer_transactions is greater than the total_transactions:", len(check3_violations))

Number of rows where number_offers_completed is greater than the total_offer_transactions: 0
Number of rows where total_offer_transactions is greater than the total_transactions: 0


In [12]:
# Spin off a version of the offers table enriched with the per-(customer,
# offer) transaction aggregates. This is an inner join, so offer rows with no
# matching aggregate are dropped.
# NOTE(review): if the offers table holds several rows for the same
# (customer_id, offer_id), each of them receives the same aggregate values —
# verify whether that is intended.
offers_with_transactions = offers.merge(
    aggregated_transactions,
    on=['customer_id', 'offer_id'],
)
offers_with_transactions.head()

Unnamed: 0,customer_id,offer_id,offer_type,offer_received,offer_viewed,offer_completed,time_to_completion,offer_effect,difficulty,duration_hrs,expiration,reward,total_transactions,total_transaction_amount,transaction_list,amount_list
0,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,bogo,408.0,456.0,414.0,6.0,0.0,5,120,528.0,5.0,1,8.57,[414],[8.57]
1,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,discount,504.0,540.0,528.0,24.0,0.0,10,240,744.0,2.0,1,14.11,[528],[14.11]
2,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,discount,576.0,,576.0,0.0,-1.0,10,168,744.0,2.0,1,10.27,[576],[10.27]
3,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,168.0,186.0,252.0,66.0,2.0,7,168,336.0,3.0,1,11.93,[252],[11.93]
4,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,408.0,432.0,576.0,144.0,2.0,20,240,648.0,5.0,1,22.05,[576],[22.05]
