In [1]:
import pandas as pd
import numpy as np

# Load datasets
transactions = pd.read_pickle('data/03_int/int_transactions.pkl')
offers = pd.read_pickle('data/03_int/int_offers.pkl')
profile_feat = pd.read_pickle('data/03_int/int_profile_feat.pkl')

# Merge Transaction and Offer Data

In [2]:
# Merge transactions and offers, and filter transactions within the offer period
merged = pd.merge(transactions, offers, on='customer_id', how='inner').sort_values(['transaction', 'offer_received'])
filtered_transactions = merged[(merged['transaction'] >= merged['offer_received']) & 
                               (merged['transaction'] <= merged['offer_completed'])]

# Aggregate transactions by customer and offer
agg_matched_transactions = filtered_transactions.groupby(['customer_id', 'offer_id']).agg(
    total_transactions=('transaction_amount', 'count'),
    total_transaction_amount=('transaction_amount', 'sum')
).reset_index()

# Output number of customers and max transactions
print(f'There were {agg_matched_transactions["customer_id"].nunique()} customers who made transactions during the offer period.')
print(f'The maximum number of transactions for a single customer was {agg_matched_transactions["total_transactions"].max()}.')

# Merge offers with aggregated transactions and profile features
df_matched = pd.merge(offers, agg_matched_transactions, on=['customer_id', 'offer_id'], how='left')
df_matched = pd.merge(profile_feat, df_matched, on='customer_id', how='inner')

# Save merged data
df_matched.to_pickle('data/04_fct/fct_matched_offers.pkl')
df_matched.to_csv('data/04_fct/fct_matched_offers.csv', index=False)
print(f'There are {df_matched["customer_id"].nunique()} unique customers in the dataset.')
df_matched.head()

There were 12774 customers who made transactions during the offer period.
The maximum number of transactions for a single customer was 22.
There are 16994 unique customers in the dataset.


Unnamed: 0,customer_id,age,income,days_as_member,gender_F,gender_M,gender_O,gender_Unknown,offer_id,offer_received,...,email,mobile,social,web,is_bogo,is_discount,is_informational,expiration,total_transactions,total_transaction_amount
0,68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,2906b810c7d4411798c6938adc9daaa5,168.0,...,1,1,0,1,0,1,0,336.0,,
1,68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,0b1e1539f2cc45b7b9fa7c272da2e1d7,336.0,...,1,0,0,1,0,1,0,576.0,,
2,68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,fafdcd668e3743c1bb461111dcafc2a4,408.0,...,1,1,1,1,0,1,0,648.0,5.0,10.17
3,68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,2298d6c36e964ae4a3e7e9706d1fb8c2,504.0,...,1,1,1,1,0,1,0,672.0,3.0,7.54
4,0610b486422d4921ae7d2bf64640c50b,55.0,112000.0,376,1,0,0,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,408.0,...,1,1,0,1,1,0,0,576.0,1.0,23.22


# Prepare Data for Machine Learning

In [3]:
# Load matched offers data and drop unnecessary columns
matched_offers = pd.read_pickle('data/04_fct/fct_matched_offers.pkl').set_index('customer_id')

columns_to_drop = [
    'offer_received', # dropping as this is implied by the record in the data set
    'gender_Unknown', # dropping to avoid colinearity
    'expiration', # dropping because we're interested in responses, not in timing
    'offer_id', # can be derived by the characteristics of the offer
    'email', # all overs were sent out via emaile
    'is_informational', # dropping to avoid colinearity
    'time_to_complete' # dropping because we're interested in responses, not in timing
    ]
matched_offers.drop(columns=columns_to_drop, axis=1, inplace=True)

# Convert columns to appropriate types
matched_offers[['offer_viewed', 'offer_completed']] = matched_offers[['offer_viewed', 'offer_completed']].notnull().astype(int)
matched_offers['total_transactions'] = matched_offers['total_transactions'].fillna(0).astype(int)
matched_offers['total_transaction_amount'] = matched_offers['total_transaction_amount'].round(2)
matched_offers['viewed_before_completion'] = matched_offers['viewed_before_completion'].fillna(0)

# Prepare demographic data
demo_data = matched_offers.dropna().drop('gender_O', axis=1)
int_cols = ['age', 'income', 'days_as_member', 'difficulty', 'reward', 'duration_hrs', 'total_transactions', 'total_transaction_amount']
boolean_cols = ['gender_F', 'gender_M', 'offer_viewed', 'offer_completed', 'viewed_before_completion', 'mobile', 'social', 'web', 'is_bogo', 'is_discount']
demo_data[int_cols] = demo_data[int_cols].apply(pd.to_numeric, downcast='integer')
demo_data[boolean_cols] = demo_data[boolean_cols].astype('int8')

# Export cleaned data
demo_data.to_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')
demo_data.to_csv('data/04_fct/fct_demographic_offers_and_transactions.csv')
demo_data.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,is_discount,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,1,0,5,5,168,1,0,1,1,0,1,23.22
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,5,5,168,1,0,1,1,0,1,19.89
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,10,10,168,1,1,0,1,0,1,21.72
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,0,5,5,120,1,1,1,1,0,1,21.72
e2127556f4f64592b11af22de27a7932,68,70000,91,0,1,1,1,1,5,5,168,1,0,1,1,0,1,18.42
