In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import data -> total_transactions = 0 if no transactions are matched with the offer.
# The relative path uses forward slashes so it resolves on Windows, macOS and Linux alike
# (a backslash-separated path is only understood as a directory path on Windows).
matched_offers = (
    pd.read_pickle('data/04_fct/fct_matched_offers.pkl')
    .set_index('customer_id')  # customer_id becomes the row index of the frame
)
matched_offers.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,gender_O,gender_Unknown,offer_id,offer_received,offer_viewed,...,email,mobile,social,web,is_bogo,is_discount,is_informational,expiration,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,2906b810c7d4411798c6938adc9daaa5,168.0,216.0,...,1,1,0,1,0,1,0,336.0,0.0,0.0
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,0b1e1539f2cc45b7b9fa7c272da2e1d7,336.0,348.0,...,1,0,0,1,0,1,0,576.0,0.0,0.0
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,fafdcd668e3743c1bb461111dcafc2a4,408.0,408.0,...,1,1,1,1,0,1,0,648.0,5.0,10.17
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,2298d6c36e964ae4a3e7e9706d1fb8c2,504.0,504.0,...,1,1,1,1,0,1,0,672.0,3.0,7.54
0610b486422d4921ae7d2bf64640c50b,55.0,112000.0,376,1,0,0,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,408.0,,...,1,1,0,1,1,0,0,576.0,1.0,23.22


In [3]:
# Drop unneeded columns
columns_to_drop = [
    'offer_received',    # dropping as this is implied by the record in the data set
    'gender_Unknown',    # dropping to avoid collinearity
    'expiration',        # dropping because we're interested in responses, not in timing
    'offer_id',          # can be derived by the characteristics of the offer
    'email',             # all offers were sent out via email
    'is_informational',  # dropping to avoid collinearity
    'time_to_complete',  # dropping because we're interested in responses, not in timing
]

# Build the cleaned frame as a NEW object instead of mutating `matched_offers` in place.
# With in-place edits this cell could only be run once per kernel: a second run would fail
# on the already-dropped columns, and the null -> flag conversion below is not repeatable
# (every 0/1 value is non-null, so re-applying it would yield all 1s).
offers_clean = matched_offers.drop(columns=columns_to_drop).assign(
    # 1 where a value is present, 0 where it is missing
    offer_viewed=lambda df: df['offer_viewed'].notna().astype(int),
    offer_completed=lambda df: df['offer_completed'].notna().astype(int),
    # Transaction count as integer, transaction amount rounded to 2 decimals
    total_transactions=lambda df: df['total_transactions'].astype(int),
    total_transaction_amount=lambda df: df['total_transaction_amount'].round(2),
    # Missing 'viewed_before_completion' is treated as 0
    viewed_before_completion=lambda df: df['viewed_before_completion'].fillna(0),
)

# Integer-valued columns to downcast to the smallest integer dtype that fits.
# 'total_transaction_amount' is deliberately NOT listed: it is a decimal amount, and
# pd.to_numeric(downcast='integer') would silently turn it into an integer column whenever
# every value happened to be whole, making the exported dtype depend on the data.
int_cols = ['age', 'income', 'days_as_member', 'difficulty', 'reward', 'duration_hrs', 'total_transactions']
# 0/1 indicator columns stored as int8
boolean_cols = ['gender_F', 'gender_M', 'offer_viewed', 'offer_completed', 'viewed_before_completion', 'mobile', 'social', 'web', 'is_bogo', 'is_discount']

# Prepare demographic data: keep only rows without any remaining missing value and
# drop 'gender_O' because it's collinear with gender_F and gender_M
demo_data = (
    offers_clean
    .dropna()
    .drop(columns='gender_O')
    .astype({col: 'int8' for col in boolean_cols})
)
demo_data[int_cols] = demo_data[int_cols].apply(pd.to_numeric, downcast='integer')

# Exporting the cleaned data (forward slashes keep the relative paths valid on every OS)
demo_data.to_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')
demo_data.to_csv('data/04_fct/fct_demographic_offers_and_transactions.csv')

print(demo_data.dtypes)
demo_data.head()

age                            int8
income                        int32
days_as_member                int16
gender_F                       int8
gender_M                       int8
offer_viewed                   int8
offer_completed                int8
viewed_before_completion       int8
difficulty                     int8
reward                         int8
duration_hrs                  int16
mobile                         int8
social                         int8
web                            int8
is_bogo                        int8
is_discount                    int8
total_transactions             int8
total_transaction_amount    float64
dtype: object


Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,is_discount,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,1,0,5,5,168,1,0,1,1,0,1,23.22
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,0,0,0,0,96,1,0,1,0,0,0,0.0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,5,5,168,1,0,1,1,0,1,19.89
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,0,0,0,0,72,1,1,0,0,0,0,0.0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,10,10,168,1,1,0,1,0,1,21.72
