In [10]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [11]:
# Import Data
# Offer-level fact table: the transaction totals are 0 when no transactions matched the offer.
# Forward slashes are used so the path resolves on Windows, macOS and Linux alike
# (the previous backslash-separated path only worked on Windows).
# NOTE: read_pickle can execute arbitrary code - only load pickles written by this project's own pipeline.
matched_offers = pd.read_pickle('data/04_fct/fct_matched_offers.pkl')

# Set the index to customer_id
matched_offers = matched_offers.set_index('customer_id')

matched_offers.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,gender_O,gender_Unknown,offer_id,offer_received,offer_viewed,...,email,mobile,social,web,is_bogo,is_discount,is_informational,expiration,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,2906b810c7d4411798c6938adc9daaa5,168.0,216.0,...,1,1,0,1,0,1,0,336.0,0.0,0.0
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,0b1e1539f2cc45b7b9fa7c272da2e1d7,336.0,348.0,...,1,0,0,1,0,1,0,576.0,0.0,0.0
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,fafdcd668e3743c1bb461111dcafc2a4,408.0,408.0,...,1,1,1,1,0,1,0,648.0,5.0,10.17
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,2298d6c36e964ae4a3e7e9706d1fb8c2,504.0,504.0,...,1,1,1,1,0,1,0,672.0,3.0,7.54
0610b486422d4921ae7d2bf64640c50b,55.0,112000.0,376,1,0,0,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,408.0,,...,1,1,0,1,1,0,0,576.0,1.0,23.22


In [12]:
### Drop unnecessary columns and simplify data set ###
# Columns removed, and why:
#   offer_received    - every record is an offer that has been received
#   gender_Unknown    - avoids collinearity among the gender dummy columns
#   expiration        - offer timing is out of scope here; only responses are analysed
#   offer_id          - the offer can be identified from its encoded parts
#   email             - all offers were sent out via email
#   is_informational  - avoids collinearity among the offer-type dummy columns
#   time_to_complete  - completion timing is not analysed here. If it is ever needed again it
#                       could be approximated from 'expiration', which would then have to be kept.
columns_to_drop = [
    'offer_received', 'gender_Unknown', 'expiration', 'offer_id', 'email', 'is_informational',
    'time_to_complete',
]

# Build the cleaned frame in a single chain and bind the name only at the end, so a failure in
# any step leaves `matched_offers` untouched instead of half-cleaned.
# Re-running this cell still fails loudly on the drop (the columns are already gone) rather than
# silently re-deriving the flags from already-converted values.
matched_offers = (
    matched_offers
    .drop(columns=columns_to_drop)
    .assign(
        # Convert offer_viewed and offer_completed to 0/1 flags (1 = a value was recorded)
        offer_viewed=lambda frame: frame['offer_viewed'].notnull().astype(int),
        offer_completed=lambda frame: frame['offer_completed'].notnull().astype(int),
        # Convert total_transactions to int and round the amount to 2 decimals
        total_transactions=lambda frame: frame['total_transactions'].astype(int),
        total_transaction_amount=lambda frame: frame['total_transaction_amount'].round(2),
        # Fill viewed_before_completion with 0 where it is missing
        viewed_before_completion=lambda frame: frame['viewed_before_completion'].fillna(0),
    )
)

matched_offers.head()

Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,gender_O,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,is_discount,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,0,0.0,10,2,168,1,0,1,0,1,0,0.0
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,0,0.0,20,5,240,0,0,1,0,1,0,0.0
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,1,1.0,10,2,240,1,1,1,0,1,5,10.17
68be06ca386d4c31939f3a4f0e3dd783,,,529,0,0,0,1,1,1.0,7,3,168,1,1,1,0,1,3,7.54
0610b486422d4921ae7d2bf64640c50b,55.0,112000.0,376,1,0,0,0,1,0.0,5,5,168,1,0,1,1,0,1,23.22


In [13]:
### Prep Demo Data ###
# Drop rows with any missing value so that only records with complete information remain
# (presumably the gaps are the demographic fields age / income - verify against the source data),
# then drop gender_O to avoid collinearity with the remaining gender dummy columns.
demo_data = matched_offers.dropna().drop(columns='gender_O')

### Downcast Data Types for Performance ###
# Whole-number columns: shrink to the smallest integer dtype that can hold the values.
# 'total_transaction_amount' is deliberately NOT listed: it is a monetary amount and stays
# float64, so its dtype no longer depends on whether the values happen to be integral.
int_cols = ['age', 'income', 'days_as_member', 'difficulty', 'reward', 'duration_hrs', 'total_transactions']
for col in int_cols:
    demo_data[col] = pd.to_numeric(demo_data[col], downcast='integer')

# Converting the boolean (0/1) columns to int8
boolean_cols = [
                'gender_F','gender_M',
                'offer_viewed', 'offer_completed','viewed_before_completion',
                'mobile', 'social', 'web', 'is_bogo', 'is_discount'
                ]
demo_data = demo_data.astype({col: 'int8' for col in boolean_cols})

# Exporting the cleaned data
# Forward slashes keep the paths portable; with backslashes a POSIX system would write a file
# literally named 'data\04_fct\...' into the working directory.
demo_data.to_pickle('data/04_fct/fct_demographic_offers_and_transactions.pkl')
demo_data.to_csv('data/04_fct/fct_demographic_offers_and_transactions.csv')

print(demo_data.dtypes)
demo_data.head()

age                            int8
income                        int32
days_as_member                int16
gender_F                       int8
gender_M                       int8
offer_viewed                   int8
offer_completed                int8
viewed_before_completion       int8
difficulty                     int8
reward                         int8
duration_hrs                  int16
mobile                         int8
social                         int8
web                            int8
is_bogo                        int8
is_discount                    int8
total_transactions             int8
total_transaction_amount    float64
dtype: object


Unnamed: 0_level_0,age,income,days_as_member,gender_F,gender_M,offer_viewed,offer_completed,viewed_before_completion,difficulty,reward,duration_hrs,mobile,social,web,is_bogo,is_discount,total_transactions,total_transaction_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,1,0,5,5,168,1,0,1,1,0,1,23.22
0610b486422d4921ae7d2bf64640c50b,55,112000,376,1,0,0,0,0,0,0,96,1,0,1,0,0,0,0.0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,5,5,168,1,0,1,1,0,1,19.89
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,0,0,0,0,72,1,1,0,0,0,0,0.0
78afa995795e4d85b5d9ceeca43f5fef,75,100000,443,1,0,1,1,1,10,10,168,1,1,0,1,0,1,21.72


In [19]:
# List the columns available in the cleaned demographic frame
demo_data.columns

Index(['age', 'income', 'days_as_member', 'gender_F', 'gender_M',
       'offer_viewed', 'offer_completed', 'viewed_before_completion',
       'difficulty', 'reward', 'duration_hrs', 'mobile', 'social', 'web',
       'is_bogo', 'is_discount', 'total_transactions',
       'total_transaction_amount'],
      dtype='object')

# Assess Features

In [14]:
# Independent variables, grouped by what they describe
demographic_vars = ['age', 'income', 'days_as_member']       # demographic data
offer_type_vars = ['is_bogo', 'is_discount']                 # offer type
offer_detail_vars = ['difficulty', 'reward', 'duration_hrs'] # offer description
channel_vars = ['mobile', 'social', 'web']                   # offer distribution channel

all_ind_vars = [*demographic_vars, *offer_type_vars, *offer_detail_vars, *channel_vars]

# Name of the target column
offer_completed = 'offer_completed'

# To uncover demographic response trends the data is split by gender,
# keeping only the records where the customer actually viewed the offer.
viewed_offer = demo_data['offer_viewed'].eq(1)
female_df = demo_data[demo_data['gender_F'].eq(1) & viewed_offer]
male_df = demo_data[demo_data['gender_M'].eq(1) & viewed_offer]


# Analyze Feature Correlations

In [15]:
def analyze_feature_correlation(df, independent_vars, target_var):
    """Rank features by their correlation with the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    independent_vars : list of str
        Names of the feature columns.
    target_var : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns 'feature' and 'correlation', ordered from the
        highest (signed) correlation to the lowest, with a fresh 0..n-1 index.
    """
    # Feature columns followed by the target, side by side in one frame
    analysis_frame = pd.concat([df[independent_vars], df[target_var]], axis=1)

    # Take the target's column of the correlation matrix and discard its self-correlation
    target_corr = analysis_frame.corr()[target_var].drop(target_var)

    # Tabulate, then order from most positive to most negative correlation
    ranked = (
        pd.DataFrame({'feature': target_corr.index, 'correlation': target_corr.values})
        .sort_values(by='correlation', ascending=False)
        .reset_index(drop=True)
    )
    return ranked

# Run the correlation analysis once per gender cohort
female_corr, male_corr = [
    analyze_feature_correlation(cohort, independent_vars=all_ind_vars, target_var=offer_completed)
    for cohort in (female_df, male_df)
]

# Analyze Feature Importance

In [16]:
def analyze_feature_importance(df, independent_vars, target_var, test_size=0.2, random_state=42):
    """Rank features by their importance in a decision tree fitted to predict the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    independent_vars : list of str
        Names of the feature columns.
    target_var : str
        Name of the target column.
    test_size : float or int, default 0.2
        Share (or count) of rows held out by ``train_test_split``; the tree is fitted on the rest.
    random_state : int, default 42
        Seed used for both the train/test split and the decision tree.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns 'feature' and 'importance', ordered from the most
        to the least important, with a fresh 0..n-1 index.
    """
    # Define independent vars and target var
    X = df[independent_vars]
    y = df[target_var]

    # Only the training partition is used; the hold-out rows are discarded because the model
    # is never scored here. The split is kept so the defaults reproduce the original results.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Fit the Decision Tree Classifier on the training partition
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, y_train)

    # Pair each feature with its importance, then sort from most to least important
    features = pd.DataFrame({'feature': X.columns, 'importance': clf.feature_importances_})
    features_sorted = features.sort_values(by='importance', ascending=False).reset_index(drop=True)

    return features_sorted


# Run the feature-importance analysis once per gender cohort
female_import, male_import = [
    analyze_feature_importance(cohort, independent_vars=all_ind_vars, target_var=offer_completed)
    for cohort in (female_df, male_df)
]

In [17]:
# Join the correlation and importance results for the female cohort
women = pd.merge(female_corr, female_import, on='feature', how='inner')

# Combined strength of each feature: |correlation| + importance.
# abs() is applied to the correlation alone: tree importances are non-negative, so taking
# abs() of the sum would let a negative correlation cancel against the importance and
# understate the feature.
women['magnitude'] = women['correlation'].abs() + women['importance']

# Rank on the unrounded magnitude (avoids artificial ties), then round for presentation
women = women.sort_values(by='magnitude', ascending=False).reset_index(drop=True)
women = women.round(2)

women.to_csv(r'data/04_fct/fct_feature_analysis_women.csv')
women.head()

Unnamed: 0,feature,correlation,importance,magnitude
0,reward,0.29,0.42,0.7
1,duration_hrs,0.47,0.02,0.48
2,difficulty,0.44,0.01,0.45
3,days_as_member,0.17,0.25,0.41
4,is_discount,0.31,0.0,0.31


In [18]:
# Join the correlation and importance results for the male cohort
men_df = pd.merge(male_corr, male_import, on='feature', how='inner')

# Combined strength of each feature: |correlation| + importance.
# abs() is applied to the correlation alone: tree importances are non-negative, so taking
# abs() of the sum would let a negative correlation cancel against the importance and
# understate the feature.
men_df['magnitude'] = men_df['correlation'].abs() + men_df['importance']

# Rank on the unrounded magnitude (avoids artificial ties), then round for presentation
men_df = men_df.sort_values(by='magnitude', ascending=False).reset_index(drop=True)
men_df = men_df.round(2)

men_df.to_csv(r'data/04_fct/fct_feature_analysis_men.csv')
men_df.head()

Unnamed: 0,feature,correlation,importance,magnitude
0,days_as_member,0.17,0.32,0.5
1,duration_hrs,0.42,0.02,0.44
2,reward,0.12,0.29,0.4
3,difficulty,0.35,0.01,0.36
4,income,0.19,0.16,0.35
