In [13]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sqlite3
from scipy.stats import zscore #To explore the dataset and spot ouliers
from scipy.stats import chi2_contingency


In [14]:
# Load the whole customer-behaviour table from the SQLite database into a DataFrame.
conn = sqlite3.connect('../Customer_Behavior.db')

# Query selecting every row and column of the table
query = "SELECT * FROM E_Comm_Customer_Behavior"

try:
    # pandas executes the query on the open connection and builds the DataFrame
    df = pd.read_sql_query(query, conn)
finally:
    # Release the connection even when the query raises, so a failed run
    # does not leave an open handle behind in the kernel
    conn.close()

In [15]:
# Show the loaded DataFrame; the notebook display elides the middle rows ("...")
df

Unnamed: 0,Customer ID,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level
0,101,Female,29,New York,Gold,1120.20,14,4.6,TRUE,25,Satisfied
1,102,Male,34,Los Angeles,Silver,780.50,11,4.1,FALSE,18,Neutral
2,103,Female,43,Chicago,Bronze,510.75,9,3.4,TRUE,42,Unsatisfied
3,104,Male,30,San Francisco,Gold,1480.30,19,4.7,FALSE,12,Satisfied
4,105,Male,27,Miami,Silver,720.40,13,4.0,TRUE,55,Unsatisfied
...,...,...,...,...,...,...,...,...,...,...,...
345,446,Male,32,Miami,Silver,660.30,10,3.8,TRUE,42,Unsatisfied
346,447,Female,36,Houston,Bronze,470.50,8,3.0,FALSE,27,Neutral
347,448,Female,30,New York,Gold,1190.80,16,4.5,TRUE,28,Satisfied
348,449,Male,34,Los Angeles,Silver,780.20,11,4.2,FALSE,21,Neutral


In [19]:
# Test, separately for each gender, whether "Discount Applied" and
# "Satisfaction Level" are associated (chi-square test of independence).

# Define order of satisfaction levels (used as the column order of every table)
satisfaction_order = ['Unsatisfied', 'Neutral', 'Satisfied']


def discount_vs_satisfaction(customers):
    """Cross-tabulate discount usage against satisfaction and run a chi-square test.

    Parameters
    ----------
    customers : pandas.DataFrame
        Rows to analyse; must contain the 'Discount Applied' and
        'Satisfaction Level' columns.

    Returns
    -------
    tuple
        ``(table, chi2, p, dof, expected)``. ``table`` is the count table with
        one row per 'Discount Applied' value and one column per entry of
        ``satisfaction_order`` (levels with no observations are filled with 0).
        The remaining four items are the values returned by
        ``chi2_contingency`` for the non-empty part of that table.

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` when there is no data to test.
    """
    # fill_value=0 keeps the displayed table identical to the counts being
    # tested (integer zeros instead of NaN for a level that never occurs)
    table = pd.crosstab(
        customers['Discount Applied'],
        customers['Satisfaction Level'],
    ).reindex(columns=satisfaction_order, fill_value=0)

    # chi2_contingency raises ValueError when an expected frequency is zero,
    # which happens whenever a whole row or column has no observations;
    # test only the rows/columns that actually contain data
    observed = table.loc[table.sum(axis=1) > 0, table.sum(axis=0) > 0]
    chi2, p, dof, expected = chi2_contingency(observed)
    return table, chi2, p, dof, expected


# Step 1: Create separate dataframes for each gender, ignoring rows whose
# satisfaction level is NULL
has_satisfaction = df['Satisfaction Level'].notnull()
df_male = df[(df['Gender'] == 'Male') & has_satisfaction]
df_female = df[(df['Gender'] == 'Female') & has_satisfaction]

# Step 2: Contingency table and chi-square test for each gender
male_discount, chi2_male, p_male, dof_male, expected_male = discount_vs_satisfaction(df_male)
female_discount, chi2_female, p_female, dof_female, expected_female = discount_vs_satisfaction(df_female)

# Output the results
print("Male Discount vs Satisfaction Level:")
print(male_discount)
print(f"Chi-square Statistic: {chi2_male}, p-value: {p_male}")

print("\nFemale Discount vs Satisfaction Level:")
print(female_discount)
print(f"Chi-square Statistic: {chi2_female}, p-value: {p_female}")


Male Discount vs Satisfaction Level:
Satisfaction Level  Unsatisfied  Neutral  Satisfied
Discount Applied                                   
FALSE                         0       51         66
TRUE                         57        0          1
Chi-square Statistic: 170.55438985444135, p-value: 9.216898627765793e-38

Female Discount vs Satisfaction Level:
Satisfaction Level  Unsatisfied  Neutral  Satisfied
Discount Applied                                   
FALSE                         0       56          0
TRUE                         59        0         58
Chi-square Statistic: 172.99999999999997, p-value: 2.713484313986819e-38


**Findings:**
For both genders, the p-values are far smaller than 0.05, which indicates that the relationship between "Discount Applied" and "Satisfaction Level" is statistically significant for both males and females. However, the pattern of that relationship differs between the genders, which suggests that discount strategies might need to be gender-specific to improve customer satisfaction effectively. For males, discounts are strongly associated with dissatisfaction (57 of the 58 discounted male customers are Unsatisfied), while for females the response to discounts is mixed (discounted female customers split almost evenly between Unsatisfied and Satisfied) and might depend on other factors not captured in this dataset. Note that the chi-square test establishes an association only; it does not show that discounts cause the observed satisfaction levels.