## Customer and Sales Analysis

In [1]:
#import libraries
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, f_oneway, f

### Data Preparation:
Load data from customer.csv and sales.csv into pandas data frames.<br>
Convert the 'date' column in both datasets to pandas datetime objects for analysis.

In [2]:
# Read the customer feedback records from customer.csv into a DataFrame.
df_feedback = pd.read_csv("customer.csv")

In [3]:
# Sanity-check the load: (rows, columns) of the feedback frame.
df_feedback.shape

(500, 3)

In [4]:
# Preview the first rows of the feedback frame to see its columns and value formats.
df_feedback.head()

Unnamed: 0,date,product,feedback_score
0,2023-02-22,iOS,5
1,2023-05-22,Android,2
2,2022-11-22,iOS,2
3,2022-11-26,Android,10
4,2023-04-26,iOS,1


In [5]:
# Read the sales records from sales.csv into a DataFrame.
df_sales = pd.read_csv("sales.csv")

In [6]:
# Sanity-check the load: (rows, columns) of the sales frame.
df_sales.shape

(500, 3)

In [7]:
# Preview the first rows of the sales frame to see its columns and value formats.
df_sales.head()

Unnamed: 0,date,product,sales
0,2022-12-12,iOS,473
1,2022-12-12,Android,919
2,2023-06-24,iOS,805
3,2023-06-24,Android,996
4,2023-10-20,iOS,792


In [8]:
# Parse the feedback 'date' column into pandas datetimes; the analyses below
# rely on the .dt accessor (month extraction, monthly periods).
df_feedback["date"] = pd.to_datetime(df_feedback["date"])

In [9]:
# Parse the sales 'date' column into pandas datetimes; the analyses below
# compare it against campaign dates and use the .dt accessor.
df_sales["date"] = pd.to_datetime(df_sales["date"])

In [10]:
# Confirm that 'date' in the feedback frame is now a datetime column.
df_feedback.dtypes

date              datetime64[ns]
product                   object
feedback_score             int64
dtype: object

In [11]:
# Confirm that 'date' in the sales frame is now a datetime column.
df_sales.dtypes

date       datetime64[ns]
product            object
sales               int64
dtype: object

In [12]:
#utility function to check equal variance when samples are uneven
def check_equal_variance(sample1, sample2, alpha=0.05):
    """Two-sided F-test for equality of two sample variances.

    The result is meant to be passed as ``equal_var`` to ``ttest_ind`` so the
    caller can choose between the pooled-variance and the unequal-variance
    form of the t-test.

    Parameters
    ----------
    sample1, sample2 : array-like
        One-dimensional samples; they may have different lengths.
    alpha : float, default 0.05
        Significance level of the F-test.

    Returns
    -------
    bool
        ``False`` when the F-test rejects equal variances at level ``alpha``
        (p-value < alpha), ``True`` otherwise. A NaN p-value (for example
        when both variances are zero) also yields ``True``.
    """
    # Sample variances (ddof=1).
    variance1 = np.var(sample1, ddof=1)
    variance2 = np.var(sample2, ddof=1)

    # Put the larger variance in the numerator and pair each variance with the
    # degrees of freedom of the sample it came from.
    if variance1 > variance2:
        f_stat = variance1 / variance2
        dfn = len(sample1) - 1  # degrees of freedom numerator
        dfd = len(sample2) - 1  # degrees of freedom denominator
    else:
        f_stat = variance2 / variance1
        dfn = len(sample2) - 1
        dfd = len(sample1) - 1

    # Two-sided p-value: twice the smaller tail. f.sf is the survival function
    # (1 - cdf) and avoids the precision loss of subtracting a cdf close to 1.
    p_value = 2 * min(f.cdf(f_stat, dfn, dfd), f.sf(f_stat, dfn, dfd))

    # Written as "not (p < alpha)" so a NaN p-value maps to True.
    return not (p_value < alpha)

### Customer Feedback Analysis

In [13]:
def feedback_analysis(df_feedback):
    """Compare iOS and Android feedback scores with an independent two-sample t-test.

    Parameters
    ----------
    df_feedback : pandas.DataFrame
        Must contain a 'product' column (rows labelled "iOS" / "Android" are
        used) and a numeric 'feedback_score' column.

    Returns
    -------
    tuple
        ``(statistic, p_val)`` as returned by ``scipy.stats.ttest_ind``.
    """
    # Extract the satisfaction scores for each product.
    ios_scores = df_feedback.loc[df_feedback["product"] == "iOS", "feedback_score"].to_numpy()
    android_scores = df_feedback.loc[df_feedback["product"] == "Android", "feedback_score"].to_numpy()

    # The two groups are kept as separate 1-D arrays. Stacking them into a
    # single 2-D array only works when both products have the same number of
    # rows and raises on recent NumPy versions otherwise.
    ios_std = ios_scores.std()
    android_std = android_scores.std()

    # Rule of thumb: assume equal variance unless
    # (larger standard deviation / smaller standard deviation) > 2.
    if ios_std > android_std:
        std_ratio = ios_std / android_std
    else:
        std_ratio = android_std / ios_std
    equal_var = not (std_ratio > 2)

    # equal_var=False makes ttest_ind run Welch's unequal-variance t-test.
    result = ttest_ind(ios_scores, android_scores, equal_var=equal_var)

    return result.statistic, result.pvalue

In [14]:
# Run the iOS-vs-Android test and keep only the t-statistic
# (element 0 of the returned (statistic, p-value) tuple).
statistic = feedback_analysis(df_feedback)[0]
print(f"Statistic: {statistic}")

Statistic: 1.9033888211703986


In [15]:
# Run the iOS-vs-Android test again and keep only the p-value (element 1).
# NOTE: `p_value` is reassigned by every analysis section in this notebook,
# so the decision cell that follows must run right after this one.
p_value = feedback_analysis(df_feedback)[1]
print(f"p_value: {p_value}")

p_value: 0.05756609365982318


In [16]:
# Report the decision at the 5% significance level.
print(
    "The difference in average satisfaction between iOS and Android is statistically significant."
    if p_value < 0.05
    else "The difference in average satisfaction between iOS and Android is not statistically significant."
)

The difference in average satisfaction between iOS and Android is not statistically significant.


#### Interpreting The Results

To test whether average customer satisfaction differs between the iOS and Android groups, I determined that an independent two-tailed t-test was appropriate. I chose an independent test because iOS and Android users are separate groups, and a two-tailed test because we are checking for any difference without a specified direction. After performing the test, I determined that the difference in average satisfaction between iOS and Android is not statistically significant, since the p-value (0.057) is greater than 0.05.

### Sales Performance Analysis

In [17]:
def sales_analysis(df_sales, campaign_start="2023-03-01", campaign_end="2023-03-31"):
    """Compare sales before and after a marketing campaign with an independent t-test.

    Rows dated strictly before ``campaign_start`` form the "before" group and
    rows dated strictly after ``campaign_end`` form the "after" group; rows
    inside the campaign window are left out of both groups.

    Parameters
    ----------
    df_sales : pandas.DataFrame
        Must contain a datetime 'date' column and a numeric 'sales' column.
    campaign_start, campaign_end : str or datetime-like
        First and last day of the campaign, in any form accepted by
        ``pd.to_datetime``. The defaults are the March 2023 campaign.

    Returns
    -------
    tuple
        ``(statistic, p_val)`` as returned by ``scipy.stats.ttest_ind``.
    """
    # Convert the campaign dates to pandas datetime objects.
    campaign_start = pd.to_datetime(campaign_start)
    campaign_end = pd.to_datetime(campaign_end)

    # Split the sales into the periods before and after the campaign.
    # NOTE(review): the comparison is against midnight of campaign_end, so a
    # timestamp later on that same day would count as "after" -- confirm that
    # 'date' carries no time-of-day component.
    sales_before = df_sales.loc[df_sales['date'] < campaign_start, 'sales']
    sales_after = df_sales.loc[df_sales['date'] > campaign_end, 'sales']

    # Let the F-test decide between the pooled and unequal-variance t-test.
    equal_variance = check_equal_variance(sales_before, sales_after)

    result = ttest_ind(sales_before, sales_after, equal_var=equal_variance)

    return result.statistic, result.pvalue

In [18]:
# Run the before/after campaign test and keep only the t-statistic
# (element 0 of the returned (statistic, p-value) tuple).
statistic = sales_analysis(df_sales)[0]
print(f"Statistic: {statistic}")

Statistic: 0.17298529520949715


In [19]:
# Run the before/after campaign test again and keep only the p-value (element 1).
# NOTE: `p_value` is reassigned by every analysis section in this notebook,
# so the decision cell that follows must run right after this one.
p_value = sales_analysis(df_sales)[1]
print(f"p_value: {p_value}")

p_value: 0.862739406552252


In [20]:
# Report the decision at the 5% significance level.
print(
    "The difference in average sales before and after the marketing campaign is statistically significant."
    if p_value < 0.05
    else "The difference in average sales before and after the marketing campaign is not statistically significant."
)

The difference in average sales before and after the marketing campaign is not statistically significant.


#### Interpreting The Results

Based on the data provided and the hypothesis to test a marketing campaign's impact on sales, I determined that an independent two-tailed t-test was appropriate for this situation. The test compares sales dated before the campaign started (before 2023-03-01) with sales dated after it ended (after 2023-03-31); sales during the campaign itself are excluded from both groups. The two groups come from different periods and are not paired. In addition, I chose a two-tailed test because we are checking for any difference without a specified direction. After performing the test, I determined that the difference in average sales before and after the marketing campaign is not statistically significant, since the p-value (0.86) is greater than 0.05.

### Seasonal Sales Analysis

In [21]:
def seasonal_analysis(df_sales):
    """Compare summer and winter sales with an independent two-sample t-test.

    Summer is June-August and winter is December-February, taken from the
    month of the 'date' column. Whether the t-test pools the variances is
    decided by ``check_equal_variance``.

    Returns
    -------
    tuple
        ``(statistic, p_val)`` as returned by ``scipy.stats.ttest_ind``.
    """
    month_of_sale = df_sales['date'].dt.month

    # Sales figures for each season.
    summer = df_sales.loc[month_of_sale.isin([6, 7, 8]), 'sales']
    winter = df_sales.loc[month_of_sale.isin([12, 1, 2]), 'sales']

    # Pooled t-test only if the F-test does not reject equal variances.
    pooled = check_equal_variance(summer, winter)
    outcome = ttest_ind(summer, winter, equal_var=pooled)

    return outcome.statistic, outcome.pvalue

In [22]:
# Run the summer-vs-winter test and keep only the t-statistic
# (element 0 of the returned (statistic, p-value) tuple).
statistic = seasonal_analysis(df_sales)[0]
print(f"Statistic: {statistic}")

Statistic: 0.09927308556714513


In [23]:
# Run the summer-vs-winter test again and keep only the p-value (element 1).
# NOTE: `p_value` is reassigned by every analysis section in this notebook,
# so the decision cell that follows must run right after this one.
p_value = seasonal_analysis(df_sales)[1]
print(f"p_value: {p_value}")

p_value: 0.9209991394112975


In [24]:
# Report the decision at the 5% significance level.
print(
    "The difference in average sales between summer and winter is statistically significant."
    if p_value < 0.05
    else "The difference in average sales between summer and winter is not statistically significant."
)

The difference in average sales between summer and winter is not statistically significant.


#### Interpreting The Results

Based on the data provided and the hypothesis to test differences in sales between summer and winter, I determined that an Independent Two-Tail T-Test was appropriate for this situation. In this case, sales data between summer and winter are from different periods and are not paired. In addition, my decision for Two Tail was because we checked for any difference without a specified direction. After performing the test, I determined that the difference in average sales between summer and winter is not statistically significant since the p_value 0.92 is greater than 0.05.

### Feedback Consistency Analysis

In [25]:
def consistency_analysis(df_feedback, months=(1, 5, 9, 12)):
    """Test whether feedback scores differ across calendar months with one-way ANOVA.

    Parameters
    ----------
    df_feedback : pandas.DataFrame
        Must contain a datetime 'date' column and a numeric 'feedback_score'
        column.
    months : sequence of int, default (1, 5, 9, 12)
        Calendar month numbers to compare; the default is January, May,
        September and December. Rows are grouped by month number only, so the
        same month from different years falls into one group.

    Returns
    -------
    tuple
        ``(statistic, p_val)``: the F-statistic and p-value from
        ``scipy.stats.f_oneway``.
    """
    month_of_feedback = df_feedback['date'].dt.month

    # One group of feedback scores per requested month, in the order given.
    score_groups = [
        df_feedback.loc[month_of_feedback == month, 'feedback_score']
        for month in months
    ]

    # Perform one-way ANOVA across the monthly groups.
    anova_result = f_oneway(*score_groups)

    return anova_result.statistic, anova_result.pvalue

In [26]:
# Run the monthly ANOVA and keep only the F-statistic
# (element 0 of the returned (statistic, p-value) tuple).
statistic = consistency_analysis(df_feedback)[0]
print(f"Statistic: {statistic}")

Statistic: 0.3146823675455494


In [27]:
# Run the monthly ANOVA again and keep only the p-value (element 1).
# NOTE: `p_value` is reassigned by every analysis section in this notebook,
# so the decision cell that follows must run right after this one.
p_value = consistency_analysis(df_feedback)[1]
print(f"p_value: {p_value}")

p_value: 0.8147473590881886


In [28]:
# Report the decision at the 5% significance level.
print(
    "The differences in monthly feedback scores are statistically significant."
    if p_value < 0.05
    else "The differences in monthly feedback scores are not statistically significant."
)

The differences in monthly feedback scores are not statistically significant.


#### Interpreting The Results

Based on the data provided and the hypothesis to assess if monthly feedback scores are consistent across January, May, September, and December, as instructed, I used one-way ANOVA to test significant differences in feedback scores across these months. After performing the test, I determined that the difference in monthly feedback scores is not statistically significant since the p_value 0.81 is greater than 0.05.

### Sales and Feedback Correlation Analysis

In [29]:
def corr_analysis(df_feedback, df_sales):
    """Compare sales in high-feedback months against sales in low-feedback months.

    Feedback scores are averaged per calendar month (year-month period). A
    month is labelled 'High' when its mean score is at or above the median of
    the monthly means, and 'Low' otherwise. Each sales row is then matched to
    its month's label and the two groups of sales are compared with an
    independent two-sample t-test.

    The input frames are not modified: the month key is added to copies.

    Parameters
    ----------
    df_feedback : pandas.DataFrame
        Must contain a datetime 'date' column and a numeric 'feedback_score'
        column.
    df_sales : pandas.DataFrame
        Must contain a datetime 'date' column and a numeric 'sales' column.

    Returns
    -------
    tuple
        ``(statistic, p_val)`` as returned by ``scipy.stats.ttest_ind``.
    """
    # Derive the year-month key on copies so the caller's frames keep their
    # original columns.
    feedback = df_feedback.assign(month=df_feedback["date"].dt.to_period('M'))
    sales = df_sales.assign(month=df_sales['date'].dt.to_period('M'))

    # Aggregate feedback scores by month.
    monthly_feedback = feedback.groupby('month')['feedback_score'].mean().reset_index()

    # Label each month as high or low feedback relative to the median monthly mean.
    median_feedback = monthly_feedback['feedback_score'].median()
    monthly_feedback['feedback_category'] = np.where(
        monthly_feedback['feedback_score'] >= median_feedback, 'High', 'Low'
    )

    # Inner merge: sales rows from months with no feedback are dropped.
    merged_data = pd.merge(sales, monthly_feedback, on='month')

    # Split the sales by the feedback category of their month.
    sales_high_feedback = merged_data.loc[merged_data['feedback_category'] == 'High', 'sales']
    sales_low_feedback = merged_data.loc[merged_data['feedback_category'] == 'Low', 'sales']

    # Let the F-test decide between the pooled and unequal-variance t-test.
    equal_variance = check_equal_variance(sales_high_feedback, sales_low_feedback)

    result = ttest_ind(sales_high_feedback, sales_low_feedback, equal_var=equal_variance)

    return result.statistic, result.pvalue

In [30]:
# Run the high-vs-low feedback month comparison and keep only the t-statistic
# (element 0 of the returned (statistic, p-value) tuple).
statistic = corr_analysis(df_feedback, df_sales)[0]
print(f"Statistic: {statistic}")

Statistic: -1.9713630084936649


In [31]:
# Run the high-vs-low feedback month comparison again and keep only the p-value (element 1).
# NOTE: `p_value` is reassigned by every analysis section in this notebook,
# so the decision cell that follows must run right after this one.
p_value = corr_analysis(df_feedback, df_sales)[1]
print(f"p_value: {p_value}")

p_value: 0.04923553477240242


In [32]:
# Report the decision at the 5% significance level.
print(
    "The difference in sales between high and low feedback months is statistically significant."
    if p_value < 0.05
    else "The difference in sales between high and low feedback months is not statistically significant."
)

The difference in sales between high and low feedback months is statistically significant.


#### Interpreting The Results

Based on the data provided and the hypothesis to compare sales in months with high vs. low feedback scores, I determined that an Independent Two-Tail T-test was appropriate for this situation. In this case, sales in high feedback months vs sales in low feedback months are independent. In addition, my decision for Two Tail was because we checked for any difference without a specified direction. After performing the test, I determined that the difference in sales between high and low feedback months is statistically significant since the p_value 0.049 is less than 0.05.