In [8]:
#importing libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from factor_analyzer import FactorAnalyzer
import scipy.stats as stats

mydata=pd.read_csv("cleaned_brand_loyalty_dataset.csv")
mydata.describe()
mydata
generation_counts=mydata["Generation"].value_counts().reset_index()
generation_counts.columns=["Generation","Count"]
generation_counts
# Create bar plot
fig = px.bar(generation_counts, x="Generation", y="Count", title="Number of Gen Z and Millennials",
             labels={"Count": "Number of Respondents", "Generation": "Generation"},
             color="Generation", text="Count")

fig.update_traces(textposition="outside")

# Show figure

fig.write_image("generation_count_plot.png")

#Plotting Age Disstribution by Generation
summary_stats = mydata.groupby("Generation")["Age"].agg(["min", "max", "mean", "median"]).reset_index()
fig1= px.box(mydata,x="Generation",y="Age",title="Age Distribution by Generation", 
             points="all",labels={"Age":"Age","Generation":"Generation"},
             color="Generation")
fig1.update_traces(marker=dict(size=5,line=dict(width=1,color="DarkSlateGrey")),
                   selector=dict(type="box",mode="markers+text"))
# Add annotations for min, max, mean, and median
for _, row in summary_stats.iterrows():
    
    fig1.add_annotation(x=row["Generation"], y=row["mean"], text=f"Mean: {row['mean']:.2f}", showarrow=True, arrowhead=2, font=dict(color="blue"))
    fig1.add_annotation(x=row["Generation"], y=row["min"], text=f"Min: {row['min']:.2f}", showarrow=True, arrowhead=2, font=dict(color="green"))
    fig1.add_annotation(x=row["Generation"], y=row["max"], text=f"Max: {row['max']:.2f}", showarrow=True, arrowhead=2, font=dict(color="purple"))

# Show figure

fig1.write_image("Age_distribution_by_generation.png")

# Count occurrences of Education levels within each generation
education_counts = mydata.groupby(["Generation", "Education"]).size().reset_index(name="Count")
# Calculate percentage within each generation
education_counts["Percentage"] = education_counts.groupby("Generation")["Count"].transform(lambda x: 100 * x / x.sum())
# Create stacked bar plot
fig = px.bar(education_counts, x="Generation", y="Percentage", color="Education",
             title="Education Distribution by Generation (Percentage)",
             labels={"Percentage": "Percentage (%)", "Generation": "Generation", "Education": "Education Level"},
             barmode="stack", text=education_counts["Percentage"].apply(lambda x: f"{x:.1f}%"))

# Show figure

fig.write_image("education_distribution_by_generation.png")

#Count occurrences of occupation levels within each generation
occupation_counts = mydata.groupby(["Generation", "Occupation"]).size().reset_index(name="Count")
# Calculate percentage within each generation
occupation_counts["Percentage"] = occupation_counts.groupby("Generation")["Count"].transform(lambda x: 100 * x / x.sum())
# Create stacked bar plot
fig = px.bar(occupation_counts, x="Generation", y="Percentage", color="Occupation",
             title="Occupation Distribution by Generation (Percentage)",
             labels={"Percentage": "Percentage (%)", "Generation": "Generation", "Occupation": "Occupation type"},
             barmode="stack", text=occupation_counts["Percentage"].apply(lambda x: f"{x:.1f}%"))

# Show figure

fig.write_image("occupation_distribution_by_generation.png")

#Box plots for Time Spent on Social Media by Generation and switching to other tech items

# Compute min, max, mean for Switch Likelihood & Social Media Hours
summary_stats = mydata.groupby("Generation")[["Switch_Likelihood", "Social_Media_Hours"]].agg(["min", "max", "mean"]).reset_index()


# Create subplot layout (1 row, 2 columns)
fig = make_subplots(rows=1, cols=2, subplot_titles=("Switch Likelihood by Generation", "Social Media Hours by Generation"))

# Add box plot for Switch Likelihood
fig.add_trace(go.Box(x=mydata["Generation"], y=mydata["Switch_Likelihood"], name="Switch Likelihood", marker_color="blue"), row=1, col=1)

# Add box plot for Social Media Hours
fig.add_trace(go.Box(x=mydata["Generation"], y=mydata["Social_Media_Hours"], name="Social Media Hours", marker_color="green"), row=1, col=2)

# Update layout
# for _, row in summary_stats.iterrows():
#     fig.add_annotation(x=row["Generation"], y=row["Switch_Likelihood"]["mean"], text=f"Mean: {row['Switch_Likelihood']['mean']:.2f}", showarrow=True, arrowhead=2, font=dict(color="blue"), row=1, col=1)
#     fig.add_annotation(x=row["Generation"], y=row["Switch_Likelihood"]["min"], text=f"Min: {row['Switch_Likelihood']['min']:.2f}", showarrow=True, arrowhead=2, font=dict(color="green"), row=1, col=1)
#     fig.add_annotation(x=row["Generation"], y=row["Switch_Likelihood"]["max"], text=f"Max: {row['Switch_Likelihood']['max']:.2f}", showarrow=True, arrowhead=2, font=dict(color="red"), row=1, col=1)

#     fig.add_annotation(x=row["Generation"], y=row["Social_Media_Hours"]["mean"], text=f"Mean: {row['Social_Media_Hours']['mean']:.2f}", showarrow=True, arrowhead=2, font=dict(color="blue"), row=1, col=2)
#     fig.add_annotation(x=row["Generation"], y=row["Social_Media_Hours"]["min"], text=f"Min: {row['Social_Media_Hours']['min']:.2f}", showarrow=True, arrowhead=2, font=dict(color="green"), row=1, col=2)
#     fig.add_annotation(x=row["Generation"], y=row["Social_Media_Hours"]["max"], text=f"Max: {row['Social_Media_Hours']['max']:.2f}", showarrow=True, arrowhead=2, font=dict(color="red"), row=1, col=2)


fig.update_layout(title_text="Comparison of Switch Likelihood & Social Media Hours by Generation", showlegend=False)

# Show figure

fig.write_image("Switch_Likelihood_Social_Media_Hours_by_Generation.png")
summary_stats.to_excel("slsmh.xlsx", index=True)


###
# Compute mean values for each variable grouped by Generation
mean_values = mydata.groupby("Generation")[["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]].mean().reset_index()
mean_values
# Create subplot layout (2 rows, 3 columns)
fig = make_subplots(rows=2, cols=3, subplot_titles=["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM Influence"])

# Define variables
variables = ["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]

# Add bar plots for each variable grouped by Generation
for i, var in enumerate(variables):
    fig.add_trace(go.Bar(x=mean_values["Generation"], y=mean_values[var], name=var, text=mean_values[var].round(2), textposition="outside"), row=(i//3)+1, col=(i%3)+1)

# Update y-axis labels to indicate ordinal values
for r in range(1, 3):  # Rows
    for c in range(1, 4):  # Columns
        fig.update_yaxes(title_text="Ordinal Mean", row=r, col=c)

# Update layout
fig.update_layout(title_text="Mean Values of Loyalty, Trust, Commitment, Satisfaction, Quality, and SM Influence by Generation", showlegend=False)


#Comparative Analysis
#Univariate cases
#Histogram for switch likelihood
fig = px.histogram(mydata, x="Switch_Likelihood", nbins=8, title="Distribution of Switch Likelihood",
                   labels={"Switch_Likelihood": "Likelihood to Switch Brands"}, color="Generation")

# Histogram for time on social media
fig = px.histogram(mydata, x="Social_Media_Hours", nbins=5, title="Distribution of Social Media Hours",
                   labels={"Social_Media_Hours": "Hours Spent on Social Media"}, color="Generation")



# Histogram for satisfaction
fig = px.histogram(mydata, x="Satisfaction", nbins=5, title="Spread of Satisfaction Scores",
                   labels={"Satisfaction": "Satisfaction Rating"}, color="Generation")

# Scatter plot for Trust vs. Loyalty
fig = px.scatter(mydata, x="Trust", y="Loyalty", color="Generation", title="Trust vs. Loyalty",
                 trendline="ols",
                 labels={"Trust": "Trust Level", "Loyalty": "Loyalty Score"})



# Scatter plot for Commitment vs. Social Media Influence
fig = px.scatter(mydata, x="SM_Influence", y="Commitment", color="Generation", title="Social Media Influence vs. Commitment",
                 trendline="ols",
                 labels={"SM_Influence": "Influence of Social Media", "Commitment": "Brand Commitment"})


# Carrying out correlation analysis
# Compute Spearman correlation matrix
correlation_matrix = mydata[["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]].corr(method="spearman")


# Create heatmap
# plt.figure(figsize=(8, 6))  # Set figure size
# sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

# Add title
#plt.title("Spearman Correlation Heatmap")



# Perform factor analysis
fa = FactorAnalyzer(n_factors=2, rotation="varimax")  # Adjust factor count if needed
fa.fit(mydata[["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]])

# Get eigenvalues
eigenvalues = fa.get_eigenvalues()

# Plot eigenvalues
# plt.plot(range(1, len(eigenvalues[0]) + 1), eigenvalues[0], marker="o", linestyle="--")
# plt.xlabel("Number of Factors")
# plt.ylabel("Eigenvalue")
# plt.title("Scree Plot for Factor Selection")
# plt.axhline(y=1, color="r", linestyle="--", label="Eigenvalue = 1 threshold")
# plt.legend()
# plt.show()



fa = FactorAnalyzer(n_factors=3, rotation="varimax")  # Adjust factor count if needed
fa.fit(mydata[["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]])

# Get factor loadings
loadings = pd.DataFrame(fa.loadings_, index=["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"])



################Correlation for Each Generation
millennials_corr = mydata[mydata["Generation"] == "Millennial"][["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]].corr(method="spearman")
genz_corr = mydata[mydata["Generation"] == "Gen Z"][["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]].corr(method="spearman")

#print("Millennial's Correlations:")
# #Create heatmap
# plt.figure(figsize=(8, 6))  # Set figure size
# sns.heatmap(millennials_corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

# #Add title
# plt.title("Millennials Spearman Correlation Heatmap")
# #plt.show()

#print("Gen Z Correlations:")
#Create heatmap
# plt.figure(figsize=(8, 6))  # Set figure size
# sns.heatmap(genz_corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

# #Add title
# plt.title("Gen Z Spearman Correlation Heatmap")
# plt.show()

##### Correlation Analysis for factors
fa_millennials = FactorAnalyzer(n_factors=2, rotation="varimax")
fa_millennials.fit(mydata[mydata["Generation"] == "Millennial"][["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]])
loadings_millennials = pd.DataFrame(fa_millennials.loadings_, index=["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"])
loadings_millennials.columns = ["Factor 1", "Factor 2"]
loadings_millennials.to_excel("millennials_factor_loadings.xlsx", index=True)
print("Millennials Factor Loadings:")


# #Create heatmap
# plt.figure(figsize=(8, 6))  # Set figure size
# sns.heatmap(loadings_millennials, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# plt.title("Millennials Factor Loadings Spearman Correlation")
fa_genz = FactorAnalyzer(n_factors=2, rotation="varimax")
fa_genz.fit(mydata[mydata["Generation"] == "Gen Z"][["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"]])
loadings_genz = pd.DataFrame(fa_genz.loadings_, index=["Loyalty", "Trust", "Commitment", "Satisfaction", "Quality", "SM_Influence"])
loadings_genz.columns = ["Factor 1", "Factor 2"]
loadings_genz.to_excel("genz_factor_loadings.xlsx", index=True)

# #Create heatmap
# plt.figure(figsize=(8, 6))  # Set figure size
# sns.heatmap(loadings_genz, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# plt.title("Gen Z Factor Loadings Spearman Correlation Heatmap")
#print("Gen Z Factor Loadings:")
#loadings_genz

# Conducting Chi-Squared Test for Independence



# Create contingency tables
education_loyalty = pd.crosstab(mydata["Education"], mydata["Loyalty"])
occupation_loyalty = pd.crosstab(mydata["Occupation"], mydata["Loyalty"])

# # Display tables
# print("Education vs. Loyalty Contingency Table:")
# print(education_loyalty)

# print("\nOccupation vs. Loyalty Contingency Table:")
# print(occupation_loyalty)

# Perform Chi-Square test for Education vs. Loyalty
chi2_edu, p_edu, dof_edu, expected_edu = stats.chi2_contingency(education_loyalty)

# Perform Chi-Square test for Occupation vs. Loyalty
chi2_occ, p_occ, dof_occ, expected_occ = stats.chi2_contingency(occupation_loyalty)


# Display results
print(f"Chi-Square Statistic (Education vs. Loyalty): {chi2_edu:.3f}")
print(f"P-value (Education vs. Loyalty): {p_edu:.5f}")

print(f"\nChi-Square Statistic (Occupation vs. Loyalty): {chi2_occ:.3f}")
print(f"P-value (Occupation vs. Loyalty): {p_occ:.5f}")

Millennials Factor Loadings:
Chi-Square Statistic (Education vs. Loyalty): 49.723
P-value (Education vs. Loyalty): 0.11665

Chi-Square Statistic (Occupation vs. Loyalty): 31.365
P-value (Occupation vs. Loyalty): 0.80275
