In [None]:
import cudf
# Read the tab-separated Open Food Facts product export with cuDF,
# using the first column as the index.
OFF_PRODUCTS_CSV = '/home/pavit21178/Nalin_OFF/Data/en.openfoodfacts.org.products.csv'
df = cudf.read_csv(OFF_PRODUCTS_CSV, sep='\t', index_col=0)

In [None]:
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests

In [None]:
# Print each column name next to its positional index.
for position, column_name in enumerate(df.columns):
    print(position, column_name)

In [None]:
# Keep only rows whose nova_group is 1, 2, 3 or 4; rows with any other value
# are dropped. This overwrites `df`, so every later cell sees the filtered frame.
df = df[df['nova_group'].isin([1,2,3,4])]

In [None]:
# Bar chart of the ten most frequent raw `categories_en` strings.
# value_counts runs on the cuDF series; only the ten counts are moved to pandas for plotting.
df['categories_en'].value_counts().head(10).to_pandas().plot(kind='bar',title='Top 10 categories_en',figsize=(10,5))

In [None]:
# Display the nova_group column of `df`.
df['nova_group']

In [None]:
# Copy the two columns used by the category analysis from cuDF into a pandas frame.
categories_df=df[['nova_group','categories_en']].to_pandas()

In [None]:
# Drop every row that has a missing value in any column of categories_df
# (nova_group or categories_en).
categories_df=categories_df.dropna()

In [None]:
# (rows, columns) of categories_df.
categories_df.shape

In [None]:
# Peek at the category strings of the rows at positions 1 through 9
# (the slice starts at 1, so the first row is not shown).
categories_df[1:10]['categories_en']

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms.community import louvain_communities
from collections import Counter

# Build one category-succession graph per NOVA group and draw them on a 2x2 grid.

# Bring the two columns needed here from the cuDF frame into pandas.
categories_df = df[['nova_group', 'categories_en']].to_pandas()

# Rows without a category string cannot contribute any edge.
categories_df = categories_df.dropna(subset=['categories_en'])

# Count how often each (whitespace-stripped) category label occurs.
all_categories = [cat.strip() for entry in categories_df['categories_en'] for cat in entry.split(',')]
category_counts = Counter(all_categories)

# Restrict the graphs to the most common labels to reduce clutter.
# NOTE: despite its name, `top_300_categories` holds the TOP_N_CATEGORIES (50) most common labels.
TOP_N_CATEGORIES = 50
top_300_categories = {category for category, _ in category_counts.most_common(TOP_N_CATEGORIES)}

# Per product, keep only the labels that are in the top set, in their original order.
# Labels are stored stripped so that 'A' and ' A' become the same graph node.
categories_df['filtered_categories'] = categories_df['categories_en'].apply(
    lambda entry: ','.join(
        label for label in (cat.strip() for cat in entry.split(',')) if label in top_300_categories
    )
)

# One panel per NOVA group.
fig, axes = plt.subplots(2, 2, figsize=(30, 30))
axes = axes.flatten()

for i, nova_group in enumerate(sorted(categories_df['nova_group'].unique())):
    # Filtered category strings of the products in this NOVA group.
    subset_df = categories_df[categories_df['nova_group'] == nova_group]['filtered_categories']

    # Directed graph: an edge A -> B means B directly follows A in some product's list.
    G = nx.DiGraph()
    for entry in subset_df:
        if entry:  # products with no top category yield an empty string
            # De-duplicate while keeping order; labels containing ':' are skipped
            # (presumably language-prefixed tags such as 'fr:...' -- confirm).
            categories_list = [cat for cat in dict.fromkeys(entry.split(',')) if ':' not in cat]
            G.add_edges_from(zip(categories_list, categories_list[1:]))

    # Keep only nodes whose total (in + out) degree is above 4.
    G = G.subgraph([n for n, d in G.degree() if d > 4])

    # Louvain communities, used only to colour the nodes.
    communities = louvain_communities(G, seed=42)
    community_map = {}
    for k, comm in enumerate(communities):
        for node in comm:
            community_map[node] = k

    # Edges are added without a 'weight' attribute, so d.get('weight', 1) is 1 for
    # every edge and this filter keeps all of them. edge_subgraph still matters:
    # it only retains nodes that are incident to a kept edge.
    threshold = 0.5
    top_edges = [(u, v) for u, v, d in G.edges(data=True) if d.get('weight', 1) > threshold]
    G = G.edge_subgraph(top_edges)

    # Node size grows with degree, clamped to the range [300, 1200].
    node_sizes = [min(1200, max(300, G.degree(node) * 150)) for node in G.nodes]

    # Seeded layout so the figure is identical on every run.
    pos = nx.spring_layout(G, k=1.5, iterations=150, seed=42)

    # Nodes coloured by community index using the Set3 colormap.
    nx.draw_networkx_nodes(G, pos, ax=axes[i], node_size=node_sizes,
                           node_color=[community_map[node] for node in G.nodes],
                           cmap=plt.cm.Set3, alpha=0.85)

    # Very transparent, thin arrows so dense areas stay readable.
    nx.draw_networkx_edges(G, pos, ax=axes[i], alpha=0.05, arrowstyle='-|>', arrowsize=10, width=0.8)

    # Label only nodes whose degree in the final graph is above 4.
    high_degree_nodes = [node for node in G.nodes if G.degree(node) > 4]
    nx.draw_networkx_labels(G, pos, ax=axes[i], labels={node: node for node in high_degree_nodes},
                            font_size=16, font_color='black')

    axes[i].set_title(f'NOVA {nova_group}', fontsize=30)
    axes[i].axis('off')

# Extra padding between the panels.
plt.tight_layout(pad=3.0)
plt.show()

In [None]:
def create_and_display_graph(nova_group, seed=42):
    """Draw the directed category-succession graph for one NOVA group.

    Reads the module-level `categories_df`. For every product in the group,
    consecutive category labels A, B in its `categories_en` string add an
    edge A -> B. The graph is drawn in a new 12x8 figure and shown.

    Parameters
    ----------
    nova_group : value compared against `categories_df['nova_group']`.
    seed : seed passed to `nx.spring_layout` so the layout is reproducible.
    """
    # Category strings of the products in this NOVA group.
    subset_df = categories_df[categories_df['nova_group'] == nova_group]['categories_en']

    G = nx.DiGraph()
    for entry in subset_df:
        # Strip labels, de-duplicate in order, and drop empty labels and labels containing ':'.
        stripped = (cat.strip() for cat in entry.split(','))
        categories_list = [cat for cat in dict.fromkeys(stripped) if cat and ':' not in cat]

        # Link each label to the one that follows it.
        G.add_edges_from(zip(categories_list, categories_list[1:]))

    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=1, seed=seed)
    nx.draw(G, pos, with_labels=True, node_size=3000, node_color='skyblue', font_size=10, font_weight='bold', arrowstyle='-|>', arrowsize=20)
    plt.title(f'Category Order Graph - Class {nova_group}')
    plt.show()

# Draw one graph per NOVA group, in ascending group order.
nova_groups_in_order = sorted(categories_df['nova_group'].unique())
for nova_group in nova_groups_in_order:
    create_and_display_graph(nova_group)

In [None]:
import pandas as pd
from collections import defaultdict
import plotly.graph_objects as go
# Use a small positional sample (rows 1..499) for the Sankey diagrams.
# The sample gets its own name so `categories_df` stays intact and re-running
# this cell always works on the same rows.
sankey_df = categories_df.iloc[1:500]

# transition_counts[nova_group][(src, dst)] = number of products listing dst right after src.
transition_counts = defaultdict(lambda: defaultdict(int))

for nova_group, entry in zip(sankey_df['nova_group'], sankey_df['categories_en']):
    # Strip labels, de-duplicate in order, and drop empty labels and labels containing ':'.
    stripped = (cat.strip() for cat in entry.split(','))
    categories_list = [cat for cat in dict.fromkeys(stripped) if cat and ':' not in cat]

    for pair in zip(categories_list, categories_list[1:]):
        transition_counts[nova_group][pair] += 1

# Node labels shared by all diagrams; sorted so the indices are stable between runs.
all_nodes = sorted({
    node
    for transitions in transition_counts.values()
    for pair in transitions
    for node in pair
})
node_indices = {node: i for i, node in enumerate(all_nodes)}

# One Sankey diagram per NOVA group, in ascending group order.
for nova_group, transitions in sorted(transition_counts.items()):
    source_indices = [node_indices[src] for src, _ in transitions]
    target_indices = [node_indices[dst] for _, dst in transitions]
    values = list(transitions.values())

    sankey_fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_nodes
        ),
        link=dict(
            source=source_indices,
            target=target_indices,
            value=values
        )
    ))

    sankey_fig.update_layout(
        title_text=f"Category Transition Sankey Diagram - Class {nova_group}",
        font_size=8,
        width=1200,
        height=800
    )
    sankey_fig.show()


In [None]:
# Print each column name next to its positional index.
for position, column_name in enumerate(df.columns):
    print(position, column_name)

In [None]:
# Frequency of each raw `countries_en` string (a string may list several countries).
df['countries_en'].value_counts()

In [None]:
# Copy nova_group and countries_en from cuDF into a pandas frame (missing values still included).
countries_en=df[['nova_group','countries_en']].to_pandas()

In [None]:
# Display the countries frame.
countries_en

In [None]:
# pandas copy of nova_group / countries_en with every incomplete row removed.
countries_en = (
    df[['nova_group', 'countries_en']]
    .to_pandas()
    .dropna()
)

In [None]:
# (rows, columns) of countries_en.
countries_en.shape

In [None]:
# pandas copy of nova_group / countries_en with every incomplete row removed.
countries_en = (
    df[['nova_group', 'countries_en']]
    .to_pandas()
    .dropna()
)




In [None]:
# Display the countries frame.
countries_en

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Expand the comma-separated `countries_en` strings to one row per (product, country).
# Vectorized split/explode replaces a per-row Python loop. The index is reset
# first so the explode does not depend on the index being unique.
expanded_df = (
    countries_en.reset_index(drop=True)
    .assign(countries_en=lambda frame: frame['countries_en'].str.split(','))
    .explode('countries_en')
    .assign(countries_en=lambda frame: frame['countries_en'].str.strip())
)

# Drop entries that are empty after stripping (e.g. from a trailing comma).
expanded_df = expanded_df[expanded_df['countries_en'] != ''].reset_index(drop=True)

In [None]:
# Display the expanded frame.
expanded_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Expand the comma-separated `countries_en` strings to one row per (product, country).
# Vectorized split/explode replaces a per-row Python loop. The index is reset
# first so the explode does not depend on the index being unique.
expanded_df = (
    countries_en.reset_index(drop=True)
    .assign(countries_en=lambda frame: frame['countries_en'].str.split(','))
    .explode('countries_en')
    .assign(countries_en=lambda frame: frame['countries_en'].str.strip())
)

# Drop entries that are empty after stripping (e.g. from a trailing comma).
expanded_df = expanded_df[expanded_df['countries_en'] != ''].reset_index(drop=True)

# Wedge colours: the ten colours of matplotlib's 'tab10' palette.
colors = plt.get_cmap('tab10').colors

# Label formatter for pie wedges: whole-number percentage, e.g. 12.4 -> '12%'.
# Despite the function's name, the '%' sign is part of the label.
def without_percentage(pct):
    """Return `pct` formatted with zero decimals and a trailing '%'."""
    return f'{pct:.0f}%'

# NOVA groups present in the expanded data, in ascending order.
sorted_nova_groups = sorted(expanded_df['nova_group'].unique())

# One panel per NOVA group on a 2x2 grid.
fig, axs = plt.subplots(2, 2, figsize=(14, 12))
axs = axs.flatten()

for nova_group, ax in zip(sorted_nova_groups, axs):
    # Ten most frequent countries within this NOVA group.
    group_data = expanded_df[expanded_df['nova_group'] == nova_group]
    top_countries = group_data['countries_en'].value_counts().head(10)

    wedges, _, autotexts = ax.pie(
        top_countries,
        autopct=without_percentage,  # whole-number percentage labels, e.g. '12%'
        startangle=90,
        colors=colors,
        wedgeprops={'edgecolor': 'white'},
        pctdistance=0.85,  # places the labels on the ring outside the 0.70 hole
        explode=[0.05] * len(top_countries),  # pull every wedge out slightly
    )

    # Small bold black labels, centred on their anchor point.
    for autotext in autotexts:
        autotext.set_fontsize(10)
        autotext.set_weight('bold')
        autotext.set_color('black')
        autotext.set_ha('center')
        autotext.set_va('center')

    # A white disc over the centre turns the pie into a doughnut.
    centre_circle = plt.Circle((0, 0), 0.70, fc='white')
    ax.add_artist(centre_circle)

    # Legend to the right of the panel.
    ax.legend(wedges, top_countries.index, title="Country", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'NOVA {nova_group}', fontsize=14)

    # Equal aspect ratio keeps the doughnut circular.
    ax.axis('equal')

# Hide any panel that did not receive a NOVA group.
for unused_ax in axs[len(sorted_nova_groups):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Cross-tabulate NOVA group (rows) against country (columns).
contingency_table = pd.crosstab(
    index=expanded_df['nova_group'],
    columns=expanded_df['countries_en'],
)

# Show the table under a heading.
print("Contingency Table:", contingency_table, sep="\n")

In [None]:
# Chi-square test of independence on the current `contingency_table`.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square Test Results:")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")

# Bonferroni adjustment. Only this one p-value is passed (as a 1-d list),
# so the adjusted value equals the raw one.
alpha = 0.05
reject, corrected_p_values, _, _ = multipletests([p], alpha=alpha, method='bonferroni')

print("\nCorrected P-values:")
print(corrected_p_values)

# Cramér's V effect size: sqrt(chi2 / (n * min(r - 1, k - 1))),
# with n the total count and r, k the table's row and column counts.
n = contingency_table.values.sum()
r, k = contingency_table.shape
V = np.sqrt(chi2 / n / min(k - 1, r - 1))

print(f"Cramér's V: {V}")

In [None]:
# pandas copy of nova_group / brands with incomplete rows removed.
df_brands = df[['nova_group', 'brands']].to_pandas()
df_brands = df_brands.dropna()

# Bar chart of the ten most frequent brand strings (the column is `brands`).
df_brands['brands'].value_counts().head(10).plot(kind='bar',title='Top 10 brands',figsize=(10,5))

In [None]:
# Cross-tabulate NOVA group (rows) against brand (columns).
contingency_table = pd.crosstab(
    index=df_brands['nova_group'],
    columns=df_brands['brands'],
)

# Show the table under a heading.
print("Contingency Table:", contingency_table, sep="\n")

In [None]:
# Chi-square test of independence on the current `contingency_table`.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square Test Results:")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")

# Bonferroni adjustment. Only this one p-value is passed (as a 1-d list),
# so the adjusted value equals the raw one.
alpha = 0.05
reject, corrected_p_values, _, _ = multipletests([p], alpha=alpha, method='bonferroni')

print("\nCorrected P-values:")
print(corrected_p_values)

# Cramér's V effect size: sqrt(chi2 / (n * min(r - 1, k - 1))),
# with n the total count and r, k the table's row and column counts.
n = contingency_table.values.sum()
r, k = contingency_table.shape
V = np.sqrt(chi2 / n / min(k - 1, r - 1))

print(f"Cramér's V: {V}")


In [None]:
import pandas as pd
import re

# pandas copy of nova_group / allergens, keeping only rows with an allergens string.
allergens_df = df[['nova_group', 'allergens']].to_pandas()
allergens_df = allergens_df.dropna(subset=['allergens'])

# Only tags of the exact form 'en:<word>' are kept; the captured word is the allergen name.
pattern = re.compile(r'^en:(\w+)$')

# One row per (product, tag): split the comma-separated string and explode it.
# The index is reset first so the explode does not depend on the index being unique.
allergen_tags = (
    allergens_df.reset_index(drop=True)
    .assign(allergen=lambda frame: frame['allergens'].str.split(','))
    .explode('allergen')
)

# Tags that do not match the pattern become NaN here and are dropped below.
allergen_tags['allergen'] = (
    allergen_tags['allergen'].str.strip().str.extract(pattern.pattern, expand=False)
)
expanded_df = (
    allergen_tags.dropna(subset=['allergen'])[['nova_group', 'allergen']]
    .reset_index(drop=True)
)

# Print the expanded frame.
print(expanded_df)


In [None]:
# Display the allergens frame.
allergens_df

In [None]:
import pandas as pd
import re

# pandas copy of nova_group / allergens, keeping only rows with an allergens string.
allergens_df = df[['nova_group', 'allergens']].to_pandas()
allergens_df = allergens_df.dropna(subset=['allergens'])

# Only tags of the exact form 'en:<word>' are kept; the captured word is the allergen name.
pattern = re.compile(r'^en:(\w+)$')

# One row per (product, tag): split the comma-separated string and explode it.
# The index is reset first so the explode does not depend on the index being unique.
allergen_tags = (
    allergens_df.reset_index(drop=True)
    .assign(allergen=lambda frame: frame['allergens'].str.split(','))
    .explode('allergen')
)

# Tags that do not match the pattern become NaN here and are dropped below.
allergen_tags['allergen'] = (
    allergen_tags['allergen'].str.strip().str.extract(pattern.pattern, expand=False)
)
expanded_df = (
    allergen_tags.dropna(subset=['allergen'])[['nova_group', 'allergen']]
    .reset_index(drop=True)
)

# Wedge colours: the ten colours of matplotlib's 'tab10' palette.
colors = plt.get_cmap('tab10').colors

# Label formatter for pie wedges: whole-number percentage, e.g. 12.4 -> '12%'.
# Despite the function's name, the '%' sign is part of the label.
def without_percentage(pct):
    """Return `pct` formatted with zero decimals and a trailing '%'."""
    return f'{pct:.0f}%'

# NOVA groups present in the expanded data, in ascending order.
sorted_nova_groups = sorted(expanded_df['nova_group'].unique())

# One panel per NOVA group on a 2x2 grid.
fig, axs = plt.subplots(2, 2, figsize=(14, 12))
axs = axs.flatten()

for nova_group, ax in zip(sorted_nova_groups, axs):
    # Ten most frequent allergens within this NOVA group.
    group_data = expanded_df[expanded_df['nova_group'] == nova_group]
    top_allergens = group_data['allergen'].value_counts().head(10)

    wedges, _, autotexts = ax.pie(
        top_allergens,
        autopct=without_percentage,  # whole-number percentage labels, e.g. '12%'
        startangle=90,
        colors=colors,
        wedgeprops={'edgecolor': 'white'},
        pctdistance=0.85,  # places the labels on the ring outside the 0.70 hole
        explode=[0.05] * len(top_allergens),  # pull every wedge out slightly
    )

    # Small bold black labels, centred on their anchor point.
    for autotext in autotexts:
        autotext.set_fontsize(10)
        autotext.set_weight('bold')
        autotext.set_color('black')
        autotext.set_ha('center')
        autotext.set_va('center')

    # A white disc over the centre turns the pie into a doughnut.
    centre_circle = plt.Circle((0, 0), 0.70, fc='white')
    ax.add_artist(centre_circle)

    # Legend to the right of the panel.
    ax.legend(wedges, top_allergens.index, title="Allergen", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))

    ax.set_title(f'NOVA {nova_group}', fontsize=14)

    # Equal aspect ratio keeps the doughnut circular.
    ax.axis('equal')

# Hide any panel that did not receive a NOVA group.
for unused_ax in axs[len(sorted_nova_groups):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Display the expanded frame.
expanded_df

In [None]:
# Cross-tabulate NOVA group (rows) against allergen (columns).
contingency_table = pd.crosstab(
    index=expanded_df['nova_group'],
    columns=expanded_df['allergen'],
)

# Show the table under a heading.
print("Contingency Table:", contingency_table, sep="\n")

In [None]:
# Chi-square test of independence on the current `contingency_table`.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square Test Results:")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")

# Bonferroni adjustment. Only this one p-value is passed (as a 1-d list),
# so the adjusted value equals the raw one.
alpha = 0.05
reject, corrected_p_values, _, _ = multipletests([p], alpha=alpha, method='bonferroni')

print("\nCorrected P-values:")
print(corrected_p_values)

# Cramér's V effect size: sqrt(chi2 / (n * min(r - 1, k - 1))),
# with n the total count and r, k the table's row and column counts.
n = contingency_table.values.sum()
r, k = contingency_table.shape
V = np.sqrt(chi2 / n / min(k - 1, r - 1))

print(f"Cramér's V: {V}")


In [None]:
# Display nova_group and main_category_en as a pandas frame.
df[['nova_group', 'main_category_en']].to_pandas()

In [None]:
# pandas copy of nova_group / main_category_en with every incomplete row removed.
df_categories = (
    df[['nova_group', 'main_category_en']]
    .to_pandas()
    .dropna()
)


In [None]:
# Display the main-category frame.
df_categories

In [None]:
# Cross-tabulate NOVA group (rows) against main category (columns).
contingency_table = pd.crosstab(df_categories['nova_group'], df_categories['main_category_en'])

# Chi-square test of independence on that table.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square Test Results:")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")

# Bonferroni adjustment. Only this one p-value is passed (as a 1-d list),
# so the adjusted value equals the raw one.
alpha = 0.05
reject, corrected_p_values, _, _ = multipletests([p], alpha=alpha, method='bonferroni')

print("\nCorrected P-values:")
print(corrected_p_values)

# Cramér's V effect size: sqrt(chi2 / (n * min(r - 1, k - 1))),
# with n the total count and r, k the table's row and column counts.
n = contingency_table.values.sum()
r, k = contingency_table.shape
V = np.sqrt(chi2 / n / min(k - 1, r - 1))

print(f"Cramér's V: {V}")


In [None]:
# Bar chart of the ten most frequent `food_groups_en` values.
# value_counts runs on the cuDF series; only the ten counts are moved to pandas for plotting.
df['food_groups_en'].value_counts().head(10).to_pandas().plot(kind='bar',title='Top 10 food_groups',figsize=(10,5))

In [None]:
# pandas copy of nova_group / food_groups_en with every incomplete row removed.
df_food_groups = (
    df[['nova_group', 'food_groups_en']]
    .to_pandas()
    .dropna()
)



In [None]:
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests

# Cross-tabulate NOVA group (rows) against food group (columns).
# The table must be built from df_food_groups in this cell; otherwise the test
# below runs on whatever `contingency_table` an earlier cell left behind.
contingency_table = pd.crosstab(df_food_groups['nova_group'], df_food_groups['food_groups_en'])

# Chi-square test of independence on that table.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square Test Results:")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")

# Bonferroni adjustment. Only this one p-value is passed (as a 1-d list),
# so the adjusted value equals the raw one.
alpha = 0.05
reject, corrected_p_values, _, _ = multipletests([p], alpha=alpha, method='bonferroni')

print("\nCorrected P-values:")
print(corrected_p_values)

# Cramér's V effect size: sqrt(chi2 / (n * min(r - 1, k - 1))),
# with n the total count and r, k the table's row and column counts.
n = contingency_table.values.sum()
r, k = contingency_table.shape
V = np.sqrt(chi2 / n / min(k - 1, r - 1))

print(f"Cramér's V: {V}")

In [None]:
# Select nova_group and nutriscore_grade from `df` (no conversion to pandas yet).
nutriscore_grade_df = df[['nova_group','nutriscore_grade']]

In [None]:
# Keep only rows whose nutriscore_grade is exactly 'a', 'b', 'c', 'd' or 'e';
# rows with any other value are dropped.
nutriscore_grade_df = nutriscore_grade_df[nutriscore_grade_df['nutriscore_grade'].isin(['a','b','c','d','e'])]

In [None]:
# Convert the filtered frame to pandas for pd.crosstab.
nutriscore_grade_df_pandas = nutriscore_grade_df.to_pandas()

In [None]:
import pandas as pd

# Cross-tabulate NOVA group (rows) against Nutri-Score grade (columns) and print the table.
contingency_table = pd.crosstab(nutriscore_grade_df_pandas['nova_group'], nutriscore_grade_df_pandas['nutriscore_grade'])
print(contingency_table)


In [None]:
# Manual sum of five hard-coded counts (evaluates to 415181).
# NOTE(review): presumably copied by hand from the Nutri-Score table output -- confirm,
# and prefer computing it from `contingency_table`.
35294+  55070 + 88337 + 137484 + 98996

In [None]:

# Percentage that 98996 represents of 415181 (hard-coded values).
98996/415181*100

In [None]:
# Display the Nutri-Score frame.
nutriscore_grade_df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Keep only products whose Nutri-Score grade is one of a-e, then move to pandas.
nutriscore_grade_df = df[['nova_group','nutriscore_grade']]
nutriscore_grade_df = nutriscore_grade_df[nutriscore_grade_df['nutriscore_grade'].isin(['a','b','c','d','e'])]
nutriscore_grade_df_pandas = nutriscore_grade_df.to_pandas()

# Cross-tabulate NOVA group (rows) against Nutri-Score grade (columns).
contingency_table = pd.crosstab(nutriscore_grade_df_pandas['nova_group'], nutriscore_grade_df_pandas['nutriscore_grade'])
print(contingency_table)

# Upper-case whichever grades are present, so a missing grade cannot cause a
# length mismatch with a hard-coded label list.
contingency_table.columns = [str(grade).upper() for grade in contingency_table.columns]

# Stacked bars: one bar per NOVA group, one segment per grade.
ax = contingency_table.plot(kind='bar', stacked=True)
ax.set_ylabel('Count')
ax.set_xlabel('')
ax.set_title('')
ax.legend(title='Nutri-Score Grade')

# Horizontal 'NOVA n' tick labels derived from the groups actually in the table.
ax.set_xticks(range(len(contingency_table.index)))
ax.set_xticklabels([f'NOVA {int(group)}' for group in contingency_table.index], rotation=0)

plt.show()


In [None]:

from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests

# Chi-square test of independence on the current `contingency_table`.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square Test Results:")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")

# Bonferroni adjustment. Only this one p-value is passed (as a 1-d list),
# so the adjusted value equals the raw one.
alpha = 0.05
reject, corrected_p_values, _, _ = multipletests([p], alpha=alpha, method='bonferroni')

print("\nCorrected P-values:")
print(corrected_p_values)

# Cramér's V effect size: sqrt(chi2 / (n * min(r - 1, k - 1))),
# with n the total count and r, k the table's row and column counts.
n = contingency_table.values.sum()
r, k = contingency_table.shape
V = np.sqrt(chi2 / n / min(k - 1, r - 1))

print(f"Cramér's V: {V}")



In [None]:
# Select nova_group and ecoscore_grade from `df` (no conversion to pandas yet).
ecoscore_grade_df = df[['nova_group','ecoscore_grade']]

In [None]:
# Frequency of each ecoscore_grade value before any filtering of grades.
ecoscore_grade_df['ecoscore_grade'].value_counts()

In [None]:
# Keep only rows whose ecoscore_grade is exactly 'a', 'b', 'c', 'd' or 'e';
# rows with any other value are dropped.
ecoscore_grade_df = ecoscore_grade_df[ecoscore_grade_df['ecoscore_grade'].isin(['a','b','c','d','e'])]

In [None]:
# Display the Eco-Score frame.
ecoscore_grade_df

In [None]:
# Convert the filtered frame to pandas for pd.crosstab.
ecoscore_grade_df_pandas = ecoscore_grade_df.to_pandas()

import pandas as pd

# Cross-tabulate NOVA group (rows) against Eco-Score grade (columns).
contingency_table = pd.crosstab(ecoscore_grade_df_pandas['nova_group'], ecoscore_grade_df_pandas['ecoscore_grade'])



In [None]:
# Display the current contingency table.
contingency_table

In [None]:
# Manual sum of five hard-coded counts (evaluates to 212340).
# NOTE(review): presumably copied by hand from the Eco-Score table output -- confirm,
# and prefer computing it from `contingency_table`.
12811+	74865+	50302+	54488+	19874

In [None]:
# Percentage that 19874 represents of 212340 (hard-coded values).
19874/212340*100

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Keep only products whose Eco-Score grade is one of a-e, then move to pandas.
ecoscore_grade_df = df[['nova_group','ecoscore_grade']]
ecoscore_grade_df = ecoscore_grade_df[ecoscore_grade_df['ecoscore_grade'].isin(['a','b','c','d','e'])]
ecoscore_grade_df_pandas = ecoscore_grade_df.to_pandas()

# Cross-tabulate NOVA group (rows) against Eco-Score grade (columns).
contingency_table = pd.crosstab(ecoscore_grade_df_pandas['nova_group'], ecoscore_grade_df_pandas['ecoscore_grade'])

# Upper-case whichever grades are present, so a missing grade cannot cause a
# length mismatch with a hard-coded label list.
contingency_table.columns = [str(grade).upper() for grade in contingency_table.columns]

# Stacked bars: one bar per NOVA group, one segment per grade.
ax = contingency_table.plot(kind='bar', stacked=True)
ax.set_ylabel('Count')
ax.set_xlabel('')
ax.set_title('')
ax.legend(title='Eco-Score Grade')

# Horizontal 'NOVA n' tick labels derived from the groups actually in the table.
ax.set_xticks(range(len(contingency_table.index)))
ax.set_xticklabels([f'NOVA {int(group)}' for group in contingency_table.index], rotation=0)

plt.show()


In [None]:

from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests

# Chi-square test of independence on the current `contingency_table`.
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("\nChi-square Test Results:")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")

# Bonferroni adjustment. Only this one p-value is passed (as a 1-d list),
# so the adjusted value equals the raw one.
alpha = 0.05
reject, corrected_p_values, _, _ = multipletests([p], alpha=alpha, method='bonferroni')

print("\nCorrected P-values:")
print(corrected_p_values)

# Cramér's V effect size: sqrt(chi2 / (n * min(r - 1, k - 1))),
# with n the total count and r, k the table's row and column counts.
n = contingency_table.values.sum()
r, k = contingency_table.shape
V = np.sqrt(chi2 / n / min(k - 1, r - 1))

print(f"Cramér's V: {V}")

