In [None]:
# ANOVA
!pip install statsmodels
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Assemble the observations in long format: one percentage per (race, prompt)
# pair. Percentages are listed race by race, following the prompt order used
# in the 'Prompt' column.
# NOTE(review): within every race group the 'Short/English' and
# 'Short/Chinese' values are identical — confirm this is not a transcription
# error in the source numbers.
data = {
    'Race': [
        race
        for race in ('Asian', 'Black', 'Hispanic', 'White', 'Unknown')
        for _ in range(4)
    ],
    'Prompt': ['Long/English', 'Short/English', 'Long/Chinese', 'Short/Chinese'] * 5,
    'Percentage': [
        24.63, 74.35, 75.90, 74.35,      # Asian
        -4.77, -12.33, -13.33, -12.33,   # Black
        14.13, 0.85, -10.65, 0.85,       # Hispanic
        -9.92, -34.37, -38.98, -34.37,   # White
        3.62, -0.83, 14.73, -0.83,       # Unknown
    ],
}
df = pd.DataFrame(data)

# Linear model with both factors as categorical main effects (the formula has
# no interaction term).
model = ols('Percentage ~ C(Race) + C(Prompt)', data=df).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)

print(anova_results)


In [None]:
#Prep data for Stat Analysis
def extract_top_differences(differences_df, n=10, column='Total Difference'):
    """Return the rows with the largest values in ``column``.

    Rows are ranked by the *numeric* value of ``column``. Elsewhere in this
    notebook the column is coerced with ``pd.to_numeric``, which suggests it
    may be stored as strings; a plain sort on strings is lexicographic
    (``"9.5"`` would rank above ``"100"``) and would select the wrong rows.
    The ranking key is only used for ordering — the values in the returned
    frame are left exactly as they were in the input.

    Parameters
    ----------
    differences_df : pandas.DataFrame
        Frame containing ``column``.
    n : int, default 10
        Number of rows to return.
    column : str, default 'Total Difference'
        Column to rank by.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows of ``differences_df``, in descending order of
        ``column``. Values that cannot be parsed as numbers rank last.
    """
    ranked_df = differences_df.sort_values(
        by=column,
        ascending=False,
        # Rank on a numeric view of the column; unparseable entries become
        # NaN, which sort_values places last by default.
        key=lambda values: pd.to_numeric(values, errors='coerce'),
    )

    return ranked_df.head(n)

# Rank each language's differences and keep the leading rows.
# Relies on 'differences_english' and 'differences_chinese' already existing
# in the kernel; they are not created in this cell.
top_differences_english = extract_top_differences(differences_english)
top_differences_chinese = extract_top_differences(differences_chinese)

# Show both tables, each under its own heading, for a quick sanity check.
print("Top 10 Diseases and Differences (English):", top_differences_english, sep="\n")
print("\nTop 10 Diseases and Differences (Chinese):", top_differences_chinese, sep="\n")


In [None]:
#Prep part 2
# Reduce each differences table to the two columns used by the statistical
# test. Explicit copies are taken because a later cell assigns back into the
# 'Total Difference' column of these frames; writing into a frame produced by
# selection is what triggers pandas' SettingWithCopyWarning, and an explicit
# copy makes it unambiguous that the source tables are not affected.
english_condition_diff = differences_english[['Condition', 'Total Difference']].copy()

# Same two-column reduction for the Chinese table.
chinese_condition_diff = differences_chinese[['Condition', 'Total Difference']].copy()

# Optionally, print the new DataFrames to verify
print(english_condition_diff)
print(chinese_condition_diff)

In [None]:
#Stat Test - Mann-Whitney U Test

from scipy.stats import mannwhitneyu

# Make 'Total Difference' numeric in both frames. The frames are rebound to
# new objects via assign() instead of being written into column-by-column, so
# no assignment is made on a frame that was derived by selection (the pattern
# that triggers pandas' SettingWithCopyWarning). Values that cannot be parsed
# become NaN because of errors='coerce'.
english_condition_diff = english_condition_diff.assign(
    **{'Total Difference': pd.to_numeric(english_condition_diff['Total Difference'], errors='coerce')}
)
chinese_condition_diff = chinese_condition_diff.assign(
    **{'Total Difference': pd.to_numeric(chinese_condition_diff['Total Difference'], errors='coerce')}
)

# Drop the NaN produced by coercion before testing: mannwhitneyu's default
# nan_policy is 'propagate', so a single NaN would make both the statistic and
# the p-value NaN.
english_differences = english_condition_diff['Total Difference'].dropna()
chinese_differences = chinese_condition_diff['Total Difference'].dropna()

# Surface any silently discarded observations so the sample sizes behind the
# reported result are visible.
dropped_english = len(english_condition_diff) - len(english_differences)
dropped_chinese = len(chinese_condition_diff) - len(chinese_differences)
if dropped_english or dropped_chinese:
    print(
        f"Dropped non-numeric 'Total Difference' values: "
        f"{dropped_english} (English), {dropped_chinese} (Chinese)"
    )

# Perform the Mann-Whitney U test (two-sided: the distributions differ in
# either direction).
u_statistic, p_value = mannwhitneyu(english_differences, chinese_differences, alternative='two-sided')

print("Mann-Whitney U test results:")
print(f"U statistic: {u_statistic}")
print(f"P-value: {p_value}")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d7c77c8b-06d2-45fe-94df-58bfc0e72f73' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>