In [11]:
import pandas as pd
from scipy.stats import f_oneway

# Load the dataset
filename = 'book1.csv'
ethnicity_data = pd.read_csv(filename)

# Convert necessary columns to numeric
numeric_cols = ['Numerator', 'Denominator', 'Rate']
ethnicity_data[numeric_cols] = ethnicity_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Extract data for different ethnicities
hispanic = ethnicity_data.loc[ethnicity_data['Category'] == 'Race-Ethnicity', 'Rate'].dropna()
multi_race = ethnicity_data.loc[ethnicity_data['Category'] == 'Race-Ethnicity', 'Rate'].dropna()
black = ethnicity_data.loc[ethnicity_data['Category'] == 'Race-Ethnicity', 'Rate'].dropna()
asian = ethnicity_data.loc[ethnicity_data['Category'] == 'Race-Ethnicity', 'Rate'].dropna()
white = ethnicity_data.loc[ethnicity_data['Category'] == 'Race-Ethnicity', 'Rate'].dropna()

# Perform one-way ANOVA
f_statistic, p_value = f_oneway(hispanic, multi_race, black, asian, white)

# Print results
print(f"F-statistic: {f_statistic}")
print(f"P-value: {p_value}")

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: There is significant evidence that at least one ethnic group has a different rate.")
else:
    print("Fail to reject null hypothesis: There is no significant evidence that the ethnic groups have different rates.")


F-statistic: -3.2212785002325485e-33
P-value: nan
Fail to reject null hypothesis: There is no significant evidence that the ethnic groups have different rates.


In [13]:
import pandas as pd
from scipy.stats import f_oneway

# Load the dataset
filename = 'book1.csv'
insurance_data = pd.read_csv(filename)

# Convert necessary columns to numeric
numeric_cols = ['Numerator', 'Denominator', 'Rate']
insurance_data[numeric_cols] = insurance_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Extract data for different insurance categories
medi_cal = insurance_data.loc[insurance_data['Category'] == 'Health Insurance', 'Rate'].dropna()
private = insurance_data.loc[insurance_data['Category'] == 'Health Insurance', 'Rate'].dropna()
self_pay_uninsured = insurance_data.loc[insurance_data['Category'] == 'Health Insurance', 'Rate'].dropna()
other_government = insurance_data.loc[insurance_data['Category'] == 'Health Insurance', 'Rate'].dropna()

# Perform one-way ANOVA
f_statistic, p_value = f_oneway(medi_cal, private, self_pay_uninsured, other_government)

# Print results
print(f"F-statistic: {f_statistic}")
print(f"P-value: {p_value}")

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject null hypothesis: There is significant evidence that at least one insurance category has a different rate.")
else:
    print("Fail to reject null hypothesis: There is no significant evidence that the insurance categories have different rates.")


F-statistic: 0.0
P-value: 1.0
Fail to reject null hypothesis: There is no significant evidence that the insurance categories have different rates.


In [17]:
import pandas as pd
from scipy import stats

# Load the data from the CSV file
df = pd.read_csv('Book1.csv')

# Display the first few rows of the dataframe to understand its structure
print(df.head())

                      Indicator Name   Geography  \
0  Pregnancy-Related Mortality Ratio  California   
1  Pregnancy-Related Mortality Ratio  California   
2  Pregnancy-Related Mortality Ratio  California   
3  Pregnancy-Related Mortality Ratio  California   
4  Pregnancy-Related Mortality Ratio  California   

                                     Region       Year          Category  \
0                                       NaN  2019-2021  Total Population   
1                     North and Mid-Coastal  2019-2021  Total Population   
2  Northeastern and Northern Central Valley  2019-2021  Total Population   
3                   Southern Central Valley  2019-2021  Total Population   
4     Los Angeles - Santa Barbara - Ventura  2019-2021  Total Population   

        Subcategory Numerator  Denominator  Rate  
0  Total population       226      1287679  17.6  
1  Total population        38       261388  14.5  
2  Total population        32       152418    21  
3  Total population       

In [19]:
# Extract the rates for the two regions
sample1 = df[df['Region'] == 'North and Mid-Coastal']['Rate'].astype(float)
sample2 = df[df['Region'] == 'Southern Central Valley']['Rate'].astype(float)

# Perform the t-test
t_stat, p_value = stats.ttest_ind(sample1, sample2)

# Print the results
print('T-statistic:', t_stat)
print('P-value:', p_value)

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print('Reject the null hypothesis: There is a significant difference between the means of the two samples.')
else:
    print('Fail to reject the null hypothesis: There is no significant difference between the means of the two samples.')

T-statistic: -inf
P-value: 0.0
Reject the null hypothesis: There is a significant difference between the means of the two samples.


  res = hypotest_fun_out(*samples, **kwds)


In [23]:
# Import necessary libraries
import pandas as pd
from scipy.stats import pearsonr

# Load the data
book1_df = pd.read_csv('Book1.csv')

# Filter the data for the specified age groups
age_groups = book1_df[book1_df['Category'] == 'Age']

# Convert the 'Rate' column to numeric using .loc to avoid SettingWithCopyWarning
age_groups.loc[:, 'Rate'] = pd.to_numeric(age_groups['Rate'], errors='coerce')

# Remove rows with NaN values in 'Rate' and 'Subcategory'
age_groups = age_groups.dropna(subset=['Rate', 'Subcategory'])

# Map age groups to numerical values for correlation
age_mapping = {'Under 20': 1, '20-24 years': 2, '25-29 years': 3, '30-34 years': 4, '35-39 years': 5, '40 and older': 6}
age_groups.loc[:, 'Age_Numeric'] = age_groups['Subcategory'].map(age_mapping)

# Remove rows with NaN values in 'Age_Numeric'
age_groups = age_groups.dropna(subset=['Age_Numeric'])

# Perform Pearson correlation
correlation, p_value = pearsonr(age_groups['Age_Numeric'], age_groups['Rate'])

# Print the results
print('Pearson correlation coefficient:', correlation)
print('p-value:', p_value)

# Write the results to a text file with summary explanation
with open('correlation_results.txt', 'w') as file:
    file.write('Pearson correlation coefficient: ' + str(correlation) + '\
')
    file.write('p-value: ' + str(p_value) + '\
')
    file.write('\
Summary:\
')
    file.write('The Pearson correlation coefficient of 0.969 indicates a very strong positive correlation between age and the rate.\
')
    file.write('The p-value of 0.031 suggests that this correlation is statistically significant at the 5% significance level.\
')
    file.write('This means that as age increases, the rate also tends to increase, and this relationship is statistically significant.')

print('Results written to correlation_results.txt')

Pearson correlation coefficient: 0.9694603141611813
p-value: 0.03053968583881872
Results written to correlation_results.txt
