## Q1

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Sample data for Problem 1
data = {
    "Group": ["Young Adults", "Old Seniors"],
    "Late Payment": [74, 48],
    "No Late Payment": [606, 632]
}

# Create DataFrame
df = pd.DataFrame(data)

# Calculate late payment rates
df['Total'] = df['Late Payment'] + df['No Late Payment']
df['Late Payment Rate'] = df['Late Payment'] / df['Total']

# Calculate relative risk
relative_risk = df['Late Payment Rate'].iloc[0] / df['Late Payment Rate'].iloc[1]

# Calculate odds
df['Odds Late Payment'] = df['Late Payment'] / df['No Late Payment']

# Calculate odds ratio
odds_ratio = df['Odds Late Payment'].iloc[0] / df['Odds Late Payment'].iloc[1]

# Prepare data for chi-squared test
contingency_table = df[['Late Payment', 'No Late Payment']].to_numpy()

# Chi-squared test without continuity correction
chi2, p, dof, expected = chi2_contingency(contingency_table, correction=False)

# Chi-squared test with continuity correction
chi2_corrected, p_corrected, _, _ = chi2_contingency(contingency_table, correction=True)

# Results
late_payment_rates = df['Late Payment Rate'].tolist()
results = {
    "Late Payment Rates": late_payment_rates,
    "Relative Risk": relative_risk,
    "Odds Ratio": odds_ratio,
    "Chi-squared without correction": (chi2, p),
    "Chi-squared with correction": (chi2_corrected, p_corrected),
    "Degrees of Freedom": dof
}

results


{'Late Payment Rates': [0.10882352941176471, 0.07058823529411765],
 'Relative Risk': 1.5416666666666667,
 'Odds Ratio': 1.607810781078108,
 'Chi-squared without correction': (6.0870256097883955, 0.013617814651122878),
 'Chi-squared with correction': (5.627797346328028, 0.017677831043777587),
 'Degrees of Freedom': 1}

|  Late Payment Rates      | Young Adults | Old Seniors |
|-------------|---------------|--------------------|
| Late Payment     | 0.1088      | 0.0706           |


|  Relative Risk    | Risk Ratio |
|-------------|---------------|
| Late Payment     | 1.5417      |

| Mattrix  | Odds Ratio |
|-------------|---------------|
| Odds Ratio     | 1.6078      |

| Mattrix  | value |
|-------------|---------------|
| Chi-Squared Statistic     | 6.0870      |
| P-value| 0.0136|


| Mattrix  | value |
|-------------|---------------|
| Chi-Squared Statistic(With Correction)     | 5.6278      |
| P-value| 0.0177 |

## Q2

In [13]:
import pandas as pd

# Load the data from MM5.xlsx to analyze M&M's candy color proportions
mm_data_path = r'/Users/tangjiahong/Dropbox/HW_statistics/assignment3/MM5.xlsx'
data = pd.read_excel(mm_data_path)


# Display the first few rows of the data to understand its structure
data.head()


Unnamed: 0,Color
0,Red
1,Orange
2,Red
3,Green
4,Green


In [14]:
# Count the observed frequencies of each color
observed_frequencies = data['Color'].value_counts()
observed_frequencies


Color
Blue      131
Green     117
Orange    110
Yellow    109
Red        94
Brown      89
Name: count, dtype: int64

In [15]:
import scipy.stats as stats

# Calculate the total number of observations
total_observations = observed_frequencies.sum()

# Define the expected proportions
expected_proportions = {
    'Blue': 0.2,
    'Brown': 0.1,
    'Green': 0.18,
    'Orange': 0.17,
    'Red': 0.15,
    'Yellow': 0.2
}

# Calculate the expected frequencies
expected_frequencies = {color: proportion * total_observations for color, proportion in expected_proportions.items()}

# Convert to pandas Series for easier handling
expected_frequencies = pd.Series(expected_frequencies)

# Calculate the Chi-square statistic
chi_square_statistic = ((observed_frequencies - expected_frequencies) ** 2 / expected_frequencies).sum()

# Determine the critical value for alpha = 0.05 and df = 5
alpha = 0.05
df = len(expected_frequencies) - 1
critical_value = stats.chi2.ppf(1 - alpha, df)

chi_square_statistic, critical_value


(12.389441930618402, 11.070497693516351)

| Mattrix  | value |
|-------------|---------------|
| Chi-square statistic     | 12.3894      |
| Critical Value | 11.0705 |


## Q3

In [1]:
import pandas as pd

# Load the data from the provided Excel file
file_path = r'C:\Users\User\Dropbox\HW_statistics\assignment3\SocialMedia6.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows of the data to understand its structure
data.head()


Unnamed: 0,Country,Use Social Media?
0,United Kingdom,Yes
1,United Kingdom,Yes
2,United Kingdom,Yes
3,United Kingdom,Yes
4,United Kingdom,Yes


In [2]:
# Calculate the sample proportions of adults using social media for each country
proportions = data['Use Social Media?'].groupby(data['Country']).value_counts(normalize=True).unstack().fillna(0)

# Extract proportions of 'Yes' responses
sample_proportions = proportions.get('Yes', 0)
sample_proportions


Country
China             0.619898
Russia            0.555556
United Kingdom    0.621188
United States     0.638387
Name: Yes, dtype: float64

| Mattrix  | value |
|-------------|---------------|
| China     | 0.6199      |
| Russia | 0.5556 |
| United Kingdom | 0.6212 |
| United States | 0.6384 |


In [5]:
from scipy.stats import chi2_contingency

# Construct the contingency table
contingency_table = data['Use Social Media?'].groupby(data['Country']).value_counts().unstack().fillna(0)

# Perform the chi-square test for homogeneity
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

# Critical value for alpha=0.05
critical_value = chi2.ppf(0.95, dof)

chi2_stat, p_value, critical_value, dof, expected


(9.191543265923958,
 0.0268496914951318,
 7.814727903251179,
 3,
 array([[152.21090259, 239.78909741],
        [195.6997319 , 308.3002681 ],
        [241.90661305, 381.09338695],
        [279.18275246, 439.81724754]]))

In [4]:
from scipy.stats import chi2  # Import necessary function for critical value calculation

# Critical value for alpha=0.05
critical_value = chi2.ppf(0.95, dof)

# Display the chi-square statistic, p-value, critical value, and degrees of freedom
chi2_stat, p_value, critical_value, dof


(9.191543265923958, 0.0268496914951318, 7.814727903251179, 3)

| Mattrix  | value |
|-------------|---------------|
| Critical value     | 7.8147      |
| P-value | 0.0268 |


## Q4

In [7]:
import pandas as pd

# Load the data from the provided Excel file
file_path = r'C:\Users\User\Dropbox\HW_statistics\assignment3\AutoQuality6.xlsx'
auto_quality_data = pd.read_excel(file_path)

# Display the first few rows of the data to understand its structure
data.head()


Unnamed: 0,Country,Use Social Media?
0,United Kingdom,Yes
1,United Kingdom,Yes
2,United Kingdom,Yes
3,United Kingdom,Yes
4,United Kingdom,Yes


In [8]:
# Construct the contingency table
contingency_table_auto_quality = pd.crosstab(auto_quality_data['Quality Rating'], auto_quality_data['Education'])

# Perform the chi-square test of independence
chi2_stat_auto_quality, p_value_auto_quality, dof_auto_quality, expected_auto_quality = chi2_contingency(contingency_table_auto_quality)

# Display the chi-square statistic, p-value, degrees of freedom, and expected frequencies
chi2_stat_auto_quality, p_value_auto_quality, dof_auto_quality, expected_auto_quality


(11.41768221374378,
 0.07629399391931052,
 6,
 array([[38.16346154, 29.68269231, 28.16826923, 29.98557692],
        [37.55769231, 29.21153846, 27.72115385, 29.50961538],
        [50.27884615, 39.10576923, 37.11057692, 39.50480769]]))

| Mattrix  | value |
|-------------|---------------|
| Degree of freedom     | 6     |
| P-value | 0.0763 |
