In [7]:
# pandas is imported locally because this cell precedes the notebook's import cell;
# without it, a fresh-kernel "Restart & Run All" fails here with NameError on `pd`.
import pandas as pd

# Raw, header-less read of the numeric capstone file.
# NOTE(review): the later cells visible in this notebook reload the same CSV into
# `numDat` and never reference `df_capstone_num` — confirm this cell is still needed.
df_capstone_num = pd.read_csv('./rmpCapstoneNum.csv', header=None)

In [15]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import levene

In [16]:
# Read the header-less CSV and attach descriptive column labels (in file order)
# in one expression; a label/column count mismatch still raises ValueError.
numDat = pd.read_csv('rmpCapstoneNum.csv', header=None).set_axis(
    [
        'Average Rating',
        'Average Difficulty',
        'Number of ratings',
        'Received a pepper',
        'Proportion of students that said they would take the class again',
        'Number of ratings coming from online classes',
        'Male Professor',
        'Female Professor',
    ],
    axis=1,
)

In [17]:
# Drop every row that has a missing value in any column.
# NOTE: no minimum "Number of ratings" threshold is applied in this cell — only
# rows containing NaN are removed.
numDat = numDat.dropna()

# Keep only rows with an unambiguous gender flag: discard rows where
# 'Male Professor' and 'Female Professor' are both 0 or both 1.
both_flags_unset = (numDat['Male Professor'] == 0) & (numDat['Female Professor'] == 0)
both_flags_set = (numDat['Male Professor'] == 1) & (numDat['Female Professor'] == 1)

# Take an explicit copy: later cells add new columns to `numDat`, so it should be an
# independent frame rather than a selection derived from the pre-filter data.
numDat = numDat.loc[~(both_flags_unset | both_flags_set)].copy()

# Display the final filtered dataset
numDat

Unnamed: 0,Average Rating,Average Difficulty,Number of ratings,Received a pepper,Proportion of students that said they would take the class again,Number of ratings coming from online classes,Male Professor,Female Professor
5,3.5,3.3,22.0,0.0,56.0,7.0,1,0
25,4.3,3.3,16.0,1.0,83.0,0.0,0,1
40,1.8,3.8,15.0,0.0,22.0,1.0,0,1
42,4.1,3.3,21.0,0.0,67.0,0.0,0,1
46,4.2,1.8,26.0,1.0,57.0,8.0,1,0
...,...,...,...,...,...,...,...,...
89839,3.8,2.9,9.0,0.0,67.0,0.0,0,1
89855,4.0,1.9,11.0,1.0,60.0,0.0,1,0
89866,4.4,3.6,14.0,1.0,88.0,0.0,0,1
89877,3.6,3.4,10.0,0.0,50.0,1.0,1,0


In [18]:
# Split the filtered data into one frame per gender flag
numDat_male = numDat.loc[numDat['Male Professor'].eq(1)]
numDat_female = numDat.loc[numDat['Female Professor'].eq(1)]

In [19]:
from scipy.stats import levene
import numpy as np
import pandas as pd

# Expects `numDat` to already hold the loaded and filtered ratings data.

# Median cut points for the two continuous stratifying variables
median_difficulty = numDat['Average Difficulty'].median()
median_ratings = numDat['Number of ratings'].median()

# Bin each stratifier into two groups: [min, median] and (median, max]
for source_col, group_col, cut_point in (
    ('Average Difficulty', 'Difficulty Groups', median_difficulty),
    ('Number of ratings', 'Ratings Groups', median_ratings),
):
    numDat.loc[:, group_col] = pd.cut(
        numDat[source_col],
        bins=[numDat[source_col].min(), cut_point, numDat[source_col].max()],
        labels=[f'Below Median (≤{cut_point:.2f})', f'Above Median (> {cut_point:.2f})'],
        include_lowest=True
    )

# Combined stratum label: difficulty group, ratings group and pepper flag joined by "_"
strata_parts = [
    numDat[col].astype(str)
    for col in ('Difficulty Groups', 'Ratings Groups', 'Received a pepper')
]
numDat.loc[:, 'Stratification Group'] = (
    strata_parts[0] + "_" + strata_parts[1] + "_" + strata_parts[2]
)

# One result record per stratum (Levene's test plus Cohen's d)
levene_results = []

# Strata in order of first appearance
stratification_groups = numDat['Stratification Group'].unique()

# Gender masks do not change between strata, so build them once
is_male = numDat['Male Professor'] == 1
is_female = numDat['Female Professor'] == 1

for group in stratification_groups:
    # Average ratings of male and female professors inside this stratum
    in_group = numDat['Stratification Group'] == group
    male_data = numDat.loc[is_male & in_group, 'Average Rating']
    female_data = numDat.loc[is_female & in_group, 'Average Rating']

    male_sample_size, female_sample_size = len(male_data), len(female_data)
    total_sample_size = male_sample_size + female_sample_size

    if male_sample_size <= 1 or female_sample_size <= 1:
        # Levene's test needs at least two observations on each side
        stat = p_value = effect_size = None
        significant = "Insufficient Data"
    else:
        stat, p_value = levene(male_data, female_data)
        significant = p_value < 0.005  # Using a significance level of 0.005

        # Cohen's d: mean difference scaled by the pooled standard deviation
        weighted_var_sum = (
            (male_sample_size - 1) * male_data.std()**2
            + (female_sample_size - 1) * female_data.std()**2
        )
        pooled_std = np.sqrt(weighted_var_sum / (total_sample_size - 2))
        effect_size = (male_data.mean() - female_data.mean()) / pooled_std

    # Key order fixes the column order of the results table
    levene_results.append({
        'Stratification Group': group,
        'Levene Stat': stat,
        'P-value': p_value,
        'Significant': significant,
        'Male Sample Size': male_sample_size,
        'Female Sample Size': female_sample_size,
        'Total Sample Size': total_sample_size,
        "Cohen's d": effect_size,
    })

# Tabulate the per-stratum records
levene_results_df = pd.DataFrame(levene_results)

# Report
print("Levene's Test Results with Cohen's d for Male vs. Female within Subgroups:")
print(levene_results_df)


Levene's Test Results with Cohen's d for Male vs. Female within Subgroups:
                               Stratification Group  Levene Stat   P-value  \
0  Above Median (> 2.90)_Above Median (> 12.00)_0.0     1.095483  0.295469   
1  Above Median (> 2.90)_Above Median (> 12.00)_1.0     3.732197  0.053721   
2   Below Median (≤2.90)_Above Median (> 12.00)_1.0     0.300542  0.583635   
3   Above Median (> 2.90)_Below Median (≤12.00)_0.0     1.256810  0.262433   
4    Below Median (≤2.90)_Below Median (≤12.00)_0.0     0.085535  0.769982   
5   Below Median (≤2.90)_Above Median (> 12.00)_0.0     3.014407  0.082960   
6   Above Median (> 2.90)_Below Median (≤12.00)_1.0     8.552735  0.003555   
7    Below Median (≤2.90)_Below Median (≤12.00)_1.0     4.431733  0.035472   

   Significant  Male Sample Size  Female Sample Size  Total Sample Size  \
0        False               695                 513               1208   
1        False               456                 358                814 

In [20]:
# Bootstrap to calculate the 95% confidence interval for Cohen's d
#
# The stratum is chosen explicitly and its samples are re-derived from `numDat`,
# instead of reusing whatever `male_data` / `female_data` were left behind by the
# last iteration of the Levene loop. The resampling RNG is seeded so the interval
# is reproducible across runs.
n_bootstrap = 1000
BOOTSTRAP_SEED = 42

# Stratum to analyse. Defaults to the last entry of `stratification_groups`, which is
# the stratum the Levene loop finished on; set this to another label to target a
# different stratum.
bootstrap_group = stratification_groups[-1]

in_bootstrap_group = numDat['Stratification Group'] == bootstrap_group
bootstrap_male = numDat.loc[
    in_bootstrap_group & (numDat['Male Professor'] == 1), 'Average Rating'
].to_numpy()
bootstrap_female = numDat.loc[
    in_bootstrap_group & (numDat['Female Professor'] == 1), 'Average Rating'
].to_numpy()


def _pooled_cohens_d(sample_a, sample_b):
    """Return Cohen's d for two 1-D samples using the pooled (ddof=1) standard deviation."""
    n_a, n_b = len(sample_a), len(sample_b)
    pooled_std = np.sqrt(
        ((n_a - 1) * np.std(sample_a, ddof=1)**2 + (n_b - 1) * np.std(sample_b, ddof=1)**2) /
        (n_a + n_b - 2)
    )
    return (np.mean(sample_a) - np.mean(sample_b)) / pooled_std


bootstrap_effect_sizes = []

# The ddof=1 standard deviation is undefined for fewer than two observations,
# so require at least two ratings on each side (not merely non-empty data).
if len(bootstrap_male) >= 2 and len(bootstrap_female) >= 2:
    bootstrap_rng = np.random.default_rng(BOOTSTRAP_SEED)

    for _ in range(n_bootstrap):
        # Resample each gender's ratings with replacement, keeping the original sizes
        male_sample = bootstrap_rng.choice(bootstrap_male, size=len(bootstrap_male), replace=True)
        female_sample = bootstrap_rng.choice(bootstrap_female, size=len(bootstrap_female), replace=True)

        # Cohen's d for the resampled data
        bootstrap_effect_sizes.append(_pooled_cohens_d(male_sample, female_sample))

    # Percentile confidence interval
    lower_bound = np.percentile(bootstrap_effect_sizes, 2.5)
    upper_bound = np.percentile(bootstrap_effect_sizes, 97.5)

    # Print the results, stating which stratum the interval refers to
    print(f"Bootstrap group: {bootstrap_group}")
    print(f"95% Confidence Interval for Cohen's d: [{lower_bound:.4f}, {upper_bound:.4f}]")
else:
    print("No data available for the specified group to perform bootstrap analysis.")

95% Confidence Interval for Cohen's d: [-0.0237, 0.1996]
