 Q1. Regression with T-test: Using the teachers' rating data set, does gender
 affect teaching evaluation ratings?

In [6]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

# --- Step 1: build a reproducible synthetic teachers-rating sample ---------
# NOTE: the data below is simulated in this cell, not loaded from a file.
np.random.seed(42)

n = 200  # total number of teachers
gender = np.random.choice(['Male', 'Female'], size=n)

# One candidate score per teacher is drawn from each gender's distribution
# (male draws first, then female, so the random stream is consumed in a fixed
# order); np.where then keeps the draw that matches the teacher's gender.
# Females are simulated with a slightly higher mean rating (7.2 vs 6.8).
male_draws = np.random.normal(6.8, 0.8, n)
female_draws = np.random.normal(7.2, 0.8, n)
evaluation = np.where(gender == 'Male', male_draws, female_draws)

df = pd.DataFrame({'gender': gender, 'evaluation': evaluation})
print("Synthetic Teachers Rating Dataset:")
print(df.head())

# --- Step 2: independent two-sample t-test on evaluation by gender ---------
male_scores = df.loc[df['gender'] == 'Male', 'evaluation']
female_scores = df.loc[df['gender'] == 'Female', 'evaluation']

# Female sample is passed first, so a positive statistic means the female
# mean is higher. scipy's default (pooled-variance) test is used.
ttest_result = stats.ttest_ind(female_scores, male_scores)
t_stat = ttest_result.statistic
p_value = ttest_result.pvalue

print("\nT-Test Results:")
print(f"T-statistic = {t_stat:.4f}")
print(f"P-value = {p_value:.4f}")

# --- Step 3: OLS regression of evaluation on a 0/1 gender dummy ------------
# Male is the reference level (0); the gender_code coefficient is therefore
# the Female-minus-Male difference in mean evaluation.
gender_to_code = {'Male': 0, 'Female': 1}
df['gender_code'] = df['gender'].map(gender_to_code)
model = smf.ols('evaluation ~ gender_code', data=df).fit()

print("\nRegression Summary:")
print(model.summary())


Synthetic Teachers Rating Dataset:
   gender  evaluation
0    Male    6.869638
1  Female    6.719826
2    Male    6.873409
3    Male    5.209945
4    Male    6.624262

T-Test Results:
T-statistic = 2.9090
P-value = 0.0040

Regression Summary:
                            OLS Regression Results                            
Dep. Variable:             evaluation   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.036
Method:                 Least Squares   F-statistic:                     8.462
Date:                Tue, 28 Oct 2025   Prob (F-statistic):            0.00404
Time:                        11:22:22   Log-Likelihood:                -240.50
No. Observations:                 200   AIC:                             485.0
Df Residuals:                     198   BIC:                             491.6
Df Model:                           1                                         
Covariance Type:            nonrobust         

Q2. Regression with ANOVA: Using the teachers' rating data set, does
 beauty score for instructors differ by age?

In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

# Simulate instructors' ages and a beauty score whose mean depends on the
# age band, then test for a difference between bands with a one-way ANOVA.
# NOTE: the data is synthetic, generated in this cell.
np.random.seed(42)
n = 200

# Integer ages from 25 up to (but not including) 65.
age = np.random.randint(25, 65, n)

# Right-closed bins: (24, 35] -> Young, (35, 50] -> Middle, (50, 65] -> Senior.
age_group = pd.cut(
    age,
    bins=[24, 35, 50, 65],
    labels=['Young', 'Middle', 'Senior'],
)

# Draw a full-length sample for every band (Young, Middle, Senior — in that
# order, so the random stream is consumed deterministically) and pick, per
# instructor, the draw belonging to their band.
young_draws = np.random.normal(7.2, 0.6, n)
middle_draws = np.random.normal(6.8, 0.6, n)
senior_draws = np.random.normal(6.3, 0.6, n)
beauty = np.where(
    age_group == 'Young',
    young_draws,
    np.where(age_group == 'Middle', middle_draws, senior_draws),
)

df = pd.DataFrame({'age': age, 'age_group': age_group, 'beauty': beauty})
print("Synthetic Teachers Rating Dataset:")
print(df.head())

# Fit beauty on the categorical age band and summarise with a Type II ANOVA
# table (sum of squares, df, F and its p-value).
model = smf.ols('beauty ~ C(age_group)', data=df).fit()
anova_results = anova_lm(model, typ=2)
print("\nANOVA Results:")
print(anova_results)


Synthetic Teachers Rating Dataset:
   age age_group    beauty
0   63    Senior  5.323475
1   53    Senior  6.328851
2   39    Middle  7.174472
3   32     Young  6.897915
4   45    Middle  6.792652

ANOVA Results:
                 sum_sq     df          F        PR(>F)
C(age_group)  36.862478    2.0  44.029292  1.562831e-16
Residual      82.466783  197.0        NaN           NaN


Q3. Correlation: Using the teachers' rating dataset, is teaching evaluation score correlated with beauty score?

In [8]:
import pandas as pd
import numpy as np
from scipy import stats

# Simulate beauty scores and an evaluation score that depends linearly on
# beauty plus noise, then measure their Pearson correlation.
# NOTE: the data is synthetic, generated in this cell.
np.random.seed(42)
n = 200

beauty = np.random.normal(6.5, 0.8, n)
# The noise term is centred on 3, so it also supplies the intercept of the
# linear relation evaluation = 0.5 * beauty + noise.
noise = np.random.normal(3, 0.7, n)
evaluation = 0.5 * beauty + noise

df = pd.DataFrame({'beauty': beauty, 'evaluation': evaluation})
print("Synthetic Teachers Rating Dataset:")
print(df.head())

# pearsonr yields the correlation coefficient and the p-value of the test
# against zero correlation.
pearson_result = stats.pearsonr(df['beauty'], df['evaluation'])
correlation, p_value = pearson_result[0], pearson_result[1]

print("\nCorrelation Analysis:")
print(f"Correlation coefficient = {correlation:.4f}")
print(f"P-value = {p_value:.4f}")


Synthetic Teachers Rating Dataset:
     beauty  evaluation
0  6.897371    6.699137
1  6.389389    6.587243
2  7.018151    7.267211
3  7.718424    7.596873
4  6.312677    5.191970

Correlation Analysis:
Correlation coefficient = 0.5373
P-value = 0.0000
