<a href="https://colab.research.google.com/github/francji1/01NAEX/blob/main/code/01NAEX_Exercise_02_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##  Recap the code from the Lecture (Python instead of R)

In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf

from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.api import durbin_watson
import statsmodels.api as sm


In [None]:
import pandas as pd

# Location of the whitespace-delimited etch rate data set
url = "https://raw.githubusercontent.com/francji1/01NAEX/refs/heads/main/data/etchrate.txt"

# Read the table straight from the URL. `sep=r"\s+"` splits on any run of
# whitespace and replaces `delim_whitespace=True`, which pandas has deprecated.
etch_rate = pd.read_csv(url, sep=r"\s+")

# Print the first rows as a quick check that the columns were parsed
print(etch_rate.head())

In [None]:
# Mean etch rate at every RF power level
grp_means = etch_rate['rate'].groupby(etch_rate['RF']).mean()

# Raw observations first, then the group means drawn on top of them
scatter_layers = (
    (etch_rate['RF'], etch_rate['rate'], 'Data Points', 'lightblue'),
    (grp_means.index, grp_means.values, 'Mean Etch Rate', 'darkblue'),
)
for x_vals, y_vals, legend_name, colour in scatter_layers:
    plt.scatter(x_vals, y_vals, label=legend_name, color=colour)

# Annotate the current axes
axes = plt.gca()
axes.set_xlabel("RF Power (W)")
axes.set_ylabel("Observed Etch Rate (Å/min)")
axes.set_title("Etch Rate Data")
axes.legend()

# Render the figure
plt.show()

In [None]:
# One box of etch rates per RF power level
etch_rate.boxplot(column='rate', by='RF')

# Label the axes that the boxplot call just made current
axes = plt.gca()
axes.set_title('Etch Rate by RF Power')
axes.set_xlabel('RF Power (W)')
axes.set_ylabel('Observed Etch Rate (Å/min)')

# Tidy the spacing and render
plt.tight_layout()
plt.show()

In [None]:
# Boxplot of 'rate' grouped by 'run', treating the run number as a factor.
# DataFrame.boxplot(by=...) opens its own figure, so the size is passed to it
# directly; a separate plt.figure() call would only leave an empty figure.
etch_rate.boxplot(column='rate', by='run', grid=False, figsize=(8, 6))
plt.title('Etch Rate by Run (Run as Factor)')
plt.xlabel('Run')
plt.ylabel('Observed Etch Rate (Å/min)')

# Display the boxplot
plt.tight_layout()
plt.show()

In [None]:
# Categorical copies of the numeric design columns, for use as factors
for factor_col, raw_col in (('Power', 'RF'), ('Run', 'run')):
    etch_rate[factor_col] = etch_rate[raw_col].astype('category')

# model2: treatment coding with an intercept; model0: one coefficient per level
model2 = ols(formula='rate ~ Power', data=etch_rate).fit()
model0 = ols(formula='rate ~ Power - 1', data=etch_rate).fit()

# Print both regression summaries, intercept model first
for fitted in (model2, model0):
    print(fitted.summary())


In [None]:
# Bare expression: the notebook renders the whole data frame as the cell output
etch_rate

In [None]:
# (Re)cast both design columns to categorical so the formula treats them as factors
for factor_col, source_col in (('Run', 'run'), ('Power', 'Power')):
    etch_rate[factor_col] = etch_rate[source_col].astype('category')

# Additive model with the power and run factors, followed by its ANOVA table
model1 = ols(formula='rate ~ Power + Run', data=etch_rate).fit()
anova_table1 = anova_lm(model1)

# Last expression of the cell: shows the ANOVA table
anova_table1

In [None]:
# One-way model with power as the only factor, and its ANOVA table
model2 = ols(formula='rate ~ Power', data=etch_rate).fit()
anova_table2 = anova_lm(model2)

# Last expression of the cell: shows the ANOVA table
anova_table2

In [None]:
# Compare the power-only model (model2) with the power + run model (model1)
anova_comparison = anova_lm(model2, model1)

# Last expression of the cell: shows the comparison table
anova_comparison


In [None]:
# Regression summary and coefficient confidence intervals for model2
model_summary, conf_intervals = model2.summary(), model2.conf_int()

# Last expression of the cell: shows both objects as a tuple
(model_summary, conf_intervals)


In [None]:
# Regression summary and coefficient confidence intervals for model0
model_summary, conf_intervals = model0.summary(), model0.conf_int()

# Last expression of the cell: shows both objects as a tuple
(model_summary, conf_intervals)


In [None]:
# Numeric regressors derived from the categorical Power column:
# Power1 holds the power as a float, Power2 its square.
etch_rate['Power1'] = etch_rate['Power'].astype(float)
etch_rate['Power2'] = np.square(etch_rate['Power1'])

# model3: rate ~ Power1 (linear); model4: rate ~ Power1 + Power2 (quadratic)
model3, model4 = (
    ols(formula=rhs, data=etch_rate).fit()
    for rhs in ('rate ~ Power1', 'rate ~ Power1 + Power2')
)
model3_summary = model3.summary()
model4_summary = model4.summary()

# Last expression of the cell: shows both summaries as a tuple
(model3_summary, model4_summary)


In [None]:
# Store the fitted values of the linear (model3) and quadratic (model4) models
etch_rate['pred_model3'] = model3.fittedvalues
etch_rate['pred_model4'] = model4.fittedvalues

# The fitted lines are drawn by joining consecutive rows, so order the rows by
# power first; otherwise the line jumps back and forth between power levels
# whenever the data are not already sorted.
curve_data = etch_rate.sort_values('Power1')

# Figure showing the observations together with both fitted models
plt.figure(figsize=(10, 6))

# Observed data as 'x' markers (row order is irrelevant for a scatter plot)
plt.scatter(etch_rate['Power1'], etch_rate['rate'], color='blue', marker='x', label='Data Points')

# Fitted values of the linear model
plt.plot(curve_data['Power1'], curve_data['pred_model3'], color='green', marker='o', linestyle='-', label='Linear Model')

# Fitted values of the quadratic model
plt.plot(curve_data['Power1'], curve_data['pred_model4'], color='red', marker='o', linestyle='-', label='Quadratic Model')

# Labels, title and legend
plt.xlabel('Power (RF Power in W)')
plt.ylabel('Etch Rate (Å/min)')
plt.title('Comparison of Linear and Quadratic Regression Models')
plt.legend()

# Show the plot
plt.show()


In [None]:
from itertools import combinations

import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests

# Etch rates collected into one list per Power1 level
groups = etch_rate.groupby('Power1')['rate'].apply(list)

# (level_a, level_b, raw p-value) for every unordered pair of power levels
p_values = []

# combinations() over the sorted levels visits the pairs in the same order as
# a nested "i < j" index loop would
power_levels = sorted(groups.keys())
for level_a, level_b in combinations(power_levels, 2):
    # Two-sample t-test with the variance pooled over the two groups compared
    t_stat, p_val = ttest_ind(groups[level_a], groups[level_b], equal_var=True)
    p_values.append((level_a, level_b, p_val))

# Raw p-values only, in pair order, for the multiplicity adjustments
p_vals = [p_val for _, _, p_val in p_values]

# Bonferroni adjustment
_, p_vals_bonf, _, _ = multipletests(p_vals, method='bonferroni')

# Holm step-down adjustment. The variable keeps its historical name, but
# method='holm' is Holm's procedure, not Hochberg's step-up procedure.
_, p_vals_hoch, _, _ = multipletests(p_vals, method='holm')

# Report the adjusted p-values
print("Pairwise t-test results with Bonferroni adjustment:")
for (level1, level2, _), p_val_bonf in zip(p_values, p_vals_bonf):
    print(f"{level1} vs {level2}: p-value = {p_val_bonf:.4e}")

print("\nPairwise t-test results with Holm adjustment:")
for (level1, level2, _), p_val_hoch in zip(p_values, p_vals_hoch):
    print(f"{level1} vs {level2}: p-value = {p_val_hoch:.4e}")


In [None]:
# Tukey HSD comparison of all pairs of power levels after the ANOVA
tukey_result = pairwise_tukeyhsd(endog=etch_rate['rate'], groups=etch_rate['Power'])
print(tukey_result)

# Simultaneous confidence intervals of the group means
tukey_result.plot_simultaneous()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Tukey HSD test on the etch rate across power levels
tukey = pairwise_tukeyhsd(endog=etch_rate['rate'], groups=etch_rate['Power'], alpha=0.05)

# Pieces of the result needed for the plot
means_diff = tukey.meandiffs  # Pairwise mean differences
ci_lower = tukey.confint[:, 0]  # Lower confidence bounds
ci_upper = tukey.confint[:, 1]  # Upper confidence bounds
comparisons = tukey.summary().data[1:]  # Summary rows without the header; columns 0/1 are group1/group2

# One horizontal interval per pairwise comparison
plt.figure(figsize=(10, 6))

for i, (lower, upper) in enumerate(zip(ci_lower, ci_upper)):
    plt.plot([lower, upper], [i, i], color='black')
    plt.plot(means_diff[i], i, 'o', color='red')  # Point estimate of the difference

# statsmodels reports each difference as mean(group2) - mean(group1), so the
# label lists group2 first (e.g. "180 - 160") to match the sign of the values.
comparison_labels = [f"{row[1]} - {row[0]}" for row in comparisons]
plt.yticks(np.arange(len(comparison_labels)), comparison_labels)

# Reference line at 0 (no difference)
plt.axvline(0, color='grey', linestyle='--')

# Labels and title
plt.xlabel('Mean Difference with 95% CI')
plt.title('Tukey HSD Confidence Intervals')

# Show the plot
plt.tight_layout()
plt.show()


In [None]:


from string import ascii_lowercase

# Group once and reuse the grouped column for every per-level statistic
rate_by_power = etch_rate.groupby('Power', observed=True)['rate']

means = rate_by_power.mean()  # Group means by Power levels
n = rate_by_power.count()  # Replicates per group
std_errs = rate_by_power.std() / np.sqrt(n)  # Per-group standard errors
df_error = 16  # Given degrees of freedom for error
mse = 334  # Given Mean Square Error
alpha = 0.05  # Significance level

# Two-sided critical t value for the error degrees of freedom
t_critical = stats.t.ppf(1 - alpha/2, df_error)

# Least Significant Difference; n.mean() stands in for the common group size,
# i.e. a balanced design is assumed
lsd = t_critical * np.sqrt(2 * mse / n.mean())

# Confidence interval of each group mean, based on its own standard error
ci_lower = means - t_critical * std_errs
ci_upper = means + t_critical * std_errs

# Per-level summary table
summary_df = pd.DataFrame({
    'Mean': means,
    'Std. Error': std_errs,
    'Lower CI (95%)': ci_lower,
    'Upper CI (95%)': ci_upper,
    'Min': rate_by_power.min(),
    'Max': rate_by_power.max()
})

# Print summary
print("LSD Test Results:")
print(summary_df)
print(f"\nLeast Significant Difference (LSD): {lsd:.5f}")
print(f"Critical Value of t: {t_critical:.5f}")

# Letter display of the LSD result: levels sharing a letter do not differ by
# more than the LSD. Levels are processed from the largest mean downwards.
ordered_means = means.sort_values(ascending=False)
groups = ordered_means.index
letters = ascii_lowercase  # One letter per homogeneous run of means
group_labels = {group: '' for group in groups}

mean_values = ordered_means.to_numpy()
covered_upto = -1  # Last position included in a run that already has a letter
next_letter = 0
for start, start_mean in enumerate(mean_values):
    # Extend the run while the next (smaller) mean is within one LSD of the
    # run's largest mean
    stop = start
    while stop + 1 < len(mean_values) and start_mean - mean_values[stop + 1] <= lsd:
        stop += 1
    # A run lying entirely inside an earlier one adds no information; only a
    # run that reaches further down gets a new letter
    if stop > covered_upto:
        letter = letters[next_letter]
        next_letter += 1
        for group in groups[start:stop + 1]:
            group_labels[group] += letter
        covered_upto = stop

# Table of levels, means and their letters, largest mean first
grouped_means_df = pd.DataFrame({
    'Group': groups,
    'Mean': ordered_means,
    'Label': [group_labels[g] for g in groups]
})

print("\nGroups, Treatments and Means with Labels:")
print(grouped_means_df)


In [None]:
import numpy as np
from statsmodels.stats.power import FTestAnovaPower

# Parameters
sd = 25  # Within-group standard deviation
max_difference = 75  # Maximum difference between group means
alpha = 0.01  # Significance level
groups = 4  # Number of groups

# Per-group sample sizes to evaluate
nn = np.arange(4, 11)  # Sample sizes from 4 to 10

# Cohen's f for the least favourable configuration with maximum difference D:
# two means sit at +/- D/2 and the rest at the centre, so
# f = sqrt(2 * (D/2)**2 / a) / sd = D / (sd * sqrt(2 * a)).
# With a = 2 this reduces to (D/2)/sd; with a = 4 it is about 1.06.
effect_size = max_difference / (sd * np.sqrt(2 * groups))

# Initialize FTestAnovaPower object
power_analysis = FTestAnovaPower()

# Power for each per-group sample size; nobs is the TOTAL number of
# observations. NOTE: `beta` holds power values (1 - beta), not beta itself.
beta = [
    power_analysis.power(effect_size=effect_size, nobs=n * groups, alpha=alpha, k_groups=groups)
    for n in nn
]

# Display the results
print("Sample Sizes:", nn)
print("Power:", beta)


In [None]:
#!pip install statsmodels==0.13.2 numpy==1.22.3 scipy==1.8.0#

#import statsmodels
#import numpy as np
#import scipy

#print(f"Statsmodels Version: {statsmodels.__version__}")
#print(f"NumPy Version: {np.__version__}")
#print(f"SciPy Version: {scipy.__version__}")

In [None]:
import statsmodels
import numpy as np
import scipy

# Report the versions of the numerical libraries in use
for package_label, package in (("Statsmodels", statsmodels), ("NumPy", np), ("SciPy", scipy)):
    print(f"{package_label} Version: {package.__version__}")

In [None]:
from statsmodels.stats.power import FTestAnovaPower

# Power analysis example (ANOVA power calculation)
power_analysis = FTestAnovaPower()

# FTestAnovaPower expects Cohen's f = sd(group means) / sd(within), i.e. the
# square root of the variance ratio, not the ratio itself.
effect_size = np.sqrt(np.var(grp_means) / (25**2))

# nobs is the total number of observations over all groups.
# NOTE(review): 10 is taken as the per-group sample size — confirm.
n_per_group = 10
power = power_analysis.solve_power(effect_size=effect_size, nobs=n_per_group * 4, alpha=0.01, k_groups=4)
print("ANOVA Power:", power)


In [None]:
# Power curves over a grid of standard deviations and per-group sample sizes
sd_vals = np.arange(20, 81, 1)
nn_vals = np.arange(3, 11, 1)
beta_matrix = np.zeros((len(sd_vals), len(nn_vals)))

for i, sd in enumerate(sd_vals):
    # Cohen's f for a maximum difference of 75 between a = 4 means:
    # f = sqrt(75**2 / (2 * 4 * sd**2)). It depends only on sd, so it is
    # computed once per row.
    effect_size = np.sqrt((75**2) / (2 * 4 * (sd**2)))
    for j, n in enumerate(nn_vals):
        # nobs is the total sample size: n replicates in each of the 4 groups
        beta_matrix[i, j] = power_analysis.solve_power(effect_size=effect_size, nobs=n * 4, alpha=0.01, k_groups=4)

# One curve per per-group sample size
plt.figure(figsize=(8, 6))
for i, n in enumerate(nn_vals):
    plt.plot(sd_vals, beta_matrix[:, i], label=f'n={n}')
plt.xlabel('Standard Deviation (σ)')
plt.ylabel('1 - β (Power)')
plt.title('Operating Characteristic Curve for a = 4 Treatment Means')
plt.grid(True)
plt.legend()
plt.show()


In [None]:

# Power over a grid of standard deviations and per-group sample sizes
sd_grid = np.arange(20, 81, 1)  # Range of standard deviations
nn_grid = np.arange(4, 13, 1)  # Range of per-group sample sizes
beta_grid = np.zeros((len(sd_grid), len(nn_grid)))

# Fill the power matrix (rows: sd, columns: n)
for i, sd in enumerate(sd_grid):
    # Cohen's f for a maximum difference of 75 between a = 4 means. The
    # variance of the means is divided by a (not a - 1), and the square root is
    # taken because the power routine expects f rather than f**2.
    effect_size = np.sqrt((75 ** 2) / (2 * 4 * (sd ** 2)))
    for j, n in enumerate(nn_grid):
        # nobs is the total sample size: n replicates in each of the 4 groups
        beta_grid[i, j] = power_analysis.solve_power(effect_size=effect_size, nobs=n * 4, alpha=0.01, k_groups=4)


In [None]:

# Draw one power curve per sample size; each column of beta_grid is one curve
plt.figure(figsize=(8, 6))
for n, power_curve in zip(nn_grid, beta_grid.T):
    plt.plot(sd_grid, power_curve, label=f'n={n}')

# Axis annotations, grid and legend
plt.xlabel('Standard Deviation (σ)')
plt.ylabel('1 - β (Power)')
plt.title('Operating Characteristic Curve for a=4 Treatment Means')
plt.grid(True)
plt.legend(title='Sample Size (n)')
plt.show()


In [None]:

# Bartlett's test for homogeneity of variances across RF power levels.
# The test takes one sample per group, so the etch rates are split by RF;
# passing the 'rate' and 'RF' columns themselves would compare the variance of
# the rates with the variance of the power settings.
rate_samples = [sample.to_numpy() for _, sample in etch_rate.groupby('RF')['rate']]
bartlett_result = stats.bartlett(*rate_samples)
print("Bartlett's Test for Homogeneity of Variances:", bartlett_result)


In [None]:

# Levene's test (centred at the group means) for homogeneity of variances
# across RF power levels. The test takes one sample per group, so the etch
# rates are split by RF; passing the 'rate' and 'RF' columns themselves would
# compare the spread of the rates with the spread of the power settings.
rate_samples = [sample.to_numpy() for _, sample in etch_rate.groupby('RF')['rate']]
levene_result = stats.levene(*rate_samples, center='mean')
print("Levene's Test for Homogeneity of Variances:", levene_result)


In [None]:

# Durbin-Watson statistic of the model1 residuals (check for autocorrelation)
model1_residuals = model1.resid
durbin_watson_stat = durbin_watson(model1_residuals)
print("Durbin-Watson Test Statistic:", durbin_watson_stat)


In [None]:

# Regression diagnostics for one exogenous column of model1.
# plot_regress_exog adds its own four panels to the figure it receives, so an
# empty figure is passed; axes created beforehand with plt.subplots() would
# remain underneath as blank panels.
fig = plt.figure(figsize=(12, 12))

# The name must be one of model1.model.exog_names, such as 'Power[T.180]'
sm.graphics.plot_regress_exog(model1, 'Power[T.180]', fig=fig)
plt.show()


In [None]:
# Bare expression: the notebook renders the model1 summary as the cell output
model1.summary()

In [None]:

# ANOVA tables of the two fitted models. Both contain an intercept; they
# differ in whether the Run factor is included, so the printed labels name the
# model formulas.
anova_result1 = anova_lm(model1)
anova_result2 = anova_lm(model2)
print("ANOVA Results for Model 1 (rate ~ Power + Run):")
print(anova_result1)
print("ANOVA Results for Model 2 (rate ~ Power):")
print(anova_result2)


In [None]:

# Tukey HSD post-hoc comparison of every pair of power levels
tukey_result = pairwise_tukeyhsd(endog=etch_rate['rate'], groups=etch_rate['Power'], alpha=0.05)
print("Tukey HSD Results:")
print(tukey_result)

# Simultaneous confidence intervals of the group means
tukey_result.plot_simultaneous()
plt.show()


In [None]:

# Grand mean of the etch rate and each RF level's deviation from it
overall_mean = np.mean(etch_rate['rate'])
level_means = etch_rate.groupby('RF')['rate'].mean()
treatment_effects = level_means.sub(overall_mean)

print("Overall mean of etch rate:", overall_mean)
print("Treatment effects relative to overall mean:")
print(treatment_effects)


In [None]:

# ANOVA table of model1, kept under the name model_tables
model_tables = sm.stats.anova_lm(model1)
print("Model Tables from ANOVA:")
print(model_tables)


In [None]:

# Pooled standard deviation: square root of the residual mean square of the
# one-way model rate ~ Power (model2). The residual row is selected by its
# label, not by position, so another factor's row cannot be picked up, and the
# mean square is not divided by the error degrees of freedom a second time.
MSe = anova_lm(model2).loc['Residual', 'mean_sq']
SD_pool = np.sqrt(MSe)
print("Pooled Standard Deviation:", SD_pool)


In [None]:

# Tukey HSD comparison of the power-level means after the ANOVA
print("Tukey HSD Comparison of Means:")
tukey_result = pairwise_tukeyhsd(endog=etch_rate['rate'], groups=etch_rate['Power'], alpha=0.05)
print(tukey_result)

# Simultaneous confidence intervals of the group means
tukey_result.plot_simultaneous()
plt.show()


In [None]:

# Grid of partial regression plots for model1, drawn on a 12 x 8 inch figure
figure_size = (12, 8)
fig = plt.figure(figsize=figure_size)
sm.graphics.plot_partregress_grid(model1, fig=fig)
plt.show()


In [None]:

import matplotlib.pyplot as plt
import statsmodels.api as sm

# Regression diagnostics (including residual plots) for the 'Power[T.180]'
# column of model1. plot_regress_exog adds its own four panels to the figure it
# receives, so an empty figure is passed; a 1 x 2 grid created beforehand with
# plt.subplots() would be left underneath as two blank axes.
fig = plt.figure(figsize=(12, 6))
sm.graphics.plot_regress_exog(model1, 'Power[T.180]', fig=fig)
plt.show()

In [None]:



# Influence plot for model1 on a 12 x 8 inch figure
influence_figsize = (12, 8)
fig_influence = sm.graphics.influence_plot(model1, figsize=influence_figsize)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Figure with a single axes to draw on
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1)

# Leverage against squared residuals for model1
sm.graphics.plot_leverage_resid2(model1, ax=ax)
plt.show()



In [None]:

# Additional check: Durbin-Watson statistic of the model1 residuals
model1_residuals = model1.resid
durbin_watson_stat = durbin_watson(model1_residuals)
print("Durbin-Watson Test:", durbin_watson_stat)


In [None]:
# Example of power calculation
grp_means = [575, 600, 650, 675]  # Group means
between_var = np.var(grp_means)  # Variance of the group means (divides by the number of groups)
within_var = 25 ** 2

# The power routine expects Cohen's f, the square root of the variance ratio
effect_size = np.sqrt(between_var / within_var)

# nobs is the total number of observations over all groups.
# NOTE(review): 10 is taken as the per-group sample size — confirm.
n_per_group = 10
power_result = power_analysis.solve_power(effect_size=effect_size, nobs=n_per_group * len(grp_means), alpha=0.01, k_groups=4)
print("Power of the test:", power_result)
