<a href="https://colab.research.google.com/github/francji1/01NAEX/blob/main/code/01NAEX_Exercise_07_python_student_solution_PV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# 01NAEX Exercise 07

In [None]:
!pip install pyDOE3

In [None]:
import importlib.util
import subprocess
import sys


def ensure_pkg(pkg_name: str) -> None:
    """Make sure ``pkg_name`` can be imported, installing it with pip if it cannot.

    The same string is used for the import lookup (``importlib.util.find_spec``)
    and for ``pip install``, so it has to be valid for both.  A failing pip run
    propagates ``subprocess.CalledProcessError``.
    """
    already_available = importlib.util.find_spec(pkg_name) is not None
    if already_available:
        return

    print(f"Installing {pkg_name} ...")
    pip_command = [sys.executable, "-m", "pip", "install", pkg_name]
    subprocess.check_call(pip_command)


# Install every listed package that cannot be imported yet.
for package in ("pyDOE3",):
    ensure_pkg(package)


In [None]:


import itertools
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyDOE3 import ff2n
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.factorplots import interaction_plot
from statsmodels.stats.anova import anova_lm
from scipy import stats as st

# Notebook-wide plotting defaults: seaborn whitegrid style and an 8 x 5 default figure size.
plt.style.use("seaborn-v0_8-whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)



In [None]:
def daniel_plot(effects, names):
    """Draw a Daniel (normal probability) plot of effect estimates.

    The effects are sorted and plotted against standard-normal quantiles taken
    at the plotting positions (i - 0.5) / m.  A red reference line is fitted by
    least squares to the points between the 25% and 75% positions, and every
    point is labelled with the matching entry of ``names``.  A new pyplot
    figure is created and left open; nothing is returned.
    """
    values = np.asarray(effects, dtype=float)
    n_effects = len(values)
    ranking = np.argsort(values)
    sorted_values = values[ranking]
    plotting_positions = (np.arange(1, n_effects + 1) - 0.5) / n_effects
    quantiles = st.norm.ppf(plotting_positions)

    plt.figure()
    plt.scatter(quantiles, sorted_values)

    # Reference line through the central half of the ordered effects only, so
    # that large (active) effects in the tails do not pull the line.
    first, last = int(0.25 * n_effects), int(0.75 * n_effects)
    if last > first:
        slope, intercept = np.polyfit(quantiles[first:last], sorted_values[first:last], 1)
        grid = np.linspace(quantiles.min(), quantiles.max(), 200)
        plt.plot(grid, intercept + slope * grid, color="tab:red", linewidth=1.2)

    # Labels follow the sorted order of the effects.
    labels = [names[i] for i in ranking]
    for position, label in enumerate(labels):
        point = (quantiles[position], sorted_values[position])
        plt.annotate(label, point, fontsize=8, textcoords="offset points", xytext=(4, 4))

    plt.axhline(0, color="black", linewidth=0.8, linestyle="--")
    plt.xlabel("Normal quantiles")
    plt.ylabel("Effect")
    plt.title("Daniel plot")
    plt.tight_layout()


def halfnormal_plot(effects, names):
    """Draw a half-normal plot of the absolute effect estimates.

    The absolute effects are sorted and plotted against half-normal quantiles
    taken at the plotting positions (i - 0.5) / m.  A red reference line is
    fitted by least squares to the points between the 25% and 75% positions.
    Only the ten largest absolute effects are labelled with their entry of
    ``names``.  A new pyplot figure is created and left open; nothing is
    returned.
    """
    magnitudes = np.abs(np.asarray(effects, dtype=float))
    n_effects = len(magnitudes)
    ranking = np.argsort(magnitudes)
    sorted_magnitudes = magnitudes[ranking]
    plotting_positions = (np.arange(1, n_effects + 1) - 0.5) / n_effects
    quantiles = st.halfnorm.ppf(plotting_positions)

    plt.figure()
    plt.scatter(quantiles, sorted_magnitudes)

    # Reference line through the central half of the ordered magnitudes.
    first, last = int(0.25 * n_effects), int(0.75 * n_effects)
    if last > first:
        slope, intercept = np.polyfit(quantiles[first:last], sorted_magnitudes[first:last], 1)
        grid = np.linspace(quantiles.min(), quantiles.max(), 200)
        plt.plot(grid, intercept + slope * grid, color="tab:red", linewidth=1.2)

    # The last (up to) ten sorted positions hold the largest magnitudes.
    for position in range(max(n_effects - 10, 0), n_effects):
        point = (quantiles[position], sorted_magnitudes[position])
        plt.annotate(names[ranking[position]], point, fontsize=8, textcoords="offset points", xytext=(4, 4))

    plt.xlabel("Half-normal quantiles")
    plt.ylabel("|Effect|")
    plt.title("Half-normal plot")
    plt.tight_layout()


def lenth_pse(contrasts):
    """Compute Lenth's pseudo standard error and the two margins of error.

    Returns a tuple ``(pse, ME, SME)``:

    * ``pse`` is 1.5 times the median of the absolute contrasts that are below
      2.5 * s0, where s0 is 1.5 times the median of all absolute contrasts
      (``pse`` falls back to s0 when no contrast is below that bound);
    * ``ME`` is the 0.975 t-quantile with m / 3 degrees of freedom times ``pse``;
    * ``SME`` uses the t-quantile at (1 + 0.95 ** (1 / m)) / 2 instead,

    with m the number of contrasts.
    """
    magnitudes = np.abs(np.asarray(contrasts, dtype=float))
    n_contrasts = len(magnitudes)

    initial_scale = 1.5 * np.median(magnitudes)
    trimmed = magnitudes[magnitudes < 2.5 * initial_scale]
    if trimmed.size > 0:
        pse = 1.5 * np.median(trimmed)
    else:
        pse = initial_scale

    dof = n_contrasts / 3.0
    margin = st.t.ppf(0.975, dof) * pse
    # Tail probability of the simultaneous (all m contrasts) 95% interval.
    tail = 1 - (1 + 0.95 ** (1 / n_contrasts)) / 2
    simultaneous_margin = st.t.ppf(1 - tail, dof) * pse
    return pse, margin, simultaneous_margin


def pareto_lenth(contrasts, names):
    """Draw a horizontal Pareto chart of |contrast| with Lenth's ME and SME cut-offs.

    Bars are ordered by increasing absolute contrast and labelled with the
    matching entry of ``names``.  Vertical lines mark the margin of error (ME)
    and the simultaneous margin of error (SME) returned by ``lenth_pse``.

    Returns the tuple ``(pse, ME, SME)`` from ``lenth_pse``.
    """
    contrast_values = np.asarray(contrasts, dtype=float)
    pse, ME, SME = lenth_pse(contrast_values)

    magnitudes = np.abs(contrast_values)
    ranking = np.argsort(magnitudes)
    bar_lengths = magnitudes[ranking]
    bar_labels = [names[i] for i in ranking]
    positions = np.arange(len(bar_lengths))

    plt.figure()
    plt.barh(positions, bar_lengths, color="tab:blue")
    cutoffs = (
        (ME, "tab:red", "--", "ME"),
        (SME, "tab:orange", ":", "SME"),
    )
    for cutoff, colour, dash, tag in cutoffs:
        plt.axvline(cutoff, color=colour, linestyle=dash, label=tag)
    plt.yticks(positions, bar_labels)
    plt.xlabel("|contrast|")
    plt.title("Pareto (Lenth)")
    plt.legend()
    plt.tight_layout()
    return pse, ME, SME


def contour_slices(model, x_var, y_var, fixed_var, fixed_levels, grid_points=41, cmap="viridis", title=""):
    """Draw filled contour plots of model predictions, one panel per level of ``fixed_var``.

    Parameters
    ----------
    model : fitted model whose ``predict`` accepts a DataFrame of factor columns.
    x_var, y_var : names of the columns shown on the horizontal / vertical axis;
        both are evaluated on a regular grid over [-1, 1].
    fixed_var : name of the column held constant inside each panel.
    fixed_levels : sequence of values for ``fixed_var``; one panel is drawn per value.
    grid_points : number of grid points per axis.
    cmap : matplotlib colormap name.
    title : figure title.

    Any of "A_num", "C_num", "D_num" that is not used as ``x_var``, ``y_var`` or
    ``fixed_var`` is set to 0.0 in the prediction frame.

    All panels share one set of contour levels spanning the predictions of
    *every* slice.  Taking the levels from the first slice only would leave
    regions of later panels unfilled whenever their predictions fall outside
    the first slice's range, while still sharing a single colorbar.
    """
    all_vars = {"A_num", "C_num", "D_num"}
    xs = np.linspace(-1, 1, grid_points)
    ys = np.linspace(-1, 1, grid_points)
    X, Y = np.meshgrid(xs, ys)
    # squeeze=False always returns a 2-D array of axes, also for a single panel.
    fig, axes = plt.subplots(
        1, len(fixed_levels), figsize=(5 * len(fixed_levels), 4), sharex=True, sharey=True, squeeze=False
    )
    axes = axes.ravel()

    # First pass: evaluate the model on every slice so the levels can be shared.
    surfaces = []
    for lvl in fixed_levels:
        data = pd.DataFrame({x_var: X.ravel(), y_var: Y.ravel()})
        for var in all_vars - {x_var, y_var, fixed_var}:
            data[var] = 0.0
        data[fixed_var] = lvl
        surfaces.append(np.asarray(model.predict(data), dtype=float).reshape(X.shape))

    z_min = min(float(Z.min()) for Z in surfaces)
    z_max = max(float(Z.max()) for Z in surfaces)
    if z_max <= z_min:
        # A constant surface would give non-increasing levels, which contourf rejects.
        pad = max(abs(z_min), 1.0) * 1e-6
        z_min, z_max = z_min - pad, z_max + pad
    levels = np.linspace(z_min, z_max, 20)

    # Second pass: draw every panel with the common levels.
    cs = None
    for ax, lvl, Z in zip(axes, fixed_levels, surfaces):
        cs = ax.contourf(xs, ys, Z, levels=levels, cmap=cmap)
        ax.set_title(f"{fixed_var.replace('_num', '').upper()} = {lvl}")
        ax.set_xlabel(x_var.replace('_num', ''))
        ax.set_ylabel(y_var.replace('_num', ''))

    fig.suptitle(title)
    if cs is not None:
        fig.colorbar(cs, ax=axes.tolist(), label="Predicted Rate")
    plt.tight_layout()

## Problem 6.31 + 6.32

From Chapter 6 of D. C. Montgomery, Design and Analysis of Experiments (DAoE), 8th edition.

An experiment was conducted on a chemical process that produces a
polymer. The four factors studied were temperature (A), catalyst
concentration (B), time (C), and pressure (D). Two responses, molecular
weight and viscosity, were observed. The design matrix and response data
are as follows:

Task: Rerun the analysis from the previous lecture, this time including the center points.

In [None]:
# Load the Problem 6.31 data (semicolon-separated text file) from the course repository and display it.
df631 = pd.read_csv("https://raw.githubusercontent.com/francji1/01NAEX/main/data/Problem_6_31.txt", sep=";")
df631

In [None]:
# Build a two-level full-factorial design matrix for four factors with pyDOE3.
m = ff2n(4)
# Reverse the column order before the columns are labelled A..D.
# NOTE(review): ff2n presumably varies its first column fastest, in which case "A" becomes the
# slowest-varying factor after this reversal — confirm that this matches the run order of the
# data file, because the responses below are attached purely by row position.
m = m[:, ::-1]

m_df = pd.DataFrame(m, columns=["A", "B", "C", "D"])
print(m_df)
# Responses of the rows labelled 0..15 (label-based .loc slicing includes both end points).
Weight = df631.loc[0:15, "Weight"].values
Viscosity = df631.loc[0:15, "Viscosity"].values
m_df['Weight'] = Weight
m_df['Viscosity'] = Viscosity
# The two response columns were appended last, so they are the final two column names.
response_names = m_df.columns[-2:].tolist()

print("Response names:", response_names)
print(m_df)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import scipy.stats as st

# Convert the natural factor settings to coded units: coded = (value - center) / step.
# We infer centers from the data structure (midpoint of factorial range)
# A: Temp (100, 120) -> Center 110, Step 10
# B: Conc (4, 8) -> Center 6, Step 2
# C: Time (20, 30) -> Center 25, Step 5
# D: Pressure (60, 75) -> Center 67.5, Step 7.5

df_coded = df631.copy()
df_coded['A'] = (df631['Temperature'] - 110) / 10
df_coded['B'] = (df631['Concentration'] - 6) / 2
df_coded['C'] = (df631['Time'] - 25) / 5
df_coded['D'] = (df631['Pressure'] - 67.5) / 7.5

# Split factorial and center points: the first 16 rows are treated as the factorial runs,
# all remaining rows as center points.
df_fact = df_coded.iloc[:16]
df_cent = df_coded.iloc[16:]

# --- Molecular weight ---

# Daniel plot: fit the model with all main effects and interactions on the factorial runs only;
# each effect is taken as twice its regression coefficient.
model_mw_full = smf.ols("Weight ~ A*B*C*D", data=df_fact).fit()
effects_mw = model_mw_full.params.drop("Intercept") * 2
# Drop ":" so interactions read "AB"; the "[T.1.0]" suffix is also stripped if it is present.
effect_names = [term.replace(":", "").replace("[T.1.0]", "") for term in effects_mw.index]
daniel_plot(effects_mw.values, effect_names)

# Reduced model, fitted on every row of df_coded (factorial runs and center points).
model_mw_reduced = smf.ols("Weight ~ A + B + C + A:B", data=df_coded).fit()
print("\nMW ANOVA (Refined with C):")
print(sm.stats.anova_lm(model_mw_reduced, typ=1))


# --- Viscosity: same steps as for molecular weight ---
model_visc_full = smf.ols("Viscosity ~ A*B*C*D", data=df_fact).fit()
effects_visc = model_visc_full.params.drop("Intercept") * 2
effect_names = [term.replace(":", "").replace("[T.1.0]", "") for term in effects_visc.index]
daniel_plot(effects_visc.values, effect_names)

# Reduced model, again fitted on every row of df_coded.
model_visc_reduced = smf.ols("Viscosity ~ A + B", data=df_coded).fit()
print("\nViscosity ANOVA:")
print(sm.stats.anova_lm(model_visc_reduced, typ=1))



In [None]:
# Prediction grid over coded A and B from -1.5 to 1.5, i.e. slightly beyond the coded -1..+1 design range.
a_vals = np.linspace(-1.5, 1.5, 100)
b_vals = np.linspace(-1.5, 1.5, 100)
AA, BB = np.meshgrid(a_vals, b_vals)

# C and D are held at their coded center (0) for every grid point.
pred_df = pd.DataFrame({'A': AA.ravel(), 'B': BB.ravel(), 'C': 0, 'D': 0})

# Predictions of both reduced models, reshaped back onto the grid.
mw_pred = model_mw_reduced.predict(pred_df).values.reshape(AA.shape)
visc_pred = model_visc_reduced.predict(pred_df).values.reshape(AA.shape)

fig, ax = plt.subplots(figsize=(10, 8))

# 1. Viscosity (Background Color)
cp = ax.contourf(AA, BB, visc_pred, cmap='viridis', levels=20)
fig.colorbar(cp, label='Viscosity')

# 2. Molecular Weight (The Constraints)
# Dashed white lines for the molecular-weight bounds 2400 and 2500
cs = ax.contour(AA, BB, mw_pred, levels=[2400, 2500], colors='white', linewidths=3, linestyles='dashed')
ax.clabel(cs, fmt='%1.0f')

# 3. Highlight the "Sweet Spot"
# Hatched (colourless) region where the predicted MW lies between 2400 and 2500
ax.contourf(AA, BB, mw_pred, levels=[2400, 2500], colors='none', hatches=['//'], alpha=0.3)

ax.set_xlabel('Temperature (A)')
ax.set_ylabel('Concentration (B)')
ax.set_title('Optimization: Minimize Viscosity within MW Target (2400-2500)')
plt.show()

In [None]:
import scipy.stats as stats

# 1. Define Factorial and Center Sets
# The first 16 rows are taken as the factorial runs, every later row as a center point
# (positional slicing with .iloc).
y_fact_mw = df631['Weight'].iloc[:16]
y_cent_mw = df631['Weight'].iloc[16:]

y_fact_visc = df631['Viscosity'].iloc[:16]
y_cent_visc = df631['Viscosity'].iloc[16:]
def test_curvature(y_fact, y_cent, name):
    # Calculate Means
    mean_f = y_fact.mean()
    mean_c = y_cent.mean()

    mse_pure = y_cent.var(ddof=1)

    # Calculate Contrast and SS_Curvature
    n_f = len(y_fact)
    n_c = len(y_cent)
    ss_curve = (n_f * n_c * (mean_f - mean_c)**2) / (n_f + n_c)

    # F-Test
    f_0 = ss_curve / mse_pure
    p_val = 1 - stats.f.cdf(f_0, 1, n_c - 1)

    print(f"--- Curvature Test: {name} ---")
    print(f"Mean Factorial: {mean_f:.2f}")
    print(f"Mean Center:    {mean_c:.2f}")
    print(f"SS Curvature:   {ss_curve:.4f}")
    print(f"F stat:         {f_0:.2f}")
    print(f"P-value:        {p_val:.4f}")


# Run the curvature test separately for each of the two responses.
test_curvature(y_fact_mw, y_cent_mw, "Molecular Weight")
test_curvature(y_fact_visc, y_cent_visc, "Viscosity")

The null hypothesis of no pure quadratic curvature cannot be rejected for either response, so the linear (first-order) model is sufficient in both cases.

##	Problems 6.26 and 6.27 and 7.7
From Chapters 6 and 7 of D. C. Montgomery, Design and Analysis of Experiments (DAoE), 8th edition.

An experiment was run in a semiconductor fabrication plant in an effort to increase yield. Five factors, each at two levels, were studied. The factors (and levels) were
+ A = aperture setting (small, large),
+ B = exposure time (20% below nominal, 20% above nominal),
+ C = development time (30 and 45 s),
+ D = mask dimension (small, large), and
+ E = etch time (14.5 and 15.5min).

The unreplicated $2^5$ design shown below was run.

In [None]:
# Load the Problem 6.26 data (semicolon-separated CSV) from the course repository and display it.
df626 = pd.read_csv("https://raw.githubusercontent.com/francji1/01NAEX/main/data/Ex06_26.csv", sep=";")
df626

In [None]:
import pandas as pd

# Short factor labels; they are assigned by position to the first five columns of df626.
# NOTE(review): this assumes the file lists the five factors first and in the order A..E — confirm.
new_column_names = ["A", "B", "C", "D", "E"]

# Rename columns in place (zip stops after the five new names, later columns keep their names).
df626.rename(columns=dict(zip(df626.columns, new_column_names)), inplace=True)


In [None]:
df626.head()

In [None]:
# Opt in to pandas' future behaviour in which replace() no longer downcasts silently.
pd.set_option('future.no_silent_downcasting', True)

# Recode the numeric factor settings: values below the threshold become -1.0, all others +1.0.
# NOTE(review): this cell overwrites df626 in place. Running it a second time would apply the
# thresholds to the already coded -1/+1 values and turn every B, E and C entry into -1.0,
# so run it only once per freshly loaded df626.
# NOTE(review): any center-point rows are forced to -1/+1 by these thresholds as well —
# TODO confirm that their factor columns are never used.
df626["B"] = df626["B"].apply(lambda x: -1.0 if x < 15 else 1.0)
df626["E"] = df626["E"].apply(lambda x: -1.0 if x < 15 else 1.0)
df626["C"] = df626["C"].apply(lambda x: -1.0 if x < 40 else 1.0)

# Map the text labels of A and D to -1.0 / +1.0.
# NOTE(review): the labels are lower-case for A but capitalised for D — confirm against the raw file.
df626["A"] = df626["A"].replace({"small": -1.0, "large": 1.0}).infer_objects(copy=False)
df626["D"] = df626["D"].replace({"Small": -1.0, "Large": 1.0}).infer_objects(copy=False)

# Change data types to categorical
df626[new_column_names] = df626[new_column_names].astype("category")

# Show column dtypes and non-null counts of the recoded dataframe
df626.info()

In [None]:
df626.head()

In [None]:
# The last four rows are taken as the center points, all earlier rows as the factorial runs.
df626_cp = df626.iloc[-4:] # Center points
df626_df = df626.iloc[:-4] # Working points

In [None]:
df626_cp

In [None]:
len(df626_df)

(a) Construct a normal probability plot of the effect estimates.
    Which effects appear to be large?

(b) Conduct an analysis of variance to confirm your findings
    for part (a).

(c) Write down the regression model relating yield to the
    significant process variables.

(d) Plot the residuals on normal probability paper. Is the
    plot satisfactory?

(e) Plot the residuals versus the predicted yields and versus
    each of the five factors. Comment on the plots.

(f) Interpret any significant interactions.

(g) What are your recommendations regarding process
    operating conditions?

(h) Project the 2^5 design in this problem into a 2^k design
    in the important factors. Sketch the design and show.

(i) Suppose that the experimenter had run four center points
    in addition to the 32 trials in the original experiment.
    The yields obtained at the center point runs were 68, 74,
    76, and 70. Reanalyze the experiment, including a test for
    pure quadratic curvature. Discuss what your next step
    should be.

(j) Construct and analyze a design in two blocks with ABCDE
confounded with blocks.

(k) Assuming now that four blocks are necessary. Suggest a reasonable confounding scheme.

(l) Suppose that it was necessary to run this design in four blocks with ACDE and BCD (and consequently ABE) confounded. Analyze the data from this design.




(a) Construct a normal probability plot of the effect estimates.
    Which effects appear to be large?

In [None]:
import statsmodels.formula.api as ols

# The factor columns of df626_df have dtype "category". The formula interface then
# treatment-codes them (terms named like "A[T.1.0]"), and the coefficients of such a model
# are simple effects at the reference corner of the design, not factorial effects.
# 1. Fit the model on numeric -1/+1 copies of the factors instead.
coded_runs = df626_df[["A", "B", "C", "D", "E"]].astype(float).assign(Yield=df626_df["Yield"])

# All main effects and interactions up to order four (the five-factor interaction ABCDE is
# left out, which keeps one residual degree of freedom).
formula = "Yield ~ (A + B + C + D + E)**4"
effects_model = smf.ols(formula, data=coded_runs).fit()

# 2. Extract Effects
# In a 2^k design (numeric -1/+1): Effect = 2 * Beta_Coefficient
effects = 2 * effects_model.params.drop("Intercept")
# 3. Clean Names
# statsmodels uses ":" for interactions (A:B); drop it so the labels read "AB".
effect_names = [term.replace(":", "") for term in effects.index]

# 4. Create Table for inspection
effect_table = pd.DataFrame({"effect": effects.values}, index=effect_names)
print(effect_table.sort_values(by="effect"))

# 5. Plot
daniel_plot(effects.values, effect_names)
halfnormal_plot(effects.values, effect_names)

The most important effects are the factors C and B and the interaction AB; we therefore also include A, so that the model is hierarchical.



(b) Conduct an analysis of variance to confirm your findings
    for part (a).

In [None]:
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

# Reduced model: main effects A, B, C and the A:B interaction ("A * B" expands to A + B + A:B).
model_reduced = smf.ols('Yield ~ A * B  + C', data=df626_df).fit()
# Type II ANOVA table, displayed as the cell output.
sm.stats.anova_lm(model_reduced, typ=2)

(c) Write down the regression model relating yield to the
    significant process variables.


In [None]:
model_reduced.summary()

(d) Plot the residuals on normal probability paper. Is the
    plot satisfactory?

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm

# 1. Get the residuals of the reduced model (model_reduced, 'Yield ~ A * B  + C')
residuals = model_reduced.resid

# 2. Generate the QQ Plot
# line='s' adds the standardized reference line: the theoretical quantiles scaled by the
# sample standard deviation and shifted by the sample mean (it is not a least-squares fit).
fig = sm.qqplot(residuals, line='s')
plt.title("Normal Probability Plot of Residuals")
plt.show()
# Optional: Shapiro-Wilk test of normality as a numeric check of the plot
from scipy import stats
shapiro_test = stats.shapiro(residuals)
print(f"Shapiro-Wilk p-value: {shapiro_test.pvalue:.4f}")

Good: the plot is satisfactory.

(e) Plot the residuals versus the predicted yields and versus
    each of the five factors. Comment on the plots.



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Residuals and fitted values of the reduced model; all five design factors are plotted,
# including D and E, which are not terms of the reduced model.
residuals = model_reduced.resid
predicted = model_reduced.fittedvalues
factors = ['A', 'B', 'C', 'D', 'E']

# 1. Residuals vs. Predicted
plt.figure(figsize=(8, 5))
plt.scatter(predicted, residuals, alpha=0.7)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel("Predicted Yield")
plt.ylabel("Residuals")
plt.title("Residuals vs. Predicted")
plt.grid(True)
plt.show()

# 2. Residuals vs. Factors: one jittered strip plot per factor on a shared y-axis
fig, axes = plt.subplots(1, 5, figsize=(20, 4), sharey=True)
for ax, factor in zip(axes, factors):
    sns.stripplot(x=df626_df[factor], y=residuals, ax=ax, jitter=True)
    ax.axhline(0, color='red', linestyle='--')
    ax.set_title(f"Resid vs {factor}")
plt.tight_layout()
plt.show()

(g) What are your recommendations regarding process
    operating conditions?

In [None]:
# Main-effect plots: mean yield at each level of every factor, one panel per factor.
factor_cols = list("ABCDE")
fig, axes = plt.subplots(1, 5, figsize=(22, 4), sharey=True)
for ax, factor in zip(axes.flat, factor_cols):
    # observed=True restricts the groups to factor levels that actually occur.
    means = df626_df.groupby(factor, observed=True)["Yield"].mean().sort_index()
    ax.plot(means.index, means.values, marker="o")
    ax.set_title(f"Main effect: {factor}")
    ax.set_xlabel(f"{factor} level")
    # The response of this problem is the yield; the earlier "Rate" / "filtration rate"
    # labels were left over from a different problem.
    ax.set_ylabel("Mean Yield")
fig.suptitle("Main effects on yield", y=1.02)
plt.tight_layout()



In [None]:
import warnings
# Silence FutureWarning for the rest of the session.
# NOTE(review): this filter is global and stays active for all later cells.
warnings.simplefilter(action='ignore', category=FutureWarning)

# All 10 unordered pairs of the five factors, one interaction plot per pair on a 2 x 5 grid.
pairs = list(itertools.combinations(factor_cols, 2))
fig, axes = plt.subplots(2, 5, figsize=(20, 8), sharey=True)

for ax, (f1, f2) in zip(axes.flat, pairs):
    # f1 is on the x-axis, f2 defines the traces, Yield is the response.
    interaction_plot(
        df626_df[f1],
        df626_df[f2],
        df626_df["Yield"],
        ax=ax,
        colors=["tab:blue", "tab:orange"]
    )
    ax.set_title(f"{f1} x {f2}")

plt.tight_layout()
plt.show()

(f) Interpret any significant interactions.

A:B - exposure*aperture
With the large aperture (A = +1) the process needs more light and therefore profits more from the longer exposure time.

My recommendation is to use the large aperture setting together with the long exposure time.

(h) Project the 2^5 design in this problem into a 2^k design
    in the important factors. Sketch the design and show.



In [None]:
# Project the 2^5 design onto the factors A, B and C: every (A, B, C) cell collects the runs
# that differ only in D and E. Count, mean and standard deviation of Yield are reported per cell.
# The grouping keys are categorical, so observed=True is passed explicitly (as in the
# main-effects cell) instead of relying on the deprecated default for `observed`.
projected_design = df626_df.groupby(['A', 'B', 'C'], observed=True)['Yield'].agg(['count', 'mean', 'std'])

print("Projected 2^3 Design (with Replicates):")
print(projected_design)

(i) Suppose that the experimenter had run four center points
    in addition to the 32 trials in the original experiment.
    The yields obtained at the center point runs were 68, 74,
    76, and 70. Reanalyze the experiment, including a test for
    pure quadratic curvature. Discuss what your next step
    should be.


In [None]:
import numpy as np
import scipy.stats as stats

# Yields of the factorial runs and of the center-point runs.
y_fact = df626_df['Yield']
y_cent = df626_cp['Yield']

# Pure quadratic curvature test: factorial mean versus center-point mean.
test_curvature(y_cent=y_cent, y_fact=y_fact, name="yield")

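The curvature test is highly significant (F = 458, p < 0.001), with the center points 41.5 units higher than the factorial points, so the linear model is inadequate. My instinct is to add pure quadratic terms $A^2$, $B^2$, etc. However, center points alone only estimate the sum of the quadratic coefficients, so the individual quadratic terms cannot be separated with the current runs. Gemini suggested: "The next step is to augment the current design with axial runs to form a Central Composite Design"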


(j) Construct and analyze a design in two blocks with ABCDE
confounded with blocks.




In [None]:
def to_num(series):
    """Return ``series`` cast to float, e.g. to multiply categorical -1/+1 factor columns."""
    numeric = series.astype(float)
    return numeric

# 2. Calculate the sign of the ABCDE interaction for every run (product of the five coded factors)
interaction_sign = (
    to_num(df626_df['A']) *
    to_num(df626_df['B']) *
    to_num(df626_df['C']) *
    to_num(df626_df['D']) *
    to_num(df626_df['E'])
)

# 3. Assign Blocks
# If product is positive (+1) -> Block 1
# If product is negative (-1) -> Block 2
# Work on an explicit copy before adding a column; the author's intent is to avoid a
# pandas warning when assigning to a frame that was created by slicing.
df626_df = df626_df.copy()

# Block label per run, derived from the ABCDE sign.
df626_df['Block'] = interaction_sign.apply(lambda x: 'Block_1' if x > 0 else 'Block_2')

# 4. Check the split (Should be exactly 16 and 16)
print(df626_df['Block'].value_counts())

# 5. Run the ANOVA
# Block holds string labels, so the formula treats it as a categorical factor without an
# explicit C(...) wrapper. Block is listed first, so in the sequential (type I) table its
# sum of squares is computed before the factor terms.
formula_blocked = "Yield ~ Block + A + B + C + A:B"

model_blocked = smf.ols(formula_blocked, data=df626_df).fit()
anova_blocked = anova_lm(model_blocked, typ=1)

print(anova_blocked)

There is no significant difference between the two blocks.

(k) Assuming now that four blocks are necessary. Suggest a reasonable confounding scheme.



In [None]:
daniel_plot(effects.values, effect_names)

Block on the interactions ABDE and BCE, as they seem influential in the Daniel plot. Their generalized interaction ACD (= ABDE × BCE) is then confounded with blocks as well.

In [None]:
# Generator 1: ABDE
sign_ABDE = to_num(df626_df['A']) * to_num(df626_df['B']) * to_num(df626_df['D']) * to_num(df626_df['E'])

# Generator 2: BCE
sign_BCE = to_num(df626_df['B']) * to_num(df626_df['C']) * to_num(df626_df['E'])

# 3. Assign Blocks based on the (ABDE, BCE) sign pair
def assign_4_blocks(row_idx):
    """Return the block label of the run with index label ``row_idx`` from its (ABDE, BCE) signs."""
    s1 = sign_ABDE[row_idx]
    s2 = sign_BCE[row_idx]

    if s1 > 0 and s2 > 0: return 'Block_1'
    if s1 < 0 and s2 > 0: return 'Block_2'
    if s1 > 0 and s2 < 0: return 'Block_3'
    return 'Block_4' # (-1, -1)

df626_df['Block4'] = [assign_4_blocks(i) for i in df626_df.index]

# 4. Verify Balance (Should be 8 runs per block)
print(df626_df['Block4'].value_counts())

# 5. ANOVA
# Note: 'Block4' absorbs the variation of ABDE, BCE and their generalized interaction
# ACD (ABDE x BCE = ACD).
# Do NOT put those terms in the formula.
formula_4blocks = "Yield ~ Block4 + A + B + C + A:B"
model_4blocks = smf.ols(formula_4blocks, data=df626_df).fit()

print(sm.stats.anova_lm(model_4blocks, typ=1))

(l) Suppose that it was necessary to run this design in four blocks with ACDE and BCD (and consequently ABE) confounded. Analyze the data from this design.

In [None]:
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

# 1. Define Helper for Numeric Conversion
# NOTE(review): this repeats the to_num helper already defined in the cell for part (j);
# the later definition silently replaces the earlier one.
def to_num(series):
    """Return ``series`` cast to float."""
    return series.astype(float)

# 2. Calculate Generator Signs
# Generator 1: ACDE
sign_ACDE = (to_num(df626_df['A']) * to_num(df626_df['C']) * to_num(df626_df['D']) * to_num(df626_df['E']))

# Generator 2: BCD
sign_BCD = (to_num(df626_df['B']) * to_num(df626_df['C']) * to_num(df626_df['D']))

# 3. Assign 4 Blocks based on the (ACDE, BCD) sign pair
def assign_custom_blocks(row_idx):
    """Return the block label of the run with index label ``row_idx`` from its (ACDE, BCD) signs."""
    s1 = sign_ACDE[row_idx]
    s2 = sign_BCD[row_idx]

    if s1 > 0 and s2 > 0: return 'Block_1'
    if s1 < 0 and s2 > 0: return 'Block_2'
    if s1 > 0 and s2 < 0: return 'Block_3'
    return 'Block_4' # (-1, -1)

df626_df['Block_L'] = [assign_custom_blocks(i) for i in df626_df.index]

print("Runs per Block:", df626_df['Block_L'].value_counts())

# The variation of ACDE, BCD and their generalized interaction ABE (ACDE x BCD = ABE)
# is now captured in "Block_L", so none of these terms appears in the formula.
formula_L = "Yield ~ Block_L + A + B + C + A:B"
model_L = smf.ols(formula_L, data=df626_df).fit()

print(anova_lm(model_L, typ=1))