##  Recap the code from the Lecture (Python instead of R)

In [None]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf

from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.api import durbin_watson
import statsmodels.api as sm


### Exercises 3.07

The tensile strength of Portland cement is being studied. Four different mixing techniques can be used economically. A completely randomized experiment was conducted and the following data were collected:

| Mixing Technique | Tensile Strength (lb/in²) | | | |
|------------------|--------|--------|--------|--------|
| 1      |  3129  |  3000  |  2865  |  2890  |
| 2      |  3200  |  3300  |  2975  |  3150  |
| 3      |  2800  |  2900  |  2985  |  3050  |
| 4      |  2600  |  2700  |  2600  |  2765  |

* Construct a graphical display to compare the mean tensile strengths for the
four mixing techniques. What are your conclusions?
* Test the hypothesis that mixing techniques affect the
strength of the cement. Use  $\alpha = 0.05$.
* Use the Fisher LSD method with  $\alpha = 0.05$ to make
comparisons between pairs of means.
*  Construct a normal probability plot of the residuals.
What conclusion would you draw about the validity of
the normality assumption?
*  Plot the residuals versus the predicted tensile strength. Comment on the plot.
* Prepare a scatter plot of the results to aid the interpretation of the results of this experiment.

In [None]:
# Load the Portland-cement tensile-strength data (semicolon-separated CSV)
url = "https://raw.githubusercontent.com/francji1/01NAEX/main/data/Ex03_7.csv"
Ex03_7 = pd.read_csv(url, sep=";")

# Add two short working columns next to the original ones:
#   stren - the response (copy of 'Tensile_Strength')
#   tech  - the mixing technique as a categorical factor
# The original columns are kept; a later cell still reads them by their original names.
data = Ex03_7.assign(
    stren=Ex03_7['Tensile_Strength'],
    tech=Ex03_7['Technique'].astype('category'),
)

# Boxplot of the tensile strength for each technique
plt.figure(figsize=(8, 6))
sns.boxplot(x='tech', y='stren', data=data)

# Overlay the group means as red diamonds.
# `errorbar=None` and `linestyle='none'` replace the deprecated `ci=None` and
# `join=False` arguments (requires seaborn >= 0.13).
sns.pointplot(x='tech', y='stren', data=data, color='red', markers='D',
              errorbar=None, linestyle='none')

# Show the plot
plt.title("Tensile Strength by Technique")
plt.show()

In [None]:
# One-way ANOVA: tensile strength explained by the mixing technique
# Formula syntax: 'response_variable ~ explanatory_variable'
anova_model = ols('stren ~ tech', data=data).fit()
# `typ` selects the sum-of-squares type (1, 2 or 3); look up the differences if unsure
anova_table = sm.stats.anova_lm(anova_model, typ=2)

# Add the 'mean_sq' column by dividing 'sum_sq' by 'df'
anova_table['mean_sq'] = anova_table['sum_sq'] / anova_table['df']

# Mean square error = mean square of the 'Residual' row.
# Select it by label: integer keys on a string-labelled Series
# (`anova_table['mean_sq'][1]`) rely on deprecated positional fallback in pandas.
mse = anova_table.loc['Residual', 'mean_sq']
anova_table

In [None]:
# Significance level for the ANOVA F-test
alpha = 0.05

# p-value of the factor: first row of the ANOVA table.
# `.iloc[0]` makes the positional access explicit; a bare integer key on a
# string-labelled Series is deprecated in pandas.
p_value = anova_table["PR(>F)"].iloc[0]

# Report whether the result is significant at the chosen alpha level
if p_value < alpha:
    print(f"Since p-value = {p_value:.4f} is less than alpha = {alpha}, we reject the null hypothesis of equal means.")
else:
    print(f"Since p-value = {p_value:.4f} is greater than alpha = {alpha}, we fail to reject the null hypothesis.")

In [None]:
# ANOVA says that there is a significant difference between the means, so compare
# the group means pairwise using Fisher's LSD.
# Uses `mse` (residual mean square) computed in the ANOVA cell.

# Mean and number of observations per technique.
# observed=True restricts the result to categories that actually occur in the data
# (an empty category would otherwise give a NaN mean and a zero group size) and
# avoids the pandas FutureWarning about the changing default.
group_means = data.groupby('tech', observed=True)['stren'].mean()
print(group_means)

group_sizes = data.groupby('tech', observed=True).size()
print(group_sizes)

# Collected results of the pairwise comparisons
comparisons = []
mean_diffs = []
lsd_diffs = []

alpha = 0.05

# Error degrees of freedom: N - a (observations minus number of groups)
n_total = len(data)
n_groups = len(group_sizes)
df_error = n_total - n_groups

# Two-sided critical t value; it is the same for every pair, so compute it once
t_critical = stats.t.ppf(1 - alpha / 2, df_error)

for i, tech1 in enumerate(group_means.index):
    for tech2 in group_means.index[i + 1:]:
        mean_diff = abs(group_means.loc[tech1] - group_means.loc[tech2])

        # Standard error of the difference of two group means, based on the MSE
        pooled_se = np.sqrt(mse * (1 / group_sizes.loc[tech1] + 1 / group_sizes.loc[tech2]))

        # Least significant difference for this pair
        lsd = t_critical * pooled_se

        comparisons.append(f"{tech1} vs {tech2}")
        mean_diffs.append(mean_diff)
        lsd_diffs.append(lsd)

        # The pair differs significantly when |mean difference| exceeds the LSD
        if mean_diff > lsd:
            print(f"Significant difference between {tech1} and {tech2}")
        else:
            print(f"No significant difference between {tech1} and {tech2}")


In [None]:
# Residuals of the fitted ANOVA model (reused by the residual plot in the next cell)
residuals = anova_model.resid

# Normal Q-Q plot of the residuals on an explicitly created axes,
# with a 45-degree reference line; fit=True lets statsmodels fit the
# distribution parameters before forming the quantiles.
fig, ax = plt.subplots()
sm.qqplot(residuals, fit=True, line="45", ax=ax)
ax.set_title('Normal Q-Q Plot of Residuals')
plt.show()  # original note: the plot looks fine

In [None]:
# Fitted (predicted) values of the ANOVA model
fitted_values = anova_model.fittedvalues

# Residuals against fitted values, with a dashed zero reference line
fig, ax = plt.subplots()
ax.scatter(fitted_values, residuals)
ax.axhline(0, color='red', linestyle='--', lw=2)
ax.set_title('Residuals vs. Predicted Tensile Strength')
ax.set_xlabel('Predicted Tensile Strength')
ax.set_ylabel('Residuals')
# Original note: no transformation needed, no visible pattern, the plot is structureless
plt.show()

In [None]:
# Individual observations (jittered points) with a transparent boxplot drawn over them
fig, ax = plt.subplots(figsize=(8, 6))
sns.stripplot(x='tech', y='stren', data=data, jitter=True, color='blue', alpha=0.7, ax=ax)
sns.boxplot(
    x='tech',
    y='stren',
    data=data,
    showcaps=True,
    showfliers=False,
    boxprops={'facecolor': 'None'},
    whiskerprops={'linewidth': 2},
    ax=ax,
)

ax.set_title('Scatter Plot of Tensile Strength by Technique')
ax.set_xlabel('Technique')
ax.set_ylabel('Tensile Strength')
plt.show()

### Exercises 3.08 and 3.09

Reconsider the experiment in Problem 3.07.

* Rework part (3) of Problem 3.07 using Tukey’s test with $\alpha = 0.05$. Do you get the same conclusions from Tukey’s test that you did from the graphical procedure and/or the Fisher LSD method?
* Explain the difference between the Tukey and Fisher procedures.
* Find a 95 percent confidence interval on the mean tensile strength of the Portland cement produced by each of the four mixing techniques. Also find a 95 percent confidence interval on the difference in means for techniques 1 and 3. Does this aid you in interpreting the results of the experiment?

In [None]:
# Tukey's HSD: all pairwise comparisons of the technique means at alpha = 0.05
response = data['stren']   # dependent variable (tensile strength)
factor = data['tech']      # grouping variable (mixing technique)
tukey_test = pairwise_tukeyhsd(endog=response, groups=factor, alpha=0.05)

# Print the summary table; original note: the conclusions are different
print(tukey_test)

In [None]:
# Explain the difference between the Tukey and Fisher procedures.
# The explanation is a picture loaded from a path relative to the working directory.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Path to the image file
image_path = 'Fisher_vs_Tukey.png'

# Guard against the file being absent (e.g. the notebook is run without it)
# instead of failing with FileNotFoundError.
if os.path.exists(image_path):
    img = mpimg.imread(image_path)

    # Display the image without axes
    plt.imshow(img)
    plt.axis('off')
    plt.show()
else:
    print(f"Image '{image_path}' not found in the working directory; skipping the figure.")

In [None]:
# 95% confidence intervals for the mean tensile strength of each technique.
# get_prediction() yields one row per observation. In this one-way model all
# observations of a technique share the same fitted mean and interval, so keep a
# single row per technique. Selecting by technique, rather than by hard-coded row
# positions, does not depend on the row order or on the number of replicates.
mean_ci = anova_model.get_prediction().summary_frame(alpha=0.05)
mean_ci_by_tech = (
    mean_ci.assign(tech=data['tech'].to_numpy())  # to_numpy(): attach by position, not by index
    .drop_duplicates(subset='tech')
    .set_index('tech')
    .sort_index()
)
print(mean_ci_by_tech)

# Group means for comparison with the 'mean' column above
# (observed=True: only techniques present in the data)
group_means = data.groupby('tech', observed=True)['stren'].mean()
print("\nGroup Means:\n", group_means)

In [None]:
# Tukey's HSD on the original column names; the result object `tukey`
# is what the confidence-interval plot in the next cell reads from.
strength_values = data['Tensile_Strength']
technique_labels = data['Technique']
tukey = pairwise_tukeyhsd(endog=strength_values, groups=technique_labels, alpha=0.05)
print(tukey)

In [None]:
# Pull the pairwise results out of the Tukey HSD result object
means_diff = tukey.meandiffs       # mean differences
ci_lower = tukey.confint[:, 0]     # lower bounds of the confidence intervals
ci_upper = tukey.confint[:, 1]     # upper bounds of the confidence intervals
# Summary rows without the header; each row starts with (group1, group2, ...)
comparisons = tukey.summary().data[1:]

# One horizontal interval per pairwise comparison
plt.figure(figsize=(10, 6))

for i, (diff, lower, upper) in enumerate(zip(means_diff, ci_lower, ci_upper)):
    plt.plot([lower, upper], [i, i], color='black')   # confidence interval
    plt.plot(diff, i, 'o', color='red')               # point estimate of the difference

# statsmodels reports each difference as mean(group2) - mean(group1), so the
# label must list group2 first; "group1 - group2" would show the wrong sign.
comparison_labels = [f"{row[1]} - {row[0]}" for row in comparisons]
plt.yticks(np.arange(len(comparison_labels)), comparison_labels)

# Vertical line at 0 (no difference): intervals crossing it are not significant
plt.axvline(0, color='grey', linestyle='--')

# Labels and title
plt.xlabel('Mean Difference with 95% CI')
plt.title('Tukey HSD Confidence Intervals')

# Show the plot
plt.tight_layout()
plt.show()

### Exercises 3.10

A product developer is investigating the tensile strength
of a new synthetic fiber that will be used to make cloth for
men’s shirts. Strength is usually affected by the percentage of
cotton used in the blend of materials for the fiber. The engineer
conducts a completely randomized experiment with five levels
of cotton content and replicates the experiment five times.


* Is there evidence to support the claim that cotton content
affects the mean tensile strength? Use $\alpha = 0.05$.
* Use the Fisher LSD method to make comparisons
between the pairs of means. What conclusions can you
draw?
* Analyze the residuals from this experiment and comment
on model adequacy.

In [None]:
# Load the data for Exercise 3.10 (semicolon-delimited CSV hosted on GitHub)
url = "https://raw.githubusercontent.com/francji1/01NAEX/main/data/Ex03_10.csv"
Ex03_10 = pd.read_csv(url, delimiter=";")

# Preview the first five rows
Ex03_10.head(5)


In [None]:
# Transform the data: add `percentage` (cotton weight as a categorical factor)
# and `strength` (the observed response) next to the original columns.
# The source header is looked up as 'Cotton_Weight ' with a trailing space; strip
# whitespace from the header names for the lookup so the cell works whether or
# not the stray space is present.
cleaned = Ex03_10.rename(columns=str.strip)
data2 = Ex03_10.assign(
    percentage=cleaned['Cotton_Weight'].astype('category'),
    strength=cleaned['Observations'],
)

In [None]:
# Boxplot of the tensile strength for each cotton percentage
plt.figure(figsize=(8, 6))
sns.boxplot(x='percentage', y='strength', data=data2)

# Overlay the group means as red diamonds.
# `errorbar=None` and `linestyle='none'` replace the deprecated `ci=None` and
# `join=False` arguments (requires seaborn >= 0.13).
sns.pointplot(x='percentage', y='strength', data=data2, color='red', markers='D',
              errorbar=None, linestyle='none')

# Show the plot
plt.title("Tensile Strength by Cotton levels")
plt.show()

In [None]:
# One-way ANOVA: tensile strength explained by the cotton percentage
# Formula syntax: 'response_variable ~ explanatory_variable'
anova_model = ols('strength ~ percentage', data=data2).fit()
# `typ` selects the sum-of-squares type (1, 2 or 3); look up the differences if unsure
anova_table = sm.stats.anova_lm(anova_model, typ=2)

# Add the 'mean_sq' column by dividing 'sum_sq' by 'df'
anova_table['mean_sq'] = anova_table['sum_sq'] / anova_table['df']

# Mean square error = mean square of the 'Residual' row.
# Select it by label: integer keys on a string-labelled Series
# (`anova_table['mean_sq'][1]`) rely on deprecated positional fallback in pandas.
mse = anova_table.loc['Residual', 'mean_sq']

anova_table

In [None]:
# Significance level for the ANOVA F-test
alpha = 0.05

# p-value of the factor: first row of the ANOVA table.
# `.iloc[0]` makes the positional access explicit; a bare integer key on a
# string-labelled Series is deprecated in pandas.
p_value = anova_table["PR(>F)"].iloc[0]

# Report whether the result is significant at the chosen alpha level
if p_value < alpha:
    print(f"Since p-value = {p_value:.4f} is less than alpha = {alpha}, we reject the null hypothesis of equal means.")
else:
    print(f"Since p-value = {p_value:.4f} is greater than alpha = {alpha}, we fail to reject the null hypothesis.")

In [None]:
# ANOVA says that there is a significant difference between the means, so compare
# the group means pairwise using Fisher's LSD.
# Uses `mse` (residual mean square) computed in the ANOVA cell.

# Mean and number of observations per cotton percentage.
# observed=True restricts the result to categories that actually occur in the data
# (an empty category would otherwise give a NaN mean and a zero group size) and
# avoids the pandas FutureWarning about the changing default.
group_means = data2.groupby('percentage', observed=True)['strength'].mean()
print(group_means)

group_sizes = data2.groupby('percentage', observed=True).size()
print(group_sizes)

# Collected results of the pairwise comparisons
comparisons = []
mean_diffs = []
lsd_diffs = []

alpha = 0.05

# Error degrees of freedom: N - a (observations minus number of groups)
n_total = len(data2)
n_groups = len(group_sizes)
df_error = n_total - n_groups

# Two-sided critical t value; it is the same for every pair, so compute it once
t_critical = stats.t.ppf(1 - alpha / 2, df_error)

for i, level1 in enumerate(group_means.index):
    for level2 in group_means.index[i + 1:]:
        mean_diff = abs(group_means.loc[level1] - group_means.loc[level2])

        # Standard error of the difference of two group means, based on the MSE
        pooled_se = np.sqrt(mse * (1 / group_sizes.loc[level1] + 1 / group_sizes.loc[level2]))

        # Least significant difference for this pair
        lsd = t_critical * pooled_se

        comparisons.append(f"{level1} vs {level2}")
        mean_diffs.append(mean_diff)
        lsd_diffs.append(lsd)

        # The pair differs significantly when |mean difference| exceeds the LSD
        if mean_diff > lsd:
            print(f"Significant difference between {level1} and {level2}")
        else:
            print(f"No significant difference between {level1} and {level2}")


In [None]:
# Residuals of the fitted ANOVA model (reused by the residual plot in the next cell)
residuals = anova_model.resid

# Normal Q-Q plot of the residuals on an explicitly created axes,
# with a 45-degree reference line; fit=True lets statsmodels fit the
# distribution parameters before forming the quantiles.
fig, ax = plt.subplots()
sm.qqplot(residuals, fit=True, line="45", ax=ax)
ax.set_title('Normal Q-Q Plot of Residuals')
plt.show()  # original note: the plot looks fine

In [None]:
# Fitted (predicted) values of the ANOVA model
fitted_values = anova_model.fittedvalues

# Residuals against fitted values, with a dashed zero reference line
fig, ax = plt.subplots()
ax.scatter(fitted_values, residuals)
ax.axhline(0, color='red', linestyle='--', lw=2)
ax.set_title('Residuals vs. Predicted Tensile Strength')
ax.set_xlabel('Predicted Tensile Strength')
ax.set_ylabel('Residuals')
# Original note: no transformation needed, no visible pattern, the plot is structureless
plt.show()