In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.formula.api import ols      # For n-way ANOVA
from statsmodels.stats.anova import _get_covariance,anova_lm # For n-way ANOVA
%matplotlib inline
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
## Loading the dataset Excel file
data=pd.read_excel('Dental_Hardness_data.xlsx')

### Dental implant data: The hardness of metal implant in dental cavities depends on multiple factors, such as the method of implant, the temperature at which the metal is treated, the alloy used as well as on the dentists who may favour one method above another and may work better in his/her favourite method. The response is the variable of interest.



### 𝐻0: 𝜇1 = 𝜇2 = 𝜇3 = 𝜇4 against 𝐻𝑎: At least for one dental implant response is different from the rest.


In [3]:
data.head()

Unnamed: 0,Dentist,Method,Alloy,Temp,Response,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,1.0,1.0,1.0,1500.0,813.0,,,Anova: Two-Factor Without Replication,,,,,,
1,1.0,1.0,1.0,1600.0,792.0,,,,,,,,,
2,1.0,1.0,1.0,1700.0,792.0,,,SUMMARY,Count,Sum,Average,Variance,,
3,1.0,1.0,2.0,1500.0,907.0,,,1,4,2315,578.75,523721.583333,,
4,1.0,1.0,2.0,1600.0,792.0,,,1,4,2394,598.5,584819,,


In [4]:
data.shape

(107, 14)

In [5]:
data.describe(exclude=np.number)

Unnamed: 0,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
count,102,100,100,99,98,3,3
unique,17,8,88,87,91,3,3
top,1,4,2220,582,524696,P-value,F crit
freq,18,90,2,2,2,1,1


In [6]:
data.isnull().sum()

Dentist         17
Method          17
Alloy           17
Temp            17
Response        17
Unnamed: 5     107
Unnamed: 6     107
Unnamed: 7       5
Unnamed: 8       7
Unnamed: 9       7
Unnamed: 10      8
Unnamed: 11      9
Unnamed: 12    104
Unnamed: 13    104
dtype: int64

In [7]:
data.describe()

Unnamed: 0,Dentist,Method,Alloy,Temp,Response,Unnamed: 5,Unnamed: 6
count,90.0,90.0,90.0,90.0,90.0,0.0,0.0
mean,3.0,2.0,1.5,1600.0,741.777778,,
std,1.422136,0.821071,0.502801,82.107083,145.767845,,
min,1.0,1.0,1.0,1500.0,289.0,,
25%,2.0,1.0,1.0,1500.0,698.0,,
50%,3.0,2.0,1.5,1600.0,767.0,,
75%,4.0,3.0,2.0,1700.0,824.0,,
max,5.0,3.0,2.0,1700.0,1115.0,,


In [8]:
# Drop columns 
data.drop(['Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13'], axis=1, inplace= True,)

In [9]:
data.head()

Unnamed: 0,Dentist,Method,Alloy,Temp,Response
0,1.0,1.0,1.0,1500.0,813.0
1,1.0,1.0,1.0,1600.0,792.0
2,1.0,1.0,1.0,1700.0,792.0
3,1.0,1.0,2.0,1500.0,907.0
4,1.0,1.0,2.0,1600.0,792.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Dentist   90 non-null     float64
 1   Method    90 non-null     float64
 2   Alloy     90 non-null     float64
 3   Temp      90 non-null     float64
 4   Response  90 non-null     float64
dtypes: float64(5)
memory usage: 4.3 KB


In [11]:
data.dtypes

Dentist     float64
Method      float64
Alloy       float64
Temp        float64
Response    float64
dtype: object

In [12]:
data['Dentist'].value_counts()

1.0    18
2.0    18
3.0    18
4.0    18
5.0    18
Name: Dentist, dtype: int64

In [13]:
data['Method'].value_counts()

1.0    30
2.0    30
3.0    30
Name: Method, dtype: int64

In [14]:
data['Alloy'].value_counts()

1.0    45
2.0    45
Name: Alloy, dtype: int64

In [15]:
#data['Alloy'].replace(to_replace=1.0, value='Alloy1', inplace=True)
#data['Alloy'].replace(to_replace=2.0, value='Alloy2', inplace=True)

In [16]:
data['Temp'].value_counts()

1500.0    30
1600.0    30
1700.0    30
Name: Temp, dtype: int64

In [17]:
data['Response'].value_counts().head()

743.0    6
792.0    6
813.0    5
835.0    5
715.0    5
Name: Response, dtype: int64

In [18]:
ano=ols('Dentist~Method',data=data).fit()
one=sm.stats.anova_lm(ano, type=2)
one

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Method,1.0,3.0814880000000004e-33,3.0814880000000004e-33,1.506505e-33,1.0
Residual,88.0,180.0,2.045455,,


In [19]:
ano2=ols('Dentist~Method+Alloy',data=data).fit()
two=sm.stats.anova_lm(ano2, type=2)
two

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Method,1.0,3.0814880000000004e-33,3.0814880000000004e-33,1.489386e-33,1.0
Alloy,1.0,2.359264e-31,2.359264e-31,1.140311e-31,1.0
Residual,87.0,180.0,2.068966,,


In [20]:
ano3=ols('Dentist~Method+Alloy+Temp',data=data).fit()
three=sm.stats.anova_lm(ano3, type=2)
three

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Method,1.0,3.0814880000000004e-33,3.0814880000000004e-33,1.4722660000000001e-33,1.0
Alloy,1.0,2.359264e-31,2.359264e-31,1.127204e-31,1.0
Temp,1.0,1.701752e-30,1.701752e-30,8.130591e-31,1.0
Residual,86.0,180.0,2.093023,,


In [21]:
ano4=ols('Dentist~Method+Alloy+Temp+Response',data=data).fit()
three=sm.stats.anova_lm(ano3, type=2)
three

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Method,1.0,3.0814880000000004e-33,3.0814880000000004e-33,1.4722660000000001e-33,1.0
Alloy,1.0,2.359264e-31,2.359264e-31,1.127204e-31,1.0
Temp,1.0,1.701752e-30,1.701752e-30,8.130591e-31,1.0
Residual,86.0,180.0,2.093023,,


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Dentist   90 non-null     float64
 1   Method    90 non-null     float64
 2   Alloy     90 non-null     float64
 3   Temp      90 non-null     float64
 4   Response  90 non-null     float64
dtypes: float64(5)
memory usage: 4.3 KB


### Dental implant data: 
### The hardness of metal implant in dental cavities depends on multiple factors, such as the method of implant, the temperature at which the metal is treated, the alloy used as well as on the dentists who may favour one method above another and may work better in his/her favourite method. The response is the variable of interest.


#### If p_value is less than alpha_value, We have evidence to reject the null hypothesis since p value < Level of significance¶
#### If p_value is greater than alpha_value, We fail to reject the null hypothesis since p value > Level of significance
#### Since p_value > alpha_value, here we fail to reject the null hypothesis.



### H0 : The means of 'Response' variable with respect to each Dentist is equal.

### H1 : At least one of the means of 'Response' variable with respect to each Dentist is unequal.


### 1. Test whether there is any difference among the dentists on the implant hardness. State the null and alternative hypotheses. Note that both types of alloys cannot be considered together. You must state the null and alternative hypotheses separately for the two types of alloys.?

In [23]:
data['Response'].value_counts().head()

743.0    6
792.0    6
813.0    5
835.0    5
715.0    5
Name: Response, dtype: int64

In [24]:
data['Dentist'].value_counts()

1.0    18
2.0    18
3.0    18
4.0    18
5.0    18
Name: Dentist, dtype: int64

In [25]:
pd.crosstab(data['Dentist'],data['Alloy'])


Alloy,1.0,2.0
Dentist,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,9,9
2.0,9,9
3.0,9,9
4.0,9,9
5.0,9,9


In [26]:
data1 = data[data['Alloy']== 1.0]
data1.head()

Unnamed: 0,Dentist,Method,Alloy,Temp,Response
0,1.0,1.0,1.0,1500.0,813.0
1,1.0,1.0,1.0,1600.0,792.0
2,1.0,1.0,1.0,1700.0,792.0
6,1.0,2.0,1.0,1500.0,782.0
7,1.0,2.0,1.0,1600.0,698.0


In [27]:
data2 = data[data['Alloy']== 2.0]
data2.head()

Unnamed: 0,Dentist,Method,Alloy,Temp,Response
3,1.0,1.0,2.0,1500.0,907.0
4,1.0,1.0,2.0,1600.0,792.0
5,1.0,1.0,2.0,1700.0,835.0
9,1.0,2.0,2.0,1500.0,1115.0
10,1.0,2.0,2.0,1600.0,835.0


### Formulate the Null and Alternate Hypothesis

#### Null Hypothesis $H_0$ : The means of 'Response' variable with respect to each Dentist is equal.

#### Alternate Hypothesis $H_A$ : At least one of the means of 'Response' variable with respect to each Dentist is unequal.


In [None]:
formula = 'Response ~ Dentist'
model = ols(formula, data1).fit()
aov_table = anova_lm(model)
print(aov_table)

            df         sum_sq       mean_sq         F    PR(>F)
Dentist    1.0   94802.677778  94802.677778  7.392027  0.009409
Residual  43.0  551474.566667  12824.989922       NaN       NaN


In [None]:
formula = 'Response ~ Dentist'
model = ols(formula, data2).fit()
aov_table = anova_lm(model)
print(aov_table)

In [None]:
data1.groupby('Dentist')['Response'].mean().head()

In [None]:
data2.groupby('Dentist')['Response'].mean().head() 

### 2.Before the hypotheses may be tested, state the required assumptions. Are the assumptions fulfilled? Comment separately on both alloy types.? 


#### Null Hypothesis $H_0$ : The means of 'Response' variable with respect to each Dentist is equal.

#### Alternate Hypothesis $H_A$ : At least one of the means of 'Response' variable with respect to each Dentist is unequal.

#### If p_value is less than alpha_value, We have evidence to reject the null hypothesis since p value < Level of significance¶
#### If p_value is greater than alpha_value, We fail to reject the null hypothesis since p value > Level of significance
#### Since p_value > alpha_value, here we fail to reject the null hypothesis.

#### Comment separately on both alloy types.? 
p-values for both type of Alloys

#### p-value for Alloy 1.0(data1) = 0.009409
(Alternate Hypothesis) $H_A$ : At least one of the means of 'Response' variable with respect to each Dentist is unequal.
above assumption is fulfilled for alloy 1.0

#### p-value for Alloy 2.0(data2) = 0.14879
(Null Hypothesis) $H_0$ : The means of 'Response' variable with respect to each Dentist is equal.
above assumption is fulfilled for alloy 2.0

### 3. Irrespective of your conclusion in 2, we will continue with the testing procedure. What do you conclude regarding whether implant hardness depends on dentists? Clearly state your conclusion. If the null hypothesis is rejected, is it possible to identify which pairs of dentists differ?


Interpretation: p-value for data1(Alloy1) = 0.009409
#### **Conclusion: Since the p value is less than the significance level (0.05), we can reject the null hypothesis and conclude that there is a difference in the means of 'Response' variable is different with at least one dentist is unequal**

Interpretation: p-value for data2(Alloy2) = 0.14879
#### **Conclusion: Since the p value is greater than the significance level (0.05), we fail to reject the null hypothesis and conclude that there is no difference in the mean of  'Response' variable is different with at least one dentist is unequal **

#### **If the null hypothesis is rejected is it possible to identify which pairs of dentists differ?**
Dentist 4 & 5. Since, mean Response of dentist 4 & 5 for Alloy 1.0 and Alloy 2.0 differ largely   


### 4. Now test whether there is any difference among the methods on the hardness of dental implant, separately for the two types of alloys. What are your conclusions? If the null hypothesis is rejected, is it possible to identify which pairs of methods differ?


In [None]:
data.Method.value_counts()

In [None]:
formula = 'Response ~ Method'
model = ols(formula, data1).fit()
aov_table = anova_lm(model)
print(aov_table)

In [None]:
formula = 'Response ~ Method'
model = ols(formula, data2).fit()
aov_table = anova_lm(model)
print(aov_table)

In [None]:
data1.groupby('Method')['Response'].mean().head()

In [None]:
data2.groupby('Method')['Response'].mean().head()

#### Null Hypothesis $H_0$ : The means of 'Response' variable with respect to each method is equal.

#### Alternate Hypothesis $H_A$ : At least one of the means of 'Response' variable with respect to each method is unequal.


Interpretation: p-value for data1(Alloy1) = 0.0036
#### **Conclusion: Since the p value is less than the significance level (0.05), we can reject the null hupothesis and conclude that there is a difference in the means of 'Response' variable is different among the with at least one methods on the hardness of dental implant**

Interpretation: p-value for data2(Alloy2) = 0.000149
#### **Conclusion: Since the p value is less than the significance level (0.05), we can reject the null hupothesis and conclude that there is a difference in the means of 'Response' variable is different among the with at least one methods on the hardness of dental implant**

#### **If the null hypothesis is rejected is it possible to identify which pairs of methods differ?
since, null hypothesis of both methods are rejected for Alloy 1.0 and Alloy 2.0 we cannot identify which pair of method differ


### 5. Now test whether there is any difference among the temperature levels on the hardness of dental implant, separately for the two types of alloys. What are your conclusions? If the null hypothesis is rejected, is it possible to identify which levels of temperatures differ?


In [None]:
formula = 'Response ~ Temp'
model = ols(formula, data1).fit()
aov_table = anova_lm(model)
print(aov_table)

In [None]:
formula = 'Response ~ Temp'
model = ols(formula, data2).fit()
aov_table = anova_lm(model)
print(aov_table)

In [None]:
data1.groupby('Temp')['Response'].mean().head()

In [None]:
data2.groupby('Temp')['Response'].mean().head()

#### Null Hypothesis $H_0$ : The means of 'Response' variable with respect to each temperature is equal. 

#### Alternate Hypothesis $H_A$ : At least one of the means of 'Response' variable with respect to each temperature is unequal.


Interpretation: p-value for data1(Alloy1) = 0.413618
#### **Conclusion: Since the p value is greater than the significance level (0.05), we fail to reject the null hypothesis and conclude that there is no difference in the means of 'Response' variable is different with at least one among the temperature levels is unequal**

Interpretation: p-value for data2(Alloy2) = 0.067246
#### **Conclusion: Since the p value is greater than the significance level (0.05), we fail to reject the null hypothesis and conclude that there is no difference in the means of 'Response' variable is different with at least one among the temperature levels is unequal**

#### **If the null hypothesis is rejected is it possible to identify which pairs of temperature differ?
since, null hypothesis of all given temperatures are accepted for Alloy 1.0 and Alloy 2.0 we cannot identify which pair of temperature differ


### 6. Consider the interaction effect of dentist and method and comment on the interaction plot, separately for the two types of alloys?


In [None]:
sns.pointplot(x='Dentist', y='Response', data=data1,color='black', hue='Alloy',ci=None);
sns.pointplot(x='Dentist', y='Response', data=data2,color=None, hue='Alloy',ci=None);

#### From the above interaction plot, since the both lines are not crossing each other, that means there is no interaction effect of these 2 variables.

In [None]:
sns.pointplot(x='Method', y='Response', data=data1, color='black', hue='Alloy',ci=None);
sns.pointplot(x='Method', y='Response', data=data2, hue='Alloy',ci=None);

#### From the above interaction plot, since the both lines are crossing each other, that means there is an interaction effect of these 2 variables.

### 7. Now consider the effect of both factors, dentist, and method, separately on each alloy. What do you conclude? Is it possible to identify which dentists are different, which methods are different, and which interaction levels are different?


In [None]:
formula = 'Response ~ C(Dentist) + C(Method)'
model = ols(formula, data1).fit()
aov_table = anova_lm(model)
print(aov_table)


In [None]:
formula = 'Response ~ C(Dentist) + C(Method)'
model = ols(formula, data2).fit()
aov_table = anova_lm(model)
print(aov_table)


In [None]:
data1.groupby(['Dentist','Method'])['Response'].mean()

In [None]:
data2.groupby(['Dentist','Method'])['Response'].mean()

####  1: Null Hypothesis $H_0$ : The means of 'Response' variable with respect to each dentist is equal. 

#### Alternate Hypothesis $H_A$ : At least one of the means of 'Response' variable with respect to each dentist is unequal.

#### 2: Null Hypothesis $H_0$ : The means of 'Response' variable with respect to each method is equal. 

#### Alternate Hypothesis $H_A$ : At least one of the means of 'Response' variable with respect to each method is unequal.


C(Dentist) Interpretation: p-value for data1(Alloy1) = C(Dentist)  0.051875
#### **Conclusion: Since the p value is greater than the significance level (0.05), we fail to reject the null hypothesis and conclude that there is no difference in the means of 'Response' variable is different with at least one among the dentist is unequal**

C(Method) Interpretation: p-value for data1(Alloy1) = C(Method)   0.002211
#### **Conclusion: Since the p value is less than the significance level (0.05), we can reject the null hypothesis and conclude that there is a difference in the means of 'Response' variable is different with at least one method is unequal**



C(Dentist) Interpretation: p-value for data2(Alloy2) = C(Dentist)   0.458933
#### **Conclusion: Since the p value is less than the significance level (0.05), we can reject the null hypothesis and conclude that there is a difference in the means of 'Response' variable is different with at least one Dentist is unequal**

C(Method) Interpretation: p-value for data2(Alloy2) = C(Method)   0.000008
#### **Conclusion: Since the p value is less than the significance level (0.05), we can reject the null hypothesis and conclude that there is a difference in the means of 'Response' variable is different with at least one method is unequal**



#### **If the null hypothesis is rejected is it possible to identify which pairs of Dentist and  method differ?

since, null hypothesis of all given temperatures are accepted for Alloy 1.0 and Alloy 2.0 we cannot identify which pair of temperature differ

In [None]:
import matplotlib.pyplot as plt
plt.subplots(figsize=(10,5))
sns.boxplot(x='Dentist', y='Response', hue='Alloy', data=data)

In [None]:
import matplotlib.pyplot as plt
plt.subplots(figsize=(10,5))
sns.boxplot(x='Method', y='Response', hue='Alloy', data=data)