# Lab | Inferential statistics - T-test & P-value

In [225]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

1. One tailed t-test

In [226]:
#Null hypothesis (H0) no significant difference in the average speed of packing between the new machine and the old machine
#Alternative (H1) new machine packs faster on average than the old machine.

In [227]:
machine = pd.read_csv('files_for_lab/machine.txt', sep='\t', encoding='utf-16')

In [228]:
machine.head()

Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5


In [229]:
machine.columns = machine.columns.str.strip()

In [230]:
machine.head()

Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5


In [231]:
old_mean = machine['Old machine'].mean()
old_std = machine['Old machine'].std()
new_mean = machine['New machine'].mean()
new_std = machine['New machine'].std()

In [232]:
print(f"Old machine mean: {old_mean:.2f}, standard deviation: {old_std:.2f}")
print(f"New machine mean: {new_mean:.2f}, standard deviation: {new_std:.2f}")

Old machine mean: 43.23, standard deviation: 0.75
New machine mean: 42.14, standard deviation: 0.68


In [233]:
diff = machine.copy()
diff['difference'] = diff['Old machine']-diff['New machine']
diff.head()

Unnamed: 0,New machine,Old machine,difference
0,42.1,42.7,0.6
1,41.0,43.6,2.6
2,41.3,43.8,2.5
3,41.8,43.3,1.5
4,42.4,42.5,0.1


In [234]:
t_statistic, p_value = st.ttest_rel(machine['New machine'], machine['Old machine'], alternative='greater')
print("The t statistic of our sample is: {:.2f} and the corresponding p-value is: {:.2f}".format(t_statistic, p_value))

The t statistic of our sample is: -3.06 and the corresponding p-value is: 0.99


In [235]:
#0.99 > 0.05
#This means that Null hypothesis is accepted since P is close to 1.
#We will need more evidence to conclude that there is not enough evidence to support the claim that new machines are faster.

Matched Pairs Test

In [236]:
#Null hypothesis (H0): The defense and attack scores of the Pokemon are equal.
#Alternative hypothesis (H1): The defense and attack scores of the Pokemon are not equal.

In [237]:
pokemon = pd.read_csv('files_for_lab/pokemon.csv')

In [238]:
pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [239]:
pok_diff = pokemon.copy()
pok_diff['difference'] = pok_diff['Defense']-pok_diff['Attack']
pok_diff.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,difference
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,0
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,1
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,1
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,23
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,-9


In [240]:
t_statistic, p_value = st.ttest_rel(pokemon['Defense'], pokemon['Attack'])
print("The t statistic of our sample is: {:.2f} and the corresponding p-value is: {:.2f}".format(t_statistic, p_value))

The t statistic of our sample is: -4.33 and the corresponding p-value is: 0.00


In [241]:
#P-value of 0, suggests difference between two variables. We reject null and accept alternative.

# ANOVA

Part 1

In [242]:
anova = pd.read_excel('files_for_lab/anova_lab_data.xlsx')

In [243]:
anova

Unnamed: 0,Power,Etching Rate
0,160 W,5.43
1,180 W,6.24
2,200 W,8.79
3,160 W,5.71
4,180 W,6.71
5,200 W,9.2
6,160 W,6.22
7,180 W,5.98
8,200 W,7.9
9,160 W,6.01


In [244]:
#Null: Changing of power has no efffect on the etching rate
#Alternate: Changing the power of the plasma beam has an effect on the etching
#Significance level: 0.05 or 5%.
#Degrees of freedom of model= Depends on groups?

Part 2

In [245]:
anova.columns

Index(['Power ', 'Etching Rate'], dtype='object')

In [246]:
anova.columns = anova.columns.str.strip()

In [247]:
anova.columns

Index(['Power', 'Etching Rate'], dtype='object')

In [248]:
anova['Power'] = anova['Power'].str.replace(' W', '')

In [249]:
#group_df = anova.groupby('Etching Rate')['Power'].agg(Power_mean='mean',Samples='size').reset_index()
#group_df

In [250]:
group_df1 = anova.groupby('Power')['Etching Rate'].agg(Etching_mean='mean',Samples='size').reset_index()
group_df1

Unnamed: 0,Power,Etching_mean,Samples
0,160,5.792,5
1,180,6.238,5
2,200,8.318,5


In [251]:
#group_df = interest_r.groupby('City')['Rate'].agg(City_mean='mean',Samples='size').reset_index()
#group_df

In [252]:
S2t = 0
for Power in anova['Power'].unique():
    ng = len(anova[anova['Power'] == Power])  
    S2t  += ( ( anova[anova['Power'] == Power]['Etching Rate'].mean() - anova['Etching Rate'].mean() ) ** 2) * ng
S2t /= ( anova['Power'].nunique() - 1 )
print("The value of S2t is {:.2f}".format(S2t)) 

The value of S2t is 9.09


In [253]:
S2E = 0
for Power in anova['Power'].unique():
    for rate in anova[anova['Power'] == Power]['Etching Rate']:
        S2E += (rate - anova[anova['Power'] == Power]['Etching Rate'].mean()) ** 2
S2E /= (len(anova) - anova['Power'].nunique())
print("The value of S2E is {:.2f}".format(S2E))

The value of S2E is 0.25


In [254]:
F = S2t / S2E
print("The value of F is {:.2f}".format(F))

The value of F is 36.88


In [255]:
d1 = anova['Power'].nunique() - 1
d2 = len(anova) - anova['Power'].nunique()
print("Number of degrees of freedom d1: ",d1)
print("Number of degrees of freedom d2: ",d2)

Number of degrees of freedom d1:  2
Number of degrees of freedom d2:  12


In [256]:
st.f.cdf(F,dfn=d1, dfd=d2)

0.9999924934157276

In [257]:
#NOT
my_p = 1 - st.f.cdf(F,dfn=d1, dfd=d2)
print("P is:", my_p)
#P Value

P is: 7.5065842723986975e-06


In [258]:
Fc = st.f.ppf(1-0.05,dfn=d1, dfd=d2)
print("The critical value which corresponds to an area of 0.05 is: {:.2f}".format(Fc))

The critical value which corresponds to an area of 0.05 is: 3.89


In [259]:
#Use this to calculate all at once
f_value, p_value = st.f_oneway(anova[anova['Power'] == '160']['Etching Rate'],
                             anova[anova['Power'] == '180']['Etching Rate'],
                             anova[anova['Power'] == '200']['Etching Rate'])
print('F-value:', f_value)
print('P-value:', p_value)

F-value: 36.87895470100505
P-value: 7.506584272358903e-06


In [260]:
print('Scipy-F-value:', f_value)
print('Scipy-P-value:', p_value)
print(" ")
print('My-F-value:', F)
print('My-P-value:', my_p)

Scipy-F-value: 36.87895470100505
Scipy-P-value: 7.506584272358903e-06
 
My-F-value: 36.87895470100503
My-P-value: 7.5065842723986975e-06


In [261]:
# Pvalue is > TC = Reject null hypothesis
#Meaning Etching rate is determined by power level.