In [1]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

import pandas as pd
import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')


import matplotlib.pyplot as plt
%matplotlib inline


import seaborn as sns

 Lab | Inferential statistics - T-test & P-value

### Instructions

1. *One tailed t-test* - In a packing plant, a machine packs cartons with jars. It is supposed that a new machine will pack faster on the average than the machine currently used. To test that hypothesis, the times it takes each machine to pack ten cartons are recorded. The results, in seconds, are shown in the tables in the file `files_for_lab/ttest_machine.xlsx.`.
   Assume that there is sufficient evidence to conduct the t test, does the data provide sufficient evidence to show if one machine is better than the other?

2. *Matched Pairs Test* - In this challenge we will compare dependent samples of data describing our Pokemon (file `files_for_lab/pokemon.csv`). Our goal is to see whether there is a significant difference between each Pokemon's defense and attack scores. Our hypothesis is that the defense and attack scores are equal. Compare the two columns to see if there is a statistically significant difference between them and comment your result.


# Inferential statistics - ANOVA

Note: The following lab is divided in 2 sections.

## Part 1

In this activity, we will look at another example. Your task is to understand the problem and write down all the steps to set up ANOVA. After the next lesson, we will ask you to solve this problem using Python. Here are the steps that you would need to work on:
    - Null hypothesis
    - Alternate hypothesis
    - Level of significance
    - Test statistic
    - P-value
    - F table
    


### Context

In this challenge,we will return to the Pokemon dataset.   We want to understand whether there are significant differences among various types of pokemons' Total value, i.e. Grass vs Poison vs Fire vs Dragon... There are many types of pokemons which makes it a perfect use case for ANOVA. (file `files_for_lab/pokemon.csv`)
First let's obtain the unique values of the pokemon types.
Second we will create a list named pokemon_totals to contain the Total values of each unique type of pokemons.
Third we run ANOVA test on pokemon_totals.


- State the null hypothesis
- State the alternate hypothesis
- What is the significance level
- What are the degrees of freedom of model, error terms, and total DoF



## Part 2


- What conclusions can you draw from the experiment and why?
- Interpret the ANOVA test result. Is the difference significant?


# T-Test & P_Value

In [2]:
# 1. one tailed t-test
ttest_machine = pd.read_csv(r'C:\Users\Pedro Gomes\Desktop\Filipa_Ironhack\Labs\ML\lab-t-tests-p-values\files_for_lab\ttest_machine.txt', sep = ' ')
ttest_machine.describe()

#Since we have 2 independent variables we have to use the ttest_ind

from scipy.stats import ttest_ind #for 2 independent variables )

# Creat 2 assrays with the columns we want 
old_machine_data = np.array(ttest_machine['Old_machine'])
new_machine_data = np.array(ttest_machine['New_machine'])

# Perform one-tailed t-test with the 2 list we created
t_statistic, p_value = ttest_ind(old_machine_data,new_machine_data )

# Specify the significance level 5%
alpha = 0.05

# Check if p-value is less than alpha for a one-tailed test

print ('The Pvalue is:', (p_value))

if p_value < alpha:
    print("Since the P_value is lower 5% we reject the null hypothesis, we means we are 95% confident that at least one of the machines is faster then the other")
    
else:
    print("Since the P_value is higher then the alfa we fail to reject the null hypothesis.")

 # Answer: We can say that 1 is better then the other because the means are different, but we cannot say wich one is better   

The Pvalue is: 0.0032111425007745158
Since the P_value is lower 5% we reject the null hypothesis, we means we are 95% confident that at least one of the machines is faster then the other


In [3]:
pokemon_data = pd.read_csv(r'C:\Users\Pedro Gomes\Desktop\Filipa_Ironhack\Labs\ML\lab-t-tests-p-values\files_for_lab\pokemon.txt')
pokemon_data

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [4]:
# 2. Matched Pairs Test
# H0 or the null hypothesis is that the defense and attack scores are equal
# H1 the scores for atack em defence are diferent.

pokemon_attack = np.array(pokemon_data['Attack'])
pokemon_defense = np.array(pokemon_data['Defense'])

# Perform one-tailed t-test with the 2 list we created
t_statistic, p_value = ttest_ind(pokemon_attack,pokemon_defense )

# Specify the significance level 5%
alpha = 0.05

# Check if p-value is less than alpha for a one-tailed test

print ('The Pvalue is:', (p_value))

if p_value < alpha:
    print("Since the P_value is lower 5% we reject the null hypothesis, we means we are 95% confident that the mean scores of pokemon attack and defense are diferent")
    
else:
    print("Since the P_value is higher then the alfa we fail to reject the null hypothesis.")


The Pvalue is: 0.0012123980547321454
Since the P_value is lower 5% we reject the null hypothesis, we means we are 95% confident that the mean scores of pokemon attack and defense are diferent


# Inferential Statistics - ANOVA 

In [5]:
# Part 1

# so the null hypothesis or H0 is that the total means for every unique type of pokemon are the same 
# the alternate hypothesis or H1 is where at least one mean is diferent
# alfa is = 5%


In [6]:
# unique types of pokemon values for each pokemon 

print(pokemon_data['Type 1'].unique())
print(pokemon_data['Type 2'].unique())

# On type 2 there are a lot of Nan values, so I will not use it. 

print(pokemon_data.value_counts(['Type 2']))
print(pokemon_data.isna().sum()/len(pokemon_data))

['Grass' 'Fire' 'Water' 'Bug' 'Normal' 'Poison' 'Electric' 'Ground'
 'Fairy' 'Fighting' 'Psychic' 'Rock' 'Ghost' 'Ice' 'Dragon' 'Dark' 'Steel'
 'Flying']
['Poison' nan 'Flying' 'Dragon' 'Ground' 'Fairy' 'Grass' 'Fighting'
 'Psychic' 'Steel' 'Ice' 'Rock' 'Dark' 'Water' 'Electric' 'Fire' 'Ghost'
 'Bug' 'Normal']
Type 2  
Flying      97
Ground      35
Poison      34
Psychic     33
Fighting    26
Grass       25
Fairy       23
Steel       22
Dark        20
Dragon      18
Rock        14
Water       14
Ice         14
Ghost       14
Fire        12
Electric     6
Normal       4
Bug          3
dtype: int64
#             0.0000
Name          0.0000
Type 1        0.0000
Type 2        0.4825
Total         0.0000
HP            0.0000
Attack        0.0000
Defense       0.0000
Sp. Atk       0.0000
Sp. Def       0.0000
Speed         0.0000
Generation    0.0000
Legendary     0.0000
dtype: float64


In [8]:
# Do the list with the value counts os each unique type 1 pokemon
#there are 19 unique types in type 2 one of them is nan so we can drop it.

unique_types = list(set(pokemon_data['Type 1'].unique().tolist() + pokemon_data['Type 2'].dropna().unique().tolist()))
unique_types

['Fire',
 'Psychic',
 'Grass',
 'Dragon',
 'Fairy',
 'Bug',
 'Rock',
 'Fighting',
 'Ghost',
 'Ground',
 'Steel',
 'Poison',
 'Normal',
 'Water',
 'Electric',
 'Flying',
 'Ice',
 'Dark']

In [10]:
#looping through all unique types in type 1 and 2 and add them do a list 
pokemon_totals=[]

for t in unique_types:
    
    pokemon_groups = pokemon_data[(pokemon_data['Type 1'] == t) | (pokemon_data['Type 2'] == t)]['Total']
    
    pokemon_totals.append(pokemon_groups.values)

In [13]:
from scipy import stats
from scipy.stats import f_oneway

f_value, p_value = stats.f_oneway(*pokemon_totals)
print ('the value of P is:', p_value)
print('the value of f is:', f_value)

the value of P is: 2.6457458815984803e-15
the value of f is: 6.6175382960055344


In [None]:
# As we Can see the p value is less then 0,5 which means we reject the null hypothesis, which means we are 95% confident 
# that the means of the pokemon type are segnificsantly different, but we cannot know how different and which ones are different