# Variable Distribution Type Tests (Gaussian)
- Shapiro-Wilk Test
- D’Agostino’s K^2 Test
- Anderson-Darling Test

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(font_scale=2, palette= "viridis")
from scipy import stats

In [None]:
# Read the pulse dataset from disk, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/pulse_data.csv')
data.head(n=5)

## Visual Normality Check 

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the Height column.
data['Height'].describe()

In [None]:
# Per-column skewness (0 for a perfectly symmetric distribution).
# numeric_only=True restricts the computation to numeric columns: since
# pandas 2.0 the default is False, and any non-numeric column makes the
# call raise a TypeError.
# NOTE(review): presumably the CSV contains categorical/text columns -- confirm.
data.skew(numeric_only=True)

In [None]:
# Per-column excess kurtosis (Fisher's definition: 0 for a normal distribution).
# numeric_only=True restricts the computation to numeric columns: since
# pandas 2.0 the default is False, and any non-numeric column makes the
# call raise a TypeError.
# NOTE(review): presumably the CSV contains categorical/text columns -- confirm.
data.kurtosis(numeric_only=True)

In [None]:
# Histogram of Height on a 10x8 inch figure: a first visual check of the
# distribution's shape.
plt.figure(figsize=(10, 8))
sns.histplot(x='Height', data=data)
plt.show()

In [None]:
# Histogram of Age with a kernel density estimate overlaid, drawn on a
# 10x8 inch figure.
plt.figure(figsize=(10, 8))
sns.histplot(x='Age', kde=True, data=data)
plt.show()

In [None]:
# Q-Q (probability) plot of Age against the theoretical normal distribution.
# Passing plot=plt makes scipy draw the sample quantiles and the fitted
# reference line on the current matplotlib figure.
plt.figure(figsize=(12, 8))
stats.probplot(data['Age'], dist='norm', plot=plt)
plt.show()

__If the data are normally distributed, the points should fall approximately along the red reference line. Points that lie far from the line indicate deviations from normality.__

In [None]:
# Q-Q (probability) plot of Height against the theoretical normal distribution.
# Passing plot=plt makes scipy draw the sample quantiles and the fitted
# reference line on the current matplotlib figure.
plt.figure(figsize=(12, 8))
stats.probplot(data['Height'], dist='norm', plot=plt)
plt.show()

__If the data are normally distributed, the points should fall approximately along the red reference line. Points that lie far from the line indicate deviations from normality.__

## Shapiro-Wilk Test
Tests whether a data sample has a Gaussian distribution/normal distribution.

### Assumptions
Observations in each sample are independent and identically distributed (iid).

### Interpretation
- H0: The sample has a Gaussian/normal distribution.
- Ha: The sample does not have a Gaussian/normal distribution.

In [None]:
# Raw Shapiro-Wilk result (test statistic and p-value) for the Age column.
stats.shapiro(data.Age)

In [None]:
# Shapiro-Wilk test on Age, interpreted at a 5% significance level.
# The function is reached through the `stats` namespace: only
# `from scipy import stats` is imported, so a bare `shapiro(...)` call
# raises NameError.
stat, p_value = stats.shapiro(data['Age'])
print(f'statistic = {stat}, p-value = {p_value}')
alpha = 0.05  # significance level
if p_value > alpha:
    # p-value above alpha: no evidence against H0 (normality).
    print("The sample has normal distribution(Fail to reject the null hypothesis, the result is not significant)")
else:
    # p-value at or below alpha: H0 (normality) is rejected.
    print("The sample does not have a normal distribution(Reject the null hypothesis, the result is significant)")

## D’Agostino’s K^2 Test
Tests whether a data sample has a Gaussian distribution/normal distribution.

### Assumptions
Observations in each sample are independent and identically distributed (iid).

### Interpretation
- H0: The sample has a Gaussian/normal distribution.
- Ha: The sample does not have a Gaussian/normal distribution.

In [None]:
# Raw D'Agostino K^2 result (test statistic and p-value) for the Age column.
stats.normaltest(data.Age)

In [None]:
# D'Agostino K^2 test on Age, interpreted at a 5% significance level.
alpha = 0.05  # significance level
stat, p_value = stats.normaltest(data['Age'])
print(f'statistic = {stat}, p-value = {p_value}')
# A p-value above alpha means H0 (normality) is not rejected; otherwise it is.
print(
    "The sample has normal distribution(Fail to reject the null hypothesis, the result is not significant)"
    if p_value > alpha
    else "The sample does not have a normal distribution(Reject the null hypothesis, the result is significant)"
)

__Remember__
- If Data Is Gaussian:
	- Use Parametric Statistical Methods
- Else:
	- Use Nonparametric Statistical Methods