In [3]:
#import the libraries
import pandas as pd 
import numpy as np
from scipy import stats

## Question - 1

#### A F&B manager wants to determine whether there is any significant difference in the diameter of the cutlet between two units. A randomly selected sample of cutlets was collected from both units and measured? Analyze the data and draw inferences at 5% significance level. Please state the assumptions and tests that you carried out to check validity of the assumptions.

File : Cutlets.csv

#### Assumption
* For H0 - There is no significant difference in the diameter of the cutlets between the two units
    > H0: Unit A = Unit B
* For HA - There is a difference in the diameter of the cutlets between the two units i.e., it is lesser or greater than each other
    > HA: Unit A <> Unit B
* Since there are 2 samples provided and we need to check the condition if it is equal or not, so we need check for 2 sample 2 tail test

In [39]:
# Reading the data from the file 'Q9_a.csv'
q1_data = pd.read_csv("Cutlets.csv")
q1_data.head()

Unnamed: 0,Unit A,Unit B
0,6.809,6.7703
1,6.4376,7.5093
2,6.9157,6.73
3,7.3012,6.7878
4,7.4488,7.1522


In [40]:
# 2-Sample 2-tail test
q1_data['Unit A'].head()
q1_data['Unit B'].head()

0    6.7703
1    7.5093
2    6.7300
3    6.7878
4    7.1522
Name: Unit B, dtype: float64

In [41]:
stats.ttest_ind(q1_data['Unit A'], q1_data['Unit B'])

Ttest_indResult(statistic=0.7228688704678063, pvalue=0.47223947245995)

In [42]:
p_value_1 = stats.ttest_ind(q1_data['Unit A'], q1_data['Unit B'])[1]  # 2-Sample, 2-Tail Test
p_value_1 # 0.47223947245995

0.47223947245995

In [None]:
# α = 0.05 (At 5% significance level)
# Since p_value i.e., 0.47223947245995 is greater than α i.e., 0.05 we fail to reject the Null Hypothesis and so Null Hypothesis H0 stands good
# Interpretation: Unit A and Unit B are of the same size i.e. Unit A = Unit B.

## Question - 2

#### A hospital wants to determine whether there is any difference in the average Turn Around Time (TAT) of reports of the laboratories on their preferred list. They collected a random sample and recorded TAT for reports of 4 laboratories. TAT is defined as sample collected to report dispatch.

#### Analyze the data and determine whether there is any difference in average TAT among the different laboratories at 5% significance level.

File: LabTAT.csv

#### Assumption
* For H0 - There is no difference in the average Turn Around Time (TAT) of reports of the laboratories on their preferred list
    > H0:  Laboratory 1 = Laboratory 2 = Laboratory 3 = Laboratory 4
* For HA - There is a difference in the average Turn Around Time (TAT) of reports of the laboratories on their preferred list
    > HA: Laboratory 1 <> Laboratory 2 <> Laboratory 3 <> Laboratory 4
* Since there are 4 samples provided and we need to check the condition if it is equal or not, so we need check for one way ANOVA

In [43]:
#import the libraries
import pandas as pd 
import numpy as np
from scipy import stats

In [44]:
# Reading the data from the file 'Q9_a.csv'
q2_data = pd.read_csv("LabTAT.csv")
q2_data.head()

Unnamed: 0,Laboratory 1,Laboratory 2,Laboratory 3,Laboratory 4
0,185.35,165.53,176.7,166.13
1,170.49,185.91,198.45,160.79
2,192.77,194.92,201.23,185.18
3,177.33,183.0,199.61,176.42
4,193.41,169.57,204.63,152.6


In [7]:
stats.f_oneway(q2_data['Laboratory 1'], q2_data['Laboratory 2'], q2_data['Laboratory 3'], q2_data['Laboratory 4'])

F_onewayResult(statistic=118.70421654401437, pvalue=2.1156708949992414e-57)

In [8]:
p_value_2 = stats.f_oneway(q2_data['Laboratory 1'], q2_data['Laboratory 2'], q2_data['Laboratory 3'], q2_data['Laboratory 4'])[1]
p_value_2 # 2.1156708949992414e-57

2.1156708949992414e-57

In [18]:
# α = 0.05 (At 5% significance level)
# Since p_value of 2.1156708949992414e-57 is lesser than α we reject the Null Hypothesis and so Alternate Hypothesis HA holds good
# Interpretation: There is a difference in the average Turn Around Time (TAT) of reports of the laboratories on their preferred list
# i.e. Laboratory 1 <> Laboratory 2 or Laboratory 2 <> Laboratory 3 or Laboratory 3 <> Laboratory 4 or Laboratory 1 <> Laboratory 3
# So at least 1 of the laboratories are different.

## Question - 3

#### Sales of products in four different regions is tabulated for males and females. Find if male-female buyer rations are similar across regions.
#### Analyze the data and determine whether there is any difference in average TAT among the different laboratories at 5% significance level.
>                     East      West    North   South
>           Males	50	  142       131     70
>           Females	550	  351       480     350


File: BuyerRatio.csv

#### Assumption
* For H0 - All proportions are equal
    > H0:  Laboratory 1 = Laboratory 2 = Laboratory 3 = Laboratory 4
* For HA - All proportions are not equal
    > HA: Laboratory 1 <> Laboratory 2 <> Laboratory 3 <> Laboratory 4
* We need to check the pvalue using stats.chi2_contingency.
We check if pvalue is lesser than α = 0.05 then reject Null Hypothesis else accept Null Hypothesis.

In [33]:
#import the libraries
import pandas as pd 
import numpy as np
from scipy import stats
from scipy.stats import chi2_contingency

In [34]:
# Reading the data from the file 'Q9_a.csv'
data_q3 = pd.read_csv("BuyerRatio.csv")
data_q3 = data_q3.set_index('Observed Values')
data_q3

Unnamed: 0_level_0,East,West,North,South
Observed Values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Males,50,142,131,70
Females,435,1523,1356,750


In [35]:
stats.chi2_contingency(data_q3) # test statistic, p-value,df,expected frequency

Chi2ContingencyResult(statistic=1.595945538661058, pvalue=0.6603094907091882, dof=3, expected_freq=array([[  42.76531299,  146.81287862,  131.11756787,   72.30424052],
       [ 442.23468701, 1518.18712138, 1355.88243213,  747.69575948]]))

In [36]:
stats.chi2_contingency(data_q3)[1] # 0.6603094907091882

0.6603094907091882

In [29]:
# p-value 0.6603094907091882 > 0.05 so we fail to reject null hypothesis. Accept H0: All proportions are equal.

## Question - 4

#### TeleCall uses 4 centers around the globe to process customer order forms. They audit a certain %  of the customer order forms. Any error in order form renders it defective and has to be reworked before processing.  The manager wants to check whether the defective %  varies by centre. Please analyze the data at 5% significance level and help the manager draw appropriate inferences

File: CustomerOrderForm.csv

#### Assumption
* For H0 - defective % doesn't varies by center
    > H0:  (Mean of centers equal, C1=C2=C3=C4)
* For HA - defective % varies by center
    > HA:  C1 to C4 varies
* We need to check the pvalue using stats.chi2_contingency.
We check if pvalue is lesser than significance percentage (5%) i.e., α = 0.05 then reject Null Hypothesis else accept Null Hypothesis.

In [67]:
#import the libraries
import pandas as pd 
import numpy as np
from scipy import stats

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [104]:
# Reading the data from the file 'Q9_a.csv'
data_q4 = pd.read_csv("CustomerOrderForm.csv")
#data_q3 = data_q3.set_index('Observed Values')
data_q4.head()

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,Error Free,Error Free,Defective,Error Free
1,Error Free,Error Free,Error Free,Defective
2,Error Free,Defective,Defective,Error Free
3,Error Free,Error Free,Error Free,Error Free
4,Error Free,Error Free,Defective,Error Free


In [103]:
data4 = data_q4
#.replace({'Error Free', 'Defective'},{1,0})
data4 = data_q4.replace({'Error Free': 1, 'Defective': 0})
data4.head()

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,1,1,0,1
1,1,1,1,0
2,1,0,0,1
3,1,1,1,1
4,1,1,0,1


##### Using stats.f_oneway

In [84]:
# Using stats.f_oneway
p_value_4 = stats.f_oneway(data4['Phillippines'],data4['Indonesia'],data4['Malta'],data4['India'])
p_value_4

F_onewayResult(statistic=1.286168556089167, pvalue=0.2776780955705948)

In [83]:

p_value_4 = stats.f_oneway(data4['Phillippines'],data4['Indonesia'],data4['Malta'],data4['India'])[1]
p_value_4 # 0.2776780955705948
# p-value 0.2776780955705948 > 0.05 (Significance given as 5%) so we fail to reject null hypothesis. Accept H0: defective % doesn't varies by center.

0.2776780955705948

##### Using chi2_contingency

In [99]:
# Using chi2_contingency

# Creating frequency table
data4_frequency =data4.stack().reset_index()
data4_frequency = data4_frequency.rename(columns = {'level_1':'Country',0:'Category'})
data4_frequency.head()
data4_frequency = pd.crosstab(data4_frequency['Country'],data4_frequency['Category'])
data4_frequency.head()


Category,0,1
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
India,20,280
Indonesia,33,267
Malta,31,269
Phillippines,29,271


In [97]:
chi2_contingency(data4_frequency)

Chi2ContingencyResult(statistic=3.8589606858203545, pvalue=0.27710209912331435, dof=3, expected_freq=array([[ 28.25, 271.75],
       [ 28.25, 271.75],
       [ 28.25, 271.75],
       [ 28.25, 271.75]]))

In [98]:
chi2_contingency(data4_frequency)[1]

0.27710209912331435

In [100]:
# p-value 0.27710209912331435 > 0.05 so we fail to reject null hypothesis. Accept H0: defective % doesn't varies by center