In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
from scipy.stats import shapiro, levene, ttest_ind, mannwhitneyu, kruskal

## Statistical analysis of the "tips" dataset
# Tip -  smoker, sex, time, size analysis
# Bill - smoker, sex, time, size analysis
# ANOVA code 

In [2]:
# Load the seaborn example dataset. The dataset name is lowercase ("tips");
# the mixed-case spelling presumably only resolves against a case-insensitive
# local cache and is not portable to a fresh environment.
df = sns.load_dataset("tips")

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Preview the last five rows.
df.tail(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [5]:
# Summary statistics, transposed so each described column is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [10]:
# Discovering the data

In [6]:
# Overall mean of the total_bill column.
df.loc[:, "total_bill"].mean()

19.785942622950824

In [7]:
# Overall mean of the tip column.
df.loc[:, "tip"].mean()

2.9982786885245902

In [9]:
# Number of rows per value of sex.
df.loc[:, "sex"].value_counts()

Male      157
Female     87
Name: sex, dtype: int64

In [12]:
# Number of rows per value of smoker.
df.loc[:, "smoker"].value_counts()

No     151
Yes     93
Name: smoker, dtype: int64

In [13]:
# Number of rows per value of day.
df.loc[:, "day"].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [14]:
# Number of rows per value of time.
df.loc[:, "time"].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [15]:
# Number of rows per value of size. Bracket/.loc access is required here:
# attribute access (df.size) would return the DataFrame's element count.
df.loc[:, "size"].value_counts()

2    156
3     38
4     37
5      5
1      4
6      4
Name: size, dtype: int64

In [18]:
# Mean total bill for each value of sex.
df.groupby("sex")[["total_bill"]].agg(["mean"])

Unnamed: 0_level_0,total_bill
Unnamed: 0_level_1,mean
sex,Unnamed: 1_level_2
Female,18.056897
Male,20.744076


In [19]:
# Mean total bill for each value of time.
df.groupby("time")[["total_bill"]].agg(["mean"])

Unnamed: 0_level_0,total_bill
Unnamed: 0_level_1,mean
time,Unnamed: 1_level_2
Dinner,20.797159
Lunch,17.168676


In [22]:
# Mean total bill for each value of size.
df.groupby("size")[["total_bill"]].agg(["mean"])

Unnamed: 0_level_0,total_bill
Unnamed: 0_level_1,mean
size,Unnamed: 1_level_2
1,7.2425
2,16.448013
3,23.277632
4,28.613514
5,30.068
6,34.83


In [23]:
# Re-display the first five rows before the tip analyses.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# Tip Sex Analysis

In [30]:
# Mean tip for each value of sex.
df.groupby("sex")[["tip"]].agg(["mean"])

Unnamed: 0_level_0,tip
Unnamed: 0_level_1,mean
sex,Unnamed: 1_level_2
Female,2.833448
Male,3.089618


In [31]:
# Shapiro-Wilk normality test on the tips of rows where sex is "Female".
female_tips = df.loc[df["sex"] == "Female", "tip"]
test_stat, pvalue = shapiro(female_tips)
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 0.9568 p-value = 0.0054


In [32]:
# Comment: p-value < 0.05, not a normal distribution - then non parametric tests can be implemented 

In [33]:
# Shapiro-Wilk normality test on the tips of rows where sex is "Male".
male_tips = df.loc[df["sex"] == "Male", "tip"]
test_stat, pvalue = shapiro(male_tips)
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 0.8759 p-value = 0.0000


In [34]:
# Comment: p-value < 0.05, not a normal distribution - then non parametric tests can be implemented (in this case mannwhitneyu)

In [35]:
# H0 = The tips given by Female and Male customers follow the same distribution
#      (neither group tends to tip more than the other).
# H1 = The tips given by Female and Male customers follow different distributions.
# Mann-Whitney U is a rank-based test, so the hypotheses are stated about the
# distributions rather than about the means.

# alternative="two-sided" is passed explicitly so the p-value always matches the
# two-sided H1 above instead of depending on the installed SciPy's default.
test_stat, pvalue = mannwhitneyu(df.loc[df["sex"] == "Female", "tip"],
                                 df.loc[df["sex"] == "Male", "tip"],
                                 alternative="two-sided")
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 6369.5000 p-value = 0.3834


In [36]:
# p-value (0.3834) > 0.05 - we fail to reject H0: there is no significant difference between the tips male and female customers give.

# Tip - Time analysis

In [37]:
# Shapiro-Wilk normality test on the tips of rows where time is "Lunch".
lunch_tips = df.loc[df["time"] == "Lunch", "tip"]
test_stat, pvalue = shapiro(lunch_tips)
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 0.8687 p-value = 0.0000


In [38]:
# Comment: p-value < 0.05, not a normal distribution - then non parametric tests can be implemented (in this case mannwhitneyu)

In [39]:
# Shapiro-Wilk normality test on the tips of rows where time is "Dinner".
dinner_tips = df.loc[df["time"] == "Dinner", "tip"]
test_stat, pvalue = shapiro(dinner_tips)
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 0.9009 p-value = 0.0000


In [40]:
# Comment: p-value < 0.05, not a normal distribution - then non parametric tests can be implemented (in this case mannwhitneyu)

In [42]:
# H0 = There is no significant difference between the tips given at Lunch and at Dinner
# H1 = There is a significant difference between the tips given at Lunch and at Dinner

# alternative="two-sided" is passed explicitly so the p-value always matches the
# two-sided H1 above instead of depending on the installed SciPy's default.
test_stat, pvalue = mannwhitneyu(df.loc[df["time"] == "Lunch", "tip"],
                                 df.loc[df["time"] == "Dinner", "tip"],
                                 alternative="two-sided")
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 4905.0000 p-value = 0.0288


In [43]:
# p-value < 0.05 - there is a significant difference, so we reject H0 in favour of H1

# Tip - Size Analysis

In [44]:
# Add a constant placeholder column named size_stat, then display the frame.
df = df.assign(size_stat=1)
df



Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,size_stat
0,16.99,1.01,Female,No,Sun,Dinner,2,1
1,10.34,1.66,Male,No,Sun,Dinner,3,1
2,21.01,3.50,Male,No,Sun,Dinner,3,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1
4,24.59,3.61,Female,No,Sun,Dinner,4,1
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,1
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1
241,22.67,2.00,Male,Yes,Sat,Dinner,2,1
242,17.82,1.75,Male,No,Sat,Dinner,2,1


In [45]:
# Remove the size_stat column. errors="ignore" keeps this cell re-runnable:
# without it, running the cell a second time (or out of order) raises KeyError
# because the column is already gone.
df = df.drop(columns="size_stat", errors="ignore")

In [47]:
def size_check(size):
    """Map a numeric size to one of the bucket labels "1-2", "3" or "4+".

    Returns "3" when size equals 3, "1-2" when size is at most 2, and "4+"
    for every other value.
    """
    if size == 3:
        return "3"
    return "1-2" if size <= 2 else "4+"

In [48]:
# Label every row with its size bucket. Mapping over the single "size" column
# avoids building a full row object per record, which is what the row-wise
# DataFrame.apply(axis=1) form does.
df["size_stat"] = df["size"].map(size_check)

In [49]:
# Check the new size_stat column on the first five rows.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,size_stat
0,16.99,1.01,Female,No,Sun,Dinner,2,1-2
1,10.34,1.66,Male,No,Sun,Dinner,3,3
2,21.01,3.5,Male,No,Sun,Dinner,3,3
3,23.68,3.31,Male,No,Sun,Dinner,2,1-2
4,24.59,3.61,Female,No,Sun,Dinner,4,4+


In [50]:
# Check the new size_stat column on the last five rows.
df.tail(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,size_stat
239,29.03,5.92,Male,No,Sat,Dinner,3,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2,1-2
241,22.67,2.0,Male,Yes,Sat,Dinner,2,1-2
242,17.82,1.75,Male,No,Sat,Dinner,2,1-2
243,18.78,3.0,Female,No,Thur,Dinner,2,1-2


In [51]:
# Shapiro-Wilk normality test on the tips of rows in the "1-2" bucket.
small_party_tips = df.loc[df["size_stat"] == "1-2", "tip"]
test_stat, pvalue = shapiro(small_party_tips)
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 0.9428 p-value = 0.0000


In [52]:
# Comment: p-value < 0.05, not a normal distribution - then non parametric tests can be implemented (in this case kruskal)

In [53]:
# Shapiro-Wilk normality test on the tips of rows in the "3" bucket.
party_of_three_tips = df.loc[df["size_stat"] == "3", "tip"]
test_stat, pvalue = shapiro(party_of_three_tips)
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 0.8009 p-value = 0.0000


In [54]:
# Comment: p-value < 0.05, not a normal distribution - then non parametric tests can be implemented (in this case kruskal)

In [55]:
# Shapiro-Wilk normality test on the tips of rows in the "4+" bucket.
large_party_tips = df.loc[df["size_stat"] == "4+", "tip"]
test_stat, pvalue = shapiro(large_party_tips)
print("Test Stat = %.4f p-value = %.4f" % (test_stat, pvalue))

Test Stat = 0.9449 p-value = 0.0299


In [56]:
# Comment: p-value < 0.05, not a normal distribution - then non parametric tests can be implemented (in this case kruskal)

In [62]:
#H0 = There is no significant difference in the tip between the size groups (1-2, 3, 4+)
#H1 = There is a significant difference in the tip between the size groups (1-2, 3, 4+)




# Kruskal-Wallis H-test comparing the tip samples of the three size_stat buckets.
tip_samples = [df.loc[df["size_stat"] == bucket, "tip"]
               for bucket in ("1-2", "3", "4+")]
stat, p = kruskal(*tip_samples)
print('Statistics=%.3f, p=%.3f' % (stat, p))

Statistics=49.228, p=0.000


In [63]:
# p-value < 0.05 - we reject H0 in favour of H1: the tip differs significantly between the size groups