In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 100)

import scipy.stats as st
import statsmodels.stats.proportion as sm

In [2]:
#  Importing and preparing Dataset

poll = pd.read_excel(r'COMPLETE 'CLEAN_POLL.XLSX' FILE PATH')
poll = poll.drop(['Unnamed: 0'],axis=1)

poll.head()

Unnamed: 0,ID,Male?,Age,Home population,Country,Qty medic.,% med used 1 month,% med used 6 month,% med used 12 month,% med expire unusefull,have heart,have mental,have breathing,have pressure,have cancer,"Colesterol, diabetes",have general,have dental,have alergies,have comments,extra heart,extra mental,extra breath,extra pressure,extra cancer,extra diabetes colesterol,extra general,extra dental,extra alergie,extra comment,will donate?,will deliver?,most pilds?,comment
0,94016141,Male,21 - 35,0 - 1,Portugal,21 - 27,0-20%,41-60%,41-60%,41-60%,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,YES,YES,YES,0
1,94016315,Female,76 - 85,0 - 1,Argentina,4 - 7,81-100%,81-100%,81-100%,0-20%,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,YES,YES,YES,0
2,94016388,Female,21 - 35,0 - 1,Francia,4 - 7,41-60%,0-20%,0-20%,21-40%,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,YES,YES,YES,0
3,94016392,Male,36 - 50,2 - 4,Spain,8 - 11,21-40%,41-60%,0-20%,0-20%,0,0,0,0,0,0,1,0,1,"Nerve painful treatment, children vitamins",0,0,0,0,0,0,0,0,0,0,YES,YES,YES,roque.ourense@gmail.com
4,94016414,Male,36 - 50,0 - 1,Austria,12 - 15,0-20%,21-40%,0-20%,41-60%,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,1,0,0,YES,NO,YES,0


In [3]:
#   *********************  STATS  ***********************

In [4]:
#  Creating subset for Qty Med stats

avg_med = poll['Qty medic.']
avg_med.replace(['0 - 3','4 - 7','8 - 11','12 - 15','16 - 20','21 - 27','28 - 35','36 <'], [1,2,3,4,5,6,7,8], inplace=True)

In [5]:
#    ******  QTY MEDICINE RANGE P-VALUE  *********

n = 30
print("Sample qty: ", n, '\n')

# H0 : mu_med = 4 (12-15) -> H1 : mu_med < 4 (12-15) -> Significance 5%
avg_med_sample1 = avg_med.sample(n)
print("avg med sample 1 mean: ", avg_med_sample1.mean())
print("avg med sample 1 std: ", avg_med_sample1.std())
print(st.ttest_1samp(avg_med_sample1, 4))
print('Low P-Value means reject Ho, avg medicine stock is no between 12-15un', '\n')

# H0 : mu_med = 3 (8-11) -> H1 : mu_med < 3 (8-11) -> Significance 5%
avg_med_sample2 = avg_med.sample(n)
print("avg med sample 2 mean: ", avg_med_sample2.mean())
print("avg med sample 2 std: ", avg_med_sample2.std())
print(st.ttest_1samp(avg_med_sample2, 3))
print('High P-Value means accept Ho, avg medicine stock is between 8-11un', '\n')

Sample qty:  30 

avg med sample 1 mean:  2.9
avg med sample 1 std:  1.1846722229638327
Ttest_1sampResult(statistic=-5.085751160336581, pvalue=1.9986302400153327e-05)
Low P-Value means reject Ho, avg medicine stock is no between 12-15un 

avg med sample 2 mean:  3.033333333333333
avg med sample 2 std:  1.5196036990935664
Ttest_1sampResult(statistic=0.12014592090290317, pvalue=0.9051960834865327)
High P-Value means accept Ho, avg medicine stock is between 8-11un 



In [6]:
#  Creating subset for Unusefull Med

unused_med = poll['% med expire unusefull']
unused_med.replace(['0-20%','21-40%','41-60%','61-80%','81-100%'], [1,2,3,4,5], inplace=True)

In [7]:
#    ******  UNUSEFULL MEDICINE RANGE P-VALUE  *********

n = 40
print("Sample qty: ", n, '\n')

# H0 : mu_unusefull = 4 (61-80%) -> H1 : mu_unusefull < 3 (61-80%) -> Significance 5%
avg_unusefull_sample0 = unused_med.sample(n)
print("avg med sample 1 mean: ", avg_unusefull_sample0.mean())
print("avg med sample 1 std: ", avg_unusefull_sample0.std())
print(st.ttest_1samp(avg_unusefull_sample0, 4))
print('Low P-Value means reject Ho, avg unusefull stock is not between 61-80%', '\n')

# H0 : mu_unusefull = 3 (41-60%) -> H1 : mu_unusefull < 3 (41-60%) -> Significance 5%
avg_unusefull_sample1 = unused_med.sample(n)
print("avg med sample 1 mean: ", avg_unusefull_sample1.mean())
print("avg med sample 1 std: ", avg_unusefull_sample1.std())
print(st.ttest_1samp(avg_unusefull_sample1, 3))
print('Low P-Value means reject Ho, avg unusefull stock is not between 41-60%', '\n')

# H0 : mu_unusefull = 2 (21-40%) -> H1 : mu_unusefull < 2 (21-40%) -> Significance 5%
avg_unusefull_sample2 = unused_med.sample(n)
print("avg med sample 2 mean: ", avg_unusefull_sample2.mean())
print("avg med sample 2 std: ", avg_unusefull_sample2.std())
print(st.ttest_1samp(avg_unusefull_sample2, 2))
print('High P-Value means accept Ho, avg medicine stock is between 21-40%', '\n')

# H0 : mu_unusefull = 1 (0-20%) -> H1 : mu_unusefull < 1 (0-20%) -> Significance 5%
avg_unusefull_sample3 = unused_med.sample(n)
print("avg med sample 3 mean: ", avg_unusefull_sample3.mean())
print("avg med sample 3 std: ", avg_unusefull_sample3.std())
print(st.ttest_1samp(avg_unusefull_sample3, 1))
print('High P-Value means accept Ho, avg medicine stock is not between 0-20%', '\n')

Sample qty:  40 

avg med sample 1 mean:  1.525
avg med sample 1 std:  0.8766925023004043
Ttest_1sampResult(statistic=-17.854919914063316, pvalue=2.276224825606646e-20)
Low P-Value means reject Ho, avg unusefull stock is not between 61-80% 

avg med sample 1 mean:  1.525
avg med sample 1 std:  0.8766925023004044
Ttest_1sampResult(statistic=-10.640810857876117, pvalue=4.2805171566544023e-13)
Low P-Value means reject Ho, avg unusefull stock is not between 41-60% 

avg med sample 2 mean:  1.75
avg med sample 2 std:  1.1712364713391024
Ttest_1sampResult(statistic=-1.349974039210405, pvalue=0.1848077868736682)
High P-Value means accept Ho, avg medicine stock is between 21-40% 

avg med sample 3 mean:  1.75
avg med sample 3 std:  1.1491356841177722
Ttest_1sampResult(statistic=4.127812368731931, pvalue=0.00018651858576236287)
High P-Value means accept Ho, avg medicine stock is not between 0-20% 



In [8]:
#  Creating female & mael subset for Qty Med stats

avg_med_female = poll[poll['Male?']=='Female']['Qty medic.']
avg_med_female.replace(['0 - 3','4 - 7','8 - 11','12 - 15','16 - 20','21 - 27','28 - 35','36 <'], [1,2,3,4,5,6,7,8], inplace=True)

avg_med_male = poll[poll['Male?']=='Male']['Qty medic.']
avg_med_male.replace(['0 - 3','4 - 7','8 - 11','12 - 15','16 - 20','21 - 27','28 - 35','36 <'], [1,2,3,4,5,6,7,8], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  avg_med_female.replace(['0 - 3','4 - 7','8 - 11','12 - 15','16 - 20','21 - 27','28 - 35','36 <'], [1,2,3,4,5,6,7,8], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  avg_med_male.replace(['0 - 3','4 - 7','8 - 11','12 - 15','16 - 20','21 - 27','28 - 35','36 <'], [1,2,3,4,5,6,7,8], inplace=True)


In [9]:
n = 21
print("Sample qty: ", n, '\n')

avg_med_female1 = avg_med_female.sample(n)
avg_med_male1 = avg_med_male.sample(n)

print(st.ttest_rel(avg_med_female1,avg_med_male1))
print(st.ttest_rel(avg_med_female1,avg_med_male1, alternative='less'))
print(st.ttest_ind(avg_med_female1,avg_med_male1,equal_var=False))

Sample qty:  21 

Ttest_relResult(statistic=-1.3302757262512923, pvalue=0.19839839592792793)
Ttest_relResult(statistic=-1.3302757262512923, pvalue=0.09919919796396397)
Ttest_indResult(statistic=-1.3451475166470612, pvalue=0.18620991753721)


In [10]:
#  Creating subset for Unusefull Med

unused_med_female = poll[poll['Male?']=='Female']['% med expire unusefull']
unused_med_female.replace(['0-20%','21-40%','41-60%','61-80%','81-100%'], [1,2,3,4,5], inplace=True)

unused_med_male = poll[poll['Male?']=='Male']['% med expire unusefull']
unused_med_male.replace(['0-20%','21-40%','41-60%','61-80%','81-100%'], [1,2,3,4,5], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unused_med_female.replace(['0-20%','21-40%','41-60%','61-80%','81-100%'], [1,2,3,4,5], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unused_med_male.replace(['0-20%','21-40%','41-60%','61-80%','81-100%'], [1,2,3,4,5], inplace=True)


In [11]:
n = 21
print("Sample qty: ", n, '\n')

unused_med_female1 = unused_med_female.sample(n)
unused_med_male1 = unused_med_male.sample(n)

print(st.ttest_rel(unused_med_female1,unused_med_male1))
print(st.ttest_rel(unused_med_female1,unused_med_male1, alternative='greater'))
print(st.ttest_ind(unused_med_female1,unused_med_male1,equal_var=False))

Sample qty:  21 

Ttest_relResult(statistic=0.6779076806833005, pvalue=0.5055981116688764)
Ttest_relResult(statistic=0.6779076806833005, pvalue=0.2527990558344382)
Ttest_indResult(statistic=0.7324484191363096, pvalue=0.4692170963124246)
