In [None]:
'''
Howdy! 

In this project, we will be making statistically significant observations on the titanic dataset. 
'''

#What does that mean?

'''
Well, this data set contains a lot of information about some of the passengers on board: their name, age, sex, class, etc., and whether or not they survived.
Your goal is to correlate one of these attributes to whether or not they survived.
'''

#How do I do this?

#STEP 1 - hypothesize
'''
1) You need to choose what your hypothesis is. You are trying to show that an AVERAGE value of individuals of the surviving group and the not surviving group are 
different OR if there are only 2 possible values (like biological sex) then you should try to show that the PROPORTION of those who survived and did not survive are different.

The null hypothesis (H0) is the hypothesis that states there is no difference between two groups. The alternative hypothesis (Ha) says there is a difference.

For example,

H0: P(males surviving) = P(females surviving) Ha: P(males surviving) != P(females surviving)                                   this is a probability problem since there are only 2 biological sexes and cannot be averaged
H0: average age of survivors = average age of non-survivors Ha: average age of survivors != average age of non-survivors       this is an average problem since there are multiple ages and they can be averaged

*P(blah blah) means probability, != means not equal
'''

#STEP 2 - loading, cleaning, and visualizing data
'''
2) You need to isolate those data points and clean the data.
a) Load the data into a Pandas DataFrame
b) Make the column you are going to use along with the results into a dataframe.
c) Clean the data! (Get rid of nans, empty cells, incomplete data, mention how you deal with outliers)
d) Visualize the data however you like.
'''

#STEP 3 - Stats
'''
3) Make the analysis (WE DO MOST OF THIS FOR YOU)
a) Compute a 5 number summary of the data for the entire data set, the data for one sample, and data for the other sample. Print them out!
b) Put your average value OR probability into the matching function defined in the next cell: comparingMean() or comparingProportion(). It will do all the statistical work for you and will output a p-value.

What's a p-value?
Actual definition: the probability that a particular statistical measure, such as the mean or standard deviation, of an assumed probability distribution will be greater than or equal to 
(or less than or equal to in some instances) observed results. (Oxford Languages, https://www.google.com/search?q=define+pvalue&rlz=1C5CHFA_enUS859US859&oq=define+pvalue&aqs=chrome..69i57j0i13j0i13i30l5j0i13i15i30l3.3098j1j7&sourceid=chrome&ie=UTF-8)

What that actually means:
Basically, if the p-value is really low, a difference as big as the one you observed would be very unlikely if the 2 groups you are testing really had the same average or proportion.
In order for your results to be "statistically significant," the p value must be lower than 0.05. 
If it is, that means you have strong evidence to support Ha. Thus you "reject the null hypothesis."
If it is NOT, that means you do not have strong enough evidence to challenge H0. Thus you "fail to reject the null hypothesis."


Also, if you do not make a statistically significant observation, THAT IS OKAY!!! In fact, that would be an interesting thing to report. You can just say, for example, that
there actually is not that much evidence to suggest that men and women had different survival rates. 
'''

#Lastly
'''
HAVE FUN! ASK US QUESTIONS! WE KNOW THAT FOR A LOT OF YOU, THIS CAN BE INTIMIDATING! YOU HAVE A WHOLE MONTH SO TAKE YOUR TIME.

Also, if you are more advanced in Python or in statistics, do some extra visualizations or even write the statistical calculations yourself.
Let this project challenge you.

- tidaltamu

'''




In [17]:
#Here are the functions you will use to calculate your p-value. Click the play button to ensure there are no errors.
#YOU DO NOT NEED TO MODIFY ANYTHING HERE

def comparingProportion(px, m, py, n, alpha=.05, *, num_tests=5):
  """
  Calculates a Bonferroni-corrected, two-tailed pvalue based on comparing two sample proportions

  The two proportions are pooled, a Z-value is computed from the pooled
  proportion, and the two-tailed normal tail area is multiplied by num_tests
  and capped at 1.

  :param float px: sample proportion of group 1
  :param int m: sample size of group 1
  :param float py: sample proportion of group 2
  :param int n: sample size of group 2
  :param float alpha: alpha level (only range-checked here; it does not change the returned pvalue)
  :param int num_tests: number of hypothesis tests the Bonferroni correction accounts for (keyword-only, defaults to 5)
  :returns: the corrected pvalue, never larger than 1
  :raises TypeError: if px, m, py or n is None (a placeholder was not replaced)
  :raises ValueError: if the Central Limit Theorem does not apply
  :raises ValueError: if numerical values are inappropriate 
  """
  from scipy import stats
  import math

  #Unreplaced placeholders: fail with a readable message instead of an opaque comparison error
  if any(value is None for value in (px, m, py, n)):
    raise TypeError("ERROR: px, m, py and n must all be numbers, but at least one is None. Replace every None placeholder with your value.")

  #Incorrect Values
  if not (0 <= px <= 1 and 0 <= py <= 1 and m >=0 and n >= 0 and 0 <= alpha <= 1):
    raise ValueError("ERROR: Incorrect input values. Probability values should be in [0,1]. Sample sizes should be non-negative. Alpha level should be in [0,1].")

  #A Bonferroni factor below 1 would shrink the pvalue instead of correcting it
  if num_tests < 1:
    raise ValueError("ERROR: num_tests should be at least 1.")

  #Central Limit Theorem check for Z-test
  #(this also guarantees m > 0, n > 0 and 0 < pooled proportion < 1, so the division below is safe)
  if(px * m < 5 or (1-px)*m <5 or py * n < 5 or (1-py)*n <5):
    raise ValueError("ERROR: Central Limit Theorem does not apply")
  
  #Calculate average (pooled) proportion between group 1 and group 2
  p = (px*m + py*n)/(m+n)

  #Calculate Z-value
  Z = (px - py) / math.sqrt(p*(1-p) * (1/m + 1/n))

  #Calculate area to right of |Z| times 2 (since Ha should be a 2-tailed test)
  pval = 2*stats.norm.sf(abs(Z))
  
  #Bonferroni Correction (by default one can cherry pick 5 hypothesis tests (1 from each row))
  pval *= num_tests

  #In case p-val exceeds 1 with Bonferroni Correction...
  if pval >1: pval = 1

  return pval


def comparingMean(ux, sx, m, uy, sy, n, alpha=.05, *, num_tests=5):
  """
  Calculates a Bonferroni-corrected, two-tailed pvalue based on comparing two means with a delta of 0

  The Z-value is the difference of the two averages divided by the standard
  error sqrt(sx^2/m + sy^2/n). The two-tailed normal tail area is multiplied
  by num_tests and capped at 1.

  :param float ux: sample average value of group 1
  :param float sx: sample standard deviation of group 1
  :param int m: sample size of group 1
  :param float uy: sample average value of group 2
  :param float sy: sample standard deviation of group 2
  :param int n: sample size of group 2
  :param float alpha: alpha level (only range-checked here; it does not change the returned pvalue)
  :param int num_tests: number of hypothesis tests the Bonferroni correction accounts for (keyword-only, defaults to 5)
  :returns: the corrected pvalue, never larger than 1
  :raises TypeError: if ux, sx, m, uy, sy or n is None (a placeholder was not replaced)
  :raises ValueError: if the Central Limit Theorem does not apply
  :raises ValueError: if numerical values are inappropriate 
  """
  from scipy import stats
  import math

  #Unreplaced placeholders: fail with a readable message instead of an opaque comparison error
  if any(value is None for value in (ux, sx, m, uy, sy, n)):
    raise TypeError("ERROR: ux, sx, m, uy, sy and n must all be numbers, but at least one is None. Replace every None placeholder with your value.")

  #Incorrect Values
  if not (m >=0 and n >= 0 and 0 <= alpha <= 1):
    raise ValueError("ERROR: Incorrect input values. Sample sizes should be non-negative. Alpha level should be in [0,1].")

  #NaN/inf would flow through to a NaN pvalue, and a negative standard deviation would be hidden by squaring
  if not all(math.isfinite(value) for value in (ux, sx, uy, sy)) or sx < 0 or sy < 0:
    raise ValueError("ERROR: Incorrect input values. Averages and standard deviations should be finite numbers. Standard deviations should be non-negative.")

  #A Bonferroni factor below 1 would shrink the pvalue instead of correcting it
  if num_tests < 1:
    raise ValueError("ERROR: num_tests should be at least 1.")

  #Central Limit Theorem check for Z-test (also guarantees m and n are non-zero below)
  if(m < 30 or n < 30):
    raise ValueError("ERROR: Central Limit Theorem does not apply")

  #Standard error of the difference between the two averages
  standard_error = math.sqrt(sx*sx/m + sy*sy/n)

  #With no spread in either group the Z-value is undefined (division by zero)
  if standard_error == 0:
    raise ValueError("ERROR: Incorrect input values. The standard error is 0, so the Z-value cannot be calculated.")

  #Calculate Z-value
  Z = (ux - uy) / standard_error

  #Calculate area to right of |Z| times 2 (since Ha should be a 2-tailed test)
  pval = 2*stats.norm.sf(abs(Z))
  
  #Bonferroni Correction (by default one can cherry pick 5 hypothesis tests (1 from each row))
  pval *= num_tests

  #In case p-val exceeds 1 with Bonferroni Correction...
  if pval >1: pval = 1

  return pval


#This line is only reached after both function definitions above ran without raising, so seeing the message confirms they are defined.
print("No Errors! Both functions are now declared in your Notebook!")

No Errors! Both functions are now declared in your Notebook!


In [11]:
import pandas as pd 
#Write your hypotheses inside this string: H0 after "H0: " and Ha after "Ha: ".
#Keep each one on its own labelled line; the reporting cells further down split this string on newlines to pull out Ha for the printed conclusion.
hypo='''
Put your hypothesis here...
H0: 
Ha: 
'''

#Load the Titanic passenger data from the CSV at this URL into a DataFrame (needs network access)
df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

In [3]:
#View the Data (make more cells if you need!)
#head() with no argument shows the first 5 rows, enough to see the column names and a few sample values
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
#Clean it (make more cells if you need!)



In [None]:
#Data Visualization (make more cells if you need!)




In [None]:
#If you are doing a probability problem run this block of code...

#REPLACE ALL NONES WITH YOUR VALUES
#probability_g1 / probability_g2: sample proportion (between 0 and 1) of each group
#sample_size_g1 / sample_size_g2: sample size of each group

probability_g1 = None
sample_size_g1 = None
probability_g2 = None
sample_size_g2 = None

'---------------- No Need to Change Anything Below ----------------'
alpha = 0.05 


pval = comparingProportion(probability_g1, sample_size_g1, probability_g2, sample_size_g2, alpha)

#Find the alternative hypothesis by its "Ha:" label so it is still found if lines are added to or removed from hypo.
#If no line carries the label, fall back to the 4th line of the template (the original positional lookup).
hypo_lines = hypo.split("\n")
ha = next((line for line in hypo_lines if line.lstrip().startswith("Ha:")), None)
if ha is None:
  ha = hypo_lines[3] if len(hypo_lines) > 3 else "Ha"


#Compare against alpha itself (not a second hard-coded 0.05) so the decision always matches the alpha quoted in the message
if (pval < alpha):
  print(f"The calculated p-value is ~{round(pval,10)}, which is less than our alpha value of {alpha}.")
  print("Thus, we reject H0.")
  print(f"There is statistically significant evidence to support {ha}")
else:
  print(f"The calculated p-value is ~{round(pval,10)}, which is greater than our alpha value of {alpha}.")
  print("Thus, we fail to reject H0.")
  print(f"There is not statistically significant evidence to support {ha}")

  

In [None]:
#If you are doing an averages problem run this block of code...

#REPLACE ALL NONES WITH YOUR VALUES
#average_g1 / average_g2: sample average value of each group
#standard_deviation_g1 / standard_deviation_g2: sample standard deviation of each group
#sample_size_g1 / sample_size_g2: sample size of each group

average_g1 = None
standard_deviation_g1 = None
sample_size_g1 = None
average_g2 = None
standard_deviation_g2 = None
sample_size_g2 = None

'---------------- No Need to Change Anything Below ----------------'
alpha = 0.05 


pval = comparingMean(average_g1, standard_deviation_g1, sample_size_g1, average_g2, standard_deviation_g2, sample_size_g2, alpha)

#Find the alternative hypothesis by its "Ha:" label so it is still found if lines are added to or removed from hypo.
#If no line carries the label, fall back to the 4th line of the template (the original positional lookup).
hypo_lines = hypo.split("\n")
ha = next((line for line in hypo_lines if line.lstrip().startswith("Ha:")), None)
if ha is None:
  ha = hypo_lines[3] if len(hypo_lines) > 3 else "Ha"


#Compare against alpha itself (not a second hard-coded 0.05) so the decision always matches the alpha quoted in the message
if (pval < alpha):
  print(f"The calculated p-value is ~{round(pval,10)}, which is less than our alpha value of {alpha}.")
  print("Thus, we reject H0.")
  print(f"There is statistically significant evidence to support {ha}")
else:
  print(f"The calculated p-value is ~{round(pval,10)}, which is greater than our alpha value of {alpha}.")
  print("Thus, we fail to reject H0.")
  print(f"There is not statistically significant evidence to support {ha}")
