### Basic Statistical Testing

In [18]:
# We use statistics in a lot of different ways in data science, and in this lecture, I want to refresh your
# knowledge of hypothesis testing, which is a core data analysis activity behind experimentation. The goal of
# hypothesis testing is to determine if, for instance, the two different conditions we have in an experiment
# have resulted in different impacts.

import pandas as pd
import numpy as np
import warnings
# Ignore every warning raised for the rest of the session.
# NOTE(review): no category or module filter is given, so this also hides deprecation
# warnings from the libraries used below; consider narrowing the filter.
warnings.filterwarnings(action= "ignore")

In [2]:
# SciPy is an interesting collection of libraries for data science and you'll use most or perhaps all of
# these libraries. The broader SciPy ecosystem includes numpy and pandas, but also plotting libraries such as
# matplotlib, and a number of scientific library functions as well
from scipy import stats

In [3]:
import os
# Switch to the directory that the relative data paths below are resolved against.
# The location can be overridden with the NOTEBOOK_WORKDIR environment variable so the
# notebook is not tied to one machine; when the variable is unset, the original
# hard-coded path is used unchanged.
os.chdir(os.environ.get("NOTEBOOK_WORKDIR", 'C://Users//my/Desktop//'))

In [4]:
# When we do hypothesis testing, we actually have two statements of interest: the first is our actual
# explanation, which we call the alternative hypothesis, and the second is that the explanation we have is not
# sufficient, and we call this the null hypothesis. Our actual testing method is to determine whether the
# evidence is strong enough to reject the null hypothesis. If we find that there is a statistically significant
# difference between groups, then we can reject the null hypothesis and we accept our alternative.

# Read the grades table from CSV (the path is relative to the current working directory)
# and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="corsera_dataset/grades.csv")
df.head(n=3)

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000


In [5]:
# Report the size of the grades table: row and column counts taken from df.shape.
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

There are 2315 rows and 13 columns


In [6]:
# Early finishers: rows whose first-assignment submission timestamp falls before 2016.
early_finishers = df.loc[pd.to_datetime(df['assignment1_submission']) < '2016']
early_finishers.head(n=5)

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000
5,D09000A0-827B-C0FF-3433-BF8FF286E15B,71.647278,2015-12-28 04:35:32.836000000,64.05255,2016-01-03 21:05:38.392000000,64.75255,2016-01-07 08:55:43.692000000,57.467295,2016-01-11 00:45:28.706000000,57.467295,2016-01-11 00:54:13.579000000,57.467295,2016-01-20 19:54:46.166000000
8,C9D51293-BD58-F113-4167-A7C0BAFCB6E5,66.595568,2015-12-25 02:29:28.415000000,52.916454,2015-12-31 01:42:30.046000000,48.344809,2016-01-05 23:34:02.180000000,47.444809,2016-01-02 07:48:42.517000000,37.955847,2016-01-03 21:27:04.266000000,37.955847,2016-01-19 15:24:31.060000000


In [7]:
# Late finishers: every row of df whose index label is not among the early finishers.
late_finishers = df.loc[~df.index.isin(early_finishers.index)]
late_finishers.head(n=5)

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
6,3217BE3F-E4B0-C3B6-9F64-462456819CE4,87.498744,2016-03-05 11:05:25.408000000,69.998995,2016-03-09 07:29:52.405000000,55.999196,2016-03-16 22:31:24.316000000,50.399276,2016-03-18 07:19:26.032000000,45.359349,2016-03-19 10:35:41.869000000,45.359349,2016-03-23 14:02:00.987000000
7,F1CB5AA1-B3DE-5460-FAFF-BE951FD38B5F,80.57609,2016-01-24 18:24:25.619000000,72.518481,2016-01-27 13:37:12.943000000,65.266633,2016-01-30 14:34:36.581000000,65.266633,2016-02-03 22:08:49.002000000,65.266633,2016-02-16 14:22:23.664000000,65.266633,2016-02-18 08:35:04.796000000
9,E2C617C2-4654-622C-AB50-1550C4BE42A0,59.270882,2016-03-06 12:06:26.185000000,59.270882,2016-03-13 02:07:25.289000000,53.343794,2016-03-17 07:30:09.241000000,53.343794,2016-03-20 21:45:56.229000000,42.675035,2016-03-27 15:55:04.414000000,38.407532,2016-03-30 20:33:13.554000000


In [12]:
# Show the mean first-assignment grade of each cohort (early first, then late).
display(*(cohort["assignment1_grade"].mean() for cohort in (early_finishers, late_finishers)))

74.94728457024303

74.0450648477065

In [13]:
# These look pretty similar. But, are they the same? What do we mean by similar? This is where the
# Student's t-test comes in. It allows us to form the alternative hypothesis ("These are different") as well
# as the null hypothesis ("These are the same") and then test that null hypothesis.

# When doing hypothesis testing, we have to choose a significance level as a threshold for how much of a
# chance we're willing to accept. This significance level is typically called alpha. 

# The SciPy library contains a number of different statistical tests and forms a basis for hypothesis testing
# in Python and we're going to use the ttest_ind() function which does an independent t-test (meaning the
# populations are not related to one another). The results of ttest_ind() are the t-statistic and a p-value.
# It's this latter value, the probability, which is most important to us, as it indicates the chance (between
# 0 and 1) of seeing a difference at least as large as the one we observed if the null hypothesis were true.

from scipy.stats import ttest_ind

# Independent two-sample t-test on assignment 1 grades: early vs. late finishers.
ttest_ind(
    a=early_finishers["assignment1_grade"],
    b=late_finishers["assignment1_grade"],
)

Ttest_indResult(statistic=1.3223540853721596, pvalue=0.18618101101713855)

In [None]:
# So here we see that the probability is 0.18, and this is above our alpha value of 0.05. This means that we
# cannot reject the null hypothesis. The null hypothesis was that the two populations are the same, and we
# don't have enough certainty in our evidence (because it is greater than alpha) to come to a conclusion to
# the contrary. This doesn't mean that we have proven the populations are the same.

In [14]:
# Repeat the early-vs-late independent t-test for assignments 2 through 6,
# printing one result per assignment in that order.
for grade_col in (
    'assignment2_grade',
    'assignment3_grade',
    'assignment4_grade',
    'assignment5_grade',
    'assignment6_grade',
):
    print(ttest_ind(early_finishers[grade_col], late_finishers[grade_col]))

Ttest_indResult(statistic=1.2514717608216366, pvalue=0.2108889627004424)
Ttest_indResult(statistic=1.6133726558705392, pvalue=0.10679998102227865)
Ttest_indResult(statistic=0.049671157386456125, pvalue=0.960388729789337)
Ttest_indResult(statistic=-0.05279315545404755, pvalue=0.9579012739746492)
Ttest_indResult(statistic=-0.11609743352612056, pvalue=0.9075854011989656)


In [20]:
# So it looks like in this data we do not have enough evidence to suggest the populations differ with
# respect to grade. Let's take a look at those p-values for a moment though, because they are saying things
# that can inform experimental design down the road. For instance, one of the assignments, assignment 3, has a
# p-value around 0.1. This means that if we accepted a level of chance similarity of 11% this would have been
# considered statistically significant. As a researcher, this would suggest to me that there is something here
# worth considering following up on. For instance, if we had a small number of participants (we don't) or if
# there was something unique about this assignment as it relates to our experiment (whatever it was) then
# there may be followup experiments we could run.

# Build a 100x100 frame of uniform random numbers in [0, 1).
# NumPy's global generator is seeded first so that a Restart & Run All reproduces the
# same draws, both here and in later cells that keep drawing from the same generator.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
df1 = pd.DataFrame([np.random.random(100) for _ in range(100)])
df1.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.562624,0.734762,0.450167,0.135036,0.381848,0.954843,0.586981,0.206077,0.030201,0.426666,...,0.352224,0.147727,0.201138,0.821812,0.181758,0.824589,0.005845,0.832335,0.577227,0.448926
1,0.869969,0.223512,0.937033,0.992627,0.995981,0.911422,0.157568,0.104131,0.672676,0.811101,...,0.254744,0.619088,0.496029,0.648789,0.240043,0.995848,0.784236,0.792829,0.242622,0.745433
2,0.486956,0.901989,0.222628,0.492279,0.826255,0.069888,0.182524,0.801027,0.324387,0.016809,...,0.195242,0.570475,0.491595,0.755397,0.628917,0.971251,0.70037,0.458263,0.734556,0.607327


In [21]:
# A second, independently drawn 100x100 frame of uniform random numbers in [0, 1).
df2 = pd.DataFrame(data=[np.random.random(size=100) for _ in range(100)])

In [22]:
# Are these two DataFrames the same? Maybe a better question is, for a given column inside of df1, is it the
# same as the matching column inside df2?

# Let's say our critical value is 0.1, or an alpha of 10%. And we're going to compare each
# column in df1 to the same numbered column in df2. And we'll report when the p-value is less than 10%,
# which means that we have sufficient evidence to say that the columns are different.

# Let's write this in a function called test_columns
def test_columns(alpha=0.1):
    # want to keep track of how many differ
    num_diff=0
    # And now we can just iterate over the columns
    for col in df1.columns:
        # we can run out ttest_ind between the two dataframes
        teststat,pval=ttest_ind(df1[col],df2[col])
        # and we check the pvalue versus the alpha
        if pval<=alpha:
            # And now we'll just print out if they are different and increment the num_diff
            print("Col {} is statistically significantly different at alpha={}, pval={}".format(col,alpha,pval))
            num_diff=num_diff+1
    # and let's print out some summary stats
    print("Total number different was {}, which is {}%".format(num_diff,float(num_diff)/len(df1.columns)*100))

# And now let's actually run this with the default alpha of 0.1
test_columns()

Col 2 is statistically significantly different at alpha=0.1, pval=0.05549564054250612
Col 4 is statistically significantly different at alpha=0.1, pval=0.008813129019133698
Col 9 is statistically significantly different at alpha=0.1, pval=0.0676729611395607
Col 16 is statistically significantly different at alpha=0.1, pval=0.007490353533992622
Col 25 is statistically significantly different at alpha=0.1, pval=0.05777106264660425
Col 34 is statistically significantly different at alpha=0.1, pval=0.01817640838103316
Col 35 is statistically significantly different at alpha=0.1, pval=0.08896141618005393
Col 39 is statistically significantly different at alpha=0.1, pval=0.09247999146587359
Col 51 is statistically significantly different at alpha=0.1, pval=0.021704932374747684
Col 54 is statistically significantly different at alpha=0.1, pval=0.03165142605777532
Col 74 is statistically significantly different at alpha=0.1, pval=0.026823492360532688
Col 76 is statistically significantly diffe

In [23]:
# Interesting, so we see that there are a bunch of columns that are different! In fact, that number looks a
# lot like the alpha value we chose. So what's going on - shouldn't all of the columns be the same? Remember
# that all the ttest does is check if two sets are similar given some level of confidence, in our case, 10%.
# The more random comparisons you do, the more will just happen to look different by chance. In this example,
# we checked 100 columns, so we would expect there to be roughly 10 of them if our alpha was 0.1.

# Re-run the column comparison with a stricter significance level of 0.05.
test_columns(0.05)

Col 4 is statistically significantly different at alpha=0.05, pval=0.008813129019133698
Col 16 is statistically significantly different at alpha=0.05, pval=0.007490353533992622
Col 34 is statistically significantly different at alpha=0.05, pval=0.01817640838103316
Col 51 is statistically significantly different at alpha=0.05, pval=0.021704932374747684
Col 54 is statistically significantly different at alpha=0.05, pval=0.03165142605777532
Col 74 is statistically significantly different at alpha=0.05, pval=0.026823492360532688
Col 76 is statistically significantly different at alpha=0.05, pval=0.001647287036590556
Col 83 is statistically significantly different at alpha=0.05, pval=0.021002084165629344
Col 88 is statistically significantly different at alpha=0.05, pval=0.029265140169607066
Total number different was 9, which is 9.0%


In [24]:
# So keep this in mind when you are doing statistical tests like the t-test which has a p-value. Understand
# that this p-value isn't magic, and that alpha is a threshold for you when reporting results and trying to
# answer your hypothesis. What's a reasonable threshold? Depends on your question, and you need to engage
# domain experts to better understand what they would consider significant.

# Just for fun, let's recreate that second dataframe using a different distribution; I'll arbitrarily
# choose chi squared with one degree of freedom. df2 is re-assigned here because test_columns()
# compares against the notebook-level df2.
df2 = pd.DataFrame(data=[np.random.chisquare(df=1, size=100) for _ in range(100)])
test_columns()

Col 0 is statistically significantly different at alpha=0.1, pval=0.0027582381232711646
Col 1 is statistically significantly different at alpha=0.1, pval=0.00028734516491424865
Col 2 is statistically significantly different at alpha=0.1, pval=0.0002382416238918026
Col 3 is statistically significantly different at alpha=0.1, pval=3.8110506692449035e-07
Col 4 is statistically significantly different at alpha=0.1, pval=0.0017357181677036503
Col 5 is statistically significantly different at alpha=0.1, pval=0.0002742129892948521
Col 6 is statistically significantly different at alpha=0.1, pval=7.143580973940678e-05
Col 7 is statistically significantly different at alpha=0.1, pval=4.9014379577468675e-05
Col 8 is statistically significantly different at alpha=0.1, pval=0.0003690203180085159
Col 9 is statistically significantly different at alpha=0.1, pval=0.00021104726884047423
Col 10 is statistically significantly different at alpha=0.1, pval=0.0002999023438875441
Col 11 is statistically sig