In this lecture we're going to review some of the basics of statistical testing in Python. We're going to talk about hypothesis testing, statistical significance, and using SciPy to run Student's t-tests.

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
# SciPy is an interesting collection of libraries for data science. The wider SciPy ecosystem includes numpy and pandas,
# as well as plotting libraries such as matplotlib and a number of scientific library functions.


In [2]:
# When we do hypothesis testing, we actually have two statements of interest: the first is our actual explanation, which we call the alternative hypothesis,
# and the second is that the explanation we have is not sufficient, and we call this the null hypothesis. Our actual testing method is to determine whether
# the null hypothesis is true or not. If we find that there is a difference between groups, then we can reject the null hypothesis and we accept our alternative.

# example
# Load the grades file from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./resources/grades.csv')
df.head(n=5)

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000


In [3]:
# There are six different assignments. Let's look at some summary statistics for the dataframe.
# df.shape is a (rows, columns) tuple, so it can be unpacked straight into the format call.
print("there are {} rows and {} columns".format(*df.shape))


there are 2315 rows and 13 columns


In [4]:
# Say those who finish the first assignment by the end of December 2015 are called early finishers, and the ones who finish
# after that are called late finishers.

# Keep only the rows whose first-assignment submission timestamp falls before the start of 2016.
early_finishers = df.loc[pd.to_datetime(df['assignment1_submission']) < pd.Timestamp('2016')]
early_finishers.head(n=5)

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000
5,D09000A0-827B-C0FF-3433-BF8FF286E15B,71.647278,2015-12-28 04:35:32.836000000,64.05255,2016-01-03 21:05:38.392000000,64.75255,2016-01-07 08:55:43.692000000,57.467295,2016-01-11 00:45:28.706000000,57.467295,2016-01-11 00:54:13.579000000,57.467295,2016-01-20 19:54:46.166000000
8,C9D51293-BD58-F113-4167-A7C0BAFCB6E5,66.595568,2015-12-25 02:29:28.415000000,52.916454,2015-12-31 01:42:30.046000000,48.344809,2016-01-05 23:34:02.180000000,47.444809,2016-01-02 07:48:42.517000000,37.955847,2016-01-03 21:27:04.266000000,37.955847,2016-01-19 15:24:31.060000000


In [5]:
# Every row whose index label is not among the early finishers is a late finisher.
late_finishers = df.loc[~df.index.isin(early_finishers.index)]
late_finishers.head(n=5)

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
6,3217BE3F-E4B0-C3B6-9F64-462456819CE4,87.498744,2016-03-05 11:05:25.408000000,69.998995,2016-03-09 07:29:52.405000000,55.999196,2016-03-16 22:31:24.316000000,50.399276,2016-03-18 07:19:26.032000000,45.359349,2016-03-19 10:35:41.869000000,45.359349,2016-03-23 14:02:00.987000000
7,F1CB5AA1-B3DE-5460-FAFF-BE951FD38B5F,80.57609,2016-01-24 18:24:25.619000000,72.518481,2016-01-27 13:37:12.943000000,65.266633,2016-01-30 14:34:36.581000000,65.266633,2016-02-03 22:08:49.002000000,65.266633,2016-02-16 14:22:23.664000000,65.266633,2016-02-18 08:35:04.796000000
9,E2C617C2-4654-622C-AB50-1550C4BE42A0,59.270882,2016-03-06 12:06:26.185000000,59.270882,2016-03-13 02:07:25.289000000,53.343794,2016-03-17 07:30:09.241000000,53.343794,2016-03-20 21:45:56.229000000,42.675035,2016-03-27 15:55:04.414000000,38.407532,2016-03-30 20:33:13.554000000


In [6]:
# Compare the mean assignment-1 grade of the two groups: early finishers first, late finishers second,
# one value per line.
print(
    early_finishers['assignment1_grade'].mean(),
    late_finishers['assignment1_grade'].mean(),
    sep="\n",
)

74.94728457024304
74.0450648477065


In [7]:
# These look pretty similar. But are they the same? What do we mean by similar? This is where the Student's t-test
# comes in. It allows us to form the alternative hypothesis ("These are different") as well as the null hypothesis ("These are the same")
# and then test the null hypothesis.

# When doing hypothesis testing, we have to choose a significance level as a threshold for how much of a chance we're willing to accept.
# It's called alpha. Common choices are 10% and 5%; we will use 5% (alpha = 0.05) below.

# The SciPy library contains a number of different statistical tests and forms a basis for hypothesis testing in Python, and we're going to use the
# ttest_ind() function, which does an independent t-test (meaning the populations are not related to one another). The results of ttest_ind() are the
# t-statistic and a p-value. It's the latter value, the probability, which is most important to us, as it indicates the chance (between 0 and 1) of seeing a difference at least this large if the null hypothesis were true.

# ttest_ind is imported by name; the cells below call it without the scipy.stats prefix.
from scipy.stats import ttest_ind
ttest_ind(a=early_finishers['assignment1_grade'], b=late_finishers['assignment1_grade'])

Ttest_indResult(statistic=1.3223540853721598, pvalue=0.18618101101713855)

In [9]:
# So here we see that the p-value is 0.18, and this is above our alpha value of 0.05. This means that we cannot reject the null hypothesis.
# The null was that the two populations are the same, and we don't have enough evidence (because p is greater than our alpha) to come to a conclusion to the contrary.
# This doesn't mean that we have proven the populations are the same. Let's run the same test for the remaining assignments.
# Repeat the independent t-test for assignments 2 through 6, printing one result per line.
for assignment_number in range(2, 7):
    grade_column = 'assignment{}_grade'.format(assignment_number)
    print(ttest_ind(early_finishers[grade_column], late_finishers[grade_column]))

Ttest_indResult(statistic=1.2514717608216366, pvalue=0.2108889627004424)
Ttest_indResult(statistic=1.6133726558705392, pvalue=0.10679998102227865)
Ttest_indResult(statistic=0.049671157386456125, pvalue=0.960388729789337)
Ttest_indResult(statistic=-0.05279315545404755, pvalue=0.9579012739746492)
Ttest_indResult(statistic=-0.11609743352612056, pvalue=0.9075854011989656)


In [10]:
# It looks like in this data we do not have evidence to support that the populations differ with respect to grade.
# Let's take a look at those p-values for a moment though, because they are saying things that can inform experimental
# design down the road. For instance, one of the assignments, assignment 3, has a p-value of around 0.1. This means that if we
# accepted a level of chance similarity of 11% this would have been considered statistically significant. As a researcher, this would suggest to me that there is
# something here worth considering following up on. For instance, if we had a small number of participants (we don't) or if there was something unique about this
# assignment as it relates to our experiment (whatever it was) then there may be followup experiments we could run.

# P-values have come under fire recently for being insufficient for telling us enough about the interactions which are happening, and two other
# techniques, confidence intervals and Bayesian analyses, are being used more regularly. One issue with p-values is that as you run more tests you are likely to get a
# value which is statistically significant just by chance.

# Let's see a simulation of this. Create a dataframe of uniform random numbers with 100 rows and 100 columns.
# Seed NumPy's global generator first so that Restart & Run All draws the same numbers
# (and therefore flags the same "significant" columns) on every run.
np.random.seed(0)
df1 = pd.DataFrame(np.random.random((100, 100)))
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.239796,0.243331,0.110652,0.040915,0.284886,0.031647,0.256684,0.187661,0.583259,0.76495,...,0.467105,0.469841,0.224793,0.397638,0.671529,0.227507,0.348802,0.575807,0.413006,0.709163
1,0.852414,0.187939,0.965049,0.185127,0.332596,0.358507,0.062911,0.591005,0.79845,0.695182,...,0.523758,0.035071,0.312644,0.201709,0.60631,0.266135,0.723597,0.313503,0.253035,0.833882
2,0.395766,0.213253,0.952933,0.329324,0.253352,0.538309,0.322298,0.261677,0.207521,0.415205,...,0.199429,0.285207,0.422392,0.587769,0.253352,0.471243,0.470088,0.716622,0.409354,0.102103
3,0.437664,0.409195,0.225995,0.29268,0.198569,0.114085,0.011826,0.683696,0.464844,0.833155,...,0.316199,0.237281,0.930885,0.198848,0.669303,0.374191,0.981576,0.546622,0.037375,0.187434
4,0.37354,0.415846,0.418899,0.542513,0.089651,0.88713,0.038497,0.470738,0.651103,0.778454,...,0.588832,0.483323,0.002974,0.787416,0.68974,0.53701,0.785484,0.474681,0.130362,0.86099


In [12]:
# A second, separately drawn 100 x 100 frame (rows x columns) of uniform random numbers.
df2 = pd.DataFrame(np.random.random((100, 100)))
df2.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.105839,0.44301,0.788119,0.623642,0.236229,0.765969,0.42689,0.860723,0.53007,0.369578,...,0.825335,0.105044,0.96279,0.02701,0.445039,0.390611,0.980893,0.230512,0.366576,0.213643
1,0.453633,0.135311,0.281535,0.120041,0.988638,0.10392,0.274895,0.930805,0.477285,0.111537,...,0.935227,0.403721,0.121483,0.044943,0.041251,0.059546,0.722385,0.247276,0.775994,0.447443
2,0.837933,0.244804,0.59755,0.050351,0.178654,0.60226,0.929516,0.934651,0.400664,0.206452,...,0.641118,0.263996,0.469829,0.253385,0.62258,0.255316,0.582073,0.668983,0.995482,0.069024
3,0.81161,0.269976,0.815511,0.321712,0.464682,0.074282,0.590991,0.223275,0.261252,0.55192,...,0.791339,0.540824,0.830747,0.120638,0.835029,0.399354,0.695313,0.906416,0.246741,0.317056
4,0.365954,0.223305,0.302212,0.801185,0.388309,0.188862,0.280473,0.615711,0.999309,0.065219,...,0.359474,0.037581,0.507008,0.335783,0.826952,0.88937,0.94633,0.624637,0.716095,0.24136


In [13]:
# Are these two the same? 
# alpha will be 0.1

def test_columns(alpha=0.1):
    num_diff = 0
    for col in df1.columns:
        teststat,pval=ttest_ind(df1[col], df2[col])
        if pval <= alpha:
            print("Col {} is statistically significantly different at alpha={}, pval={}".format(col,alpha,pval))

test_columns()

Col 14 is statistically significantly different at alpha=0.1, pval=0.0037356939145681473
Col 19 is statistically significantly different at alpha=0.1, pval=0.09177172109841514
Col 32 is statistically significantly different at alpha=0.1, pval=0.07405237407088287
Col 36 is statistically significantly different at alpha=0.1, pval=0.06221664362039139
Col 42 is statistically significantly different at alpha=0.1, pval=0.06145353866379682
Col 69 is statistically significantly different at alpha=0.1, pval=0.031512183589475225
Col 71 is statistically significantly different at alpha=0.1, pval=0.045268883319577326
Col 76 is statistically significantly different at alpha=0.1, pval=0.05546639880275018


In [15]:
# A bunch of columns are reported as different even though they should all be the same. Remember that all the t-test does is check whether two sets
# are similar given some significance level, in our case 10%. The more random comparisons you do, the more will just happen to look different by chance.
# Exploiting this by running many tests and reporting only the significant ones is called p-hacking.

# Try another frame: rebind df2 to 100 x 100 draws from a chi-squared distribution with 1 degree of freedom,
# then rerun the column-by-column comparison.
df2 = pd.DataFrame(np.random.chisquare(df=1, size=(100, 100)))
test_columns()

Col 0 is statistically significantly different at alpha=0.1, pval=0.054318120396902446
Col 1 is statistically significantly different at alpha=0.1, pval=2.6252377638924654e-06
Col 2 is statistically significantly different at alpha=0.1, pval=3.6749095637728924e-06
Col 3 is statistically significantly different at alpha=0.1, pval=1.898784833034196e-05
Col 4 is statistically significantly different at alpha=0.1, pval=7.825141077855523e-05
Col 5 is statistically significantly different at alpha=0.1, pval=0.0026478179246082678
Col 6 is statistically significantly different at alpha=0.1, pval=1.5171607303331636e-05
Col 7 is statistically significantly different at alpha=0.1, pval=0.004151681454797037
Col 8 is statistically significantly different at alpha=0.1, pval=0.000134752249941399
Col 9 is statistically significantly different at alpha=0.1, pval=8.369821907983355e-07
Col 10 is statistically significantly different at alpha=0.1, pval=0.0006711146258998183
Col 11 is statistically signifi

In [None]:
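# All or most columns are statistically significant. This is expected: the chi-squared samples (mean 1) really do come from a different distribution than the uniform samples in df1 (mean 0.5).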