# Categorical data from pre-screen survey

In [9]:
%pylab inline
import pandas as pd
import os
from collections import Counter 
from scipy.stats import stats

Populating the interactive namespace from numpy and matplotlib


In [14]:
# Load the pre-screen survey responses and discard rows in which every column is empty.
data = pd.read_csv("Crowdclass_Data.csv").dropna(how="all")
def basic_stats_categorical(field):
    '''
    Print the percentage of rows falling in each category of one survey column.

    Reads the module-level DataFrame ``data``. Prints a separator line, the
    column name, and then one "<category> : <percent>  %" line per distinct
    value, with the category label left-justified to at least 5 characters.

    field : name of the column of ``data`` to summarise.
    '''
    print("---------------------")
    print(field)
    cnt = Counter(data[field])
    # float() keeps the division below a true division on Python 2 as well.
    total = float(sum(cnt.values()))
    for category, count in cnt.items():
        percent = count / total * 100
        # The label is passed as a format argument instead of being spliced into
        # a %-template, so labels that themselves contain "%" print correctly.
        print("{0} : {1:.2f}  %".format(str(category).ljust(5), percent))
        
# Categorical pre-screen questions to tabulate, in reporting order.
field_lst = [
    "Age",
    "Highest Level of education",
    "Area of Study/Professional Interest",
    "What is your experience with citizen science? (e.g. Zooniverse, Volunteer Computing, community sensing)",
    "Geographic Location",
]
# Print the category breakdown of each question in turn.
for field in field_lst:
    basic_stats_categorical(field)

---------------------
Age
18-21 : 4.30  %
30-39 : 31.18  %
60+   : 4.30  %
40-59 : 31.18  %
22-29 : 29.03  %
---------------------
Highest Level of education
Professional (MS, PhD, JD ..etc) : 26.88  %
Bachelor / Associate Degree : 46.24  %
High School or Equivalent : 9.68  %
Some college. : 17.20  %
---------------------
Area of Study/Professional Interest
STEM  : 46.24  %
Others : 53.76  %
---------------------
What is your experience with citizen science? (e.g. Zooniverse, Volunteer Computing, community sensing)
Yes   : 17.20  %
No    : 82.80  %
---------------------
Geographic Location
Other : 31.18  %
India : 18.28  %
USA   : 50.54  %


### Fisher's Exact Test: making sure Group A and Group B demographics are similar

http://yatani.jp/teaching/doku.php?id=hcistats:chisquare

In [15]:
# Split the respondents into the two experimental groups by their "Group" label.
users = data
A_summary_stats, B_summary_stats = (
    users[users["Group"] == group_label] for group_label in ("A", "B")
)

In [16]:
def Fisher_Test(field,abbrev = "",output="p-value"):
    '''
    Run Fisher's exact test in R on the Group A vs. Group B counts of one column.

    Builds a 2 x k contingency table (row 1 = counts in ``A_summary_stats``,
    row 2 = counts in ``B_summary_stats``, one column per category), writes an
    R script to ``Fisher.r`` in the working directory, runs it with
    ``r -f Fisher.r`` and captures R's standard output in ``Fisher_<abbrev>.out``.

    field  : column name present in both group DataFrames.
    abbrev : short tag used in the name of the output file.
    output : "p-value" prints one summary line per "p-value" line found in the
             R output, labelled via ``pcheck``; "full" prints every non-blank
             line of the R output after the header. Any other value prints
             nothing.

    Returns nothing; results are printed.
    '''
    # Imported locally because the notebook's import cell does not import subprocess.
    import subprocess

    cntA = Counter(A_summary_stats[field])
    cntB = Counter(B_summary_stats[field])
    # Use the union of categories so both rows share the same columns in the
    # same order. A Counter returns 0 for a category the group never chose, so
    # no manual padding is needed and real counts are never overwritten.
    categories = sorted(set(cntA) | set(cntB))
    ncol = len(categories)
    counts = [cntA[c] for c in categories] + [cntB[c] for c in categories]

    with open("Fisher.r", "w") as script:
        script.write("data <- matrix(c("+ ','.join(str(n) for n in counts)+"), ncol={}, byrow=T) \n".format(ncol))
        script.write("# Contingency Table \n")
        script.write("fisher.test(data) \n")
        script.write("library(vcd) \n")
        script.write("assocstats(data) \n")

    out_name = "Fisher_{}.out".format(abbrev)
    with open(out_name, "w") as out:
        try:
            # Argument list instead of a shell string: abbrev never reaches a shell.
            # NOTE(review): the executable is spelled "r" as in the original
            # command; on case-sensitive systems R is usually installed as "R".
            status = subprocess.call(["r", "-f", "Fisher.r"], stdout=out)
        except OSError:
            # The executable could not be launched at all.
            status = None
    if status != 0:
        print("WARNING: R did not finish cleanly (status {0}); {1} may be incomplete".format(status, out_name))

    with open(out_name, 'r') as f:
        lines = f.readlines()

    if output=="full":
        # Skip the first 18 lines of the output, presumably R's startup banner.
        for l in lines[18:]:
            if l!='\n':
                print(l)
    elif output=="p-value":
        # Scan the whole file so the result does not depend on the header length.
        for l in lines:
            if l[:7]=='p-value':
                p = float(l.split()[-1])
                print("{0} : p ={1} ---> {2}".format(field,p,pcheck(p,"Independence")))

Null hypothesis: the distribution of outcomes is the same for the two groups, i.e. the variable is independent of group assignment.

If we look at the output with `cat Fisher_age.out`, we see that `assocstats` also prints the contingency coefficient, Pearson's coefficient, Cramer's V, etc., which measure the strength of association between the categorical variables. That information is a bit excessive, but it's there if we need it.

In [19]:
from  stats_helper import *

In [20]:
# (survey column, tag used in the Fisher_<tag>.out file name), in reporting order.
fisher_targets = [
    ("Age", "age"),
    ("Highest Level of education", "Edu"),
    ("Area of Study/Professional Interest", "Interest"),
    ("What is your experience with citizen science? (e.g. Zooniverse, Volunteer Computing, community sensing)", "CS_exp"),
    ("Geographic Location", "Geo"),
]
for field, abbrev in fisher_targets:
    Fisher_Test(field, abbrev=abbrev, output="p-value")

Age : p =0.4704 ---> Independence
Highest Level of education : p =0.2127 ---> Independence
Area of Study/Professional Interest : p =0.8364 ---> Independence
What is your experience with citizen science? (e.g. Zooniverse, Volunteer Computing, community sensing) : p =1.0 ---> Independence
Geographic Location : p =0.3882 ---> Independence


##### Fisher's Exact test finds no significant difference between Group A and Group B in any of these pre-screening survey variables (all p > 0.05), so we cannot reject the hypothesis that they are independent of group assignment.

# Quantitative data from pre-screen survey 

- 10-point Likert scale data from pre-screen survey

In [21]:
def kolmogorov_smirnov(data1,data2,name):
    '''
    Run a two-sample Kolmogorov-Smirnov test and print a one-line summary.

    data1, data2 : the two samples passed to ``stats.ks_2samp``.
    name         : label printed at the start of the line.

    The line shows the D statistic and the p-value, both rounded to 2 decimals,
    followed by the verdict ``pcheck`` gives for the unrounded p-value.
    Returns nothing.
    '''
    ks_result = stats.ks_2samp(data1,data2)
    d_rounded = np.around(ks_result[0],2)
    p_rounded = np.around(ks_result[1],2)
    # The verdict is based on the exact p-value, not the rounded one shown.
    verdict = pcheck(ks_result[1],"from same distribution")
    print("{0} : D = {1} ; p ={2} ---> {3}".format(name,d_rounded,p_rounded,verdict))

In [22]:
def pcheck(p,null_hyp,alpha=0.05):
    '''
    Label a p-value by whether the null hypothesis survives at level alpha.

    p        : p-value of the test.
    null_hyp : text describing the null hypothesis.
    alpha    : significance level (default 0.05).

    If p > alpha we fail to reject the null hypothesis and null_hyp is
    returned unchanged; otherwise (p <= alpha) the null hypothesis is rejected
    and "NOT " + null_hyp is returned.
    '''
    if p>alpha:
        return null_hyp
    return "NOT "+null_hyp

In [23]:
def basic_stats_quantitative(field,plot_hist = False):
    '''
    Compare one numeric survey column between Group A and Group B.

    Reads the column from the module-level ``A_summary_stats`` and
    ``B_summary_stats`` DataFrames, prints the Kolmogorov-Smirnov summary via
    ``kolmogorov_smirnov``, and then prints the mean and standard deviation of
    each group (``np.std`` with its default settings).

    field     : name of the column to compare.
    plot_hist : when True, also draw overlaid histograms of both groups.

    Returns nothing; results are printed.
    '''
    print("---------------------")
    Adata =  np.array(A_summary_stats[field])
    Bdata =  np.array(B_summary_stats[field])
    kolmogorov_smirnov(Adata,Bdata,field)
    if plot_hist:
        # Both groups share one value range (hence identical bin edges) and are
        # drawn semi-transparent, so the bars line up and B does not hide A.
        combined = np.concatenate([Adata, Bdata])
        shared_range = (combined.min(), combined.max())
        plt.figure()
        plt.title(field,fontsize=14)
        plt.hist(Adata,label="A",bins =10,range=shared_range,alpha=0.5)
        plt.hist(Bdata,label="B",bins =10,range=shared_range,alpha=0.5)
        plt.xlim(0,10)
        plt.legend(loc = "upper left")

    for group_label, values in (("A", Adata), ("B", Bdata)):
        print("For {0}: ".format(group_label))
        print("mean  =  " + str(np.mean(values)))
        print("std  =  " + str(np.std(values)))

# 10-point Likert-scale questions from the pre-screen survey.
qfield_lst = [
    "Level of knowledge in astronomy",
    "Level of interest in astronomy",
    "Level of interest in science",
]
# Compare Group A and Group B on each question (no histograms).
for field in qfield_lst:
    basic_stats_quantitative(field)

---------------------
Level of knowledge in astronomy : D = 0.16 ; p =0.59 ---> from same distribution
For A: 
mean  =  3.71739130435
std  =  2.40165743021
For B: 
mean  =  3.23404255319
std  =  2.10584562241
---------------------
Level of interest in astronomy : D = 0.12 ; p =0.85 ---> from same distribution
For A: 
mean  =  6.95652173913
std  =  2.73429524931
For B: 
mean  =  6.48936170213
std  =  2.7895880915
---------------------
Level of interest in science : D = 0.1 ; p =0.97 ---> from same distribution
For A: 
mean  =  8.39130434783
std  =  1.87057606646
For B: 
mean  =  8.04255319149
std  =  2.13339652087
