# Statistical Analysis for distributions of Alleles across Populations
## Author: Efthymios Tzinis 

In [112]:
# Read the data.
# Path of the Excel workbook that holds the aggregated allele counts.
agg_alleles_datapath = '../data/aggregated_alleles_new.xlsx'
# Prefix of the result files; the comparison step appends '_<polymorphism>.txt'.
result_filepath = '../results/aggregated_allels_comparisons'

import pandas as pd
from pprint import pprint
xls = pd.ExcelFile(agg_alleles_datapath)
# Parse the first sheet; `df` is re-assigned per sheet in the aggregation loop.
df = xls.parse(xls.sheet_names[0])
# Every sheet except the last is iterated as a polymorphism sheet.
# NOTE(review): presumably the last sheet is not a polymorphism table - confirm.
valid_sheets = xls.sheet_names[:-1]
print valid_sheets

[u'RS1801252', u'RS1801253', u'rs1042713', u'RS1042714', u'RS4994']


In [113]:
# Collect, for every polymorphism sheet, the allele rows across all populations.
alleles_agg_data = {}
for poly in valid_sheets:
    df = xls.parse(poly)

    # Row labels of the two allele rows; this assumes every sheet is aligned
    # so that the alleles sit at the same index in each of them.
    alleles_indexes = [6,7]
    alleles_labels = [df['GENOTYPES'].loc[row] for row in alleles_indexes]

    alleles_agg_data[poly] = {
        'data': df.loc[alleles_indexes],
        'labels': alleles_labels,
    }
print(alleles_agg_data)

{u'RS4994': {'labels': [u'T', u'C'], 'data':   GENOTYPES GREEKS AFRICANS AMERICANS EAST ASIANS SOUTH ASIANS EUROPEANS  CEU  \
6         T   1930     1197       611         876          824       924  180   
7         C    112      125        83         132          154        82   18   

   FIN  GBR  IBS  TSI GROUP1=CEU+FIN+GBR GROUP2=IBS+TSI+GREEKS GROUP3=IBS+TSI  \
6  182  157  201  204                519                  2335            405   
7   16   25   13   10                 59                   135             23   

    ALL  
6  4432  
7   576  }, u'rs1042713': {'labels': [u'G', u'A'], 'data':   GENOTYPES GREEKS AFRICANS AMERICANS EAST ASIANS SOUTH ASIANS EUROPEANS  CEU  \
6         G   1258      634       377         455          542       618  129   
7         A    784      688       317         553          436       388   69   

   FIN  GBR  IBS  TSI GROUP1=CEU+FIN+GBR GROUP2=IBS+TSI+GREEKS GROUP3=IBS+TSI  \
6  116  106  132  135                351                  1525 

In [114]:
import math
import scipy.stats as stats

def compute_odds_ratio_CI_95(data,odds_ratio):
    """Return the (lower, upper) bounds of an interval around an odds ratio.

    The half-width on the log scale is ``1.96 * sqrt(sum(1 / cell))`` taken
    over every cell of ``data``; the bounds are ``exp(ln(odds_ratio) -/+
    half-width)``.

    Args:
        data: array exposing ``flatten()`` (e.g. a numpy array) of cell counts.
        odds_ratio: the odds ratio the interval is built around.

    Returns:
        Tuple ``(lci, uci)`` of floats.

    Raises:
        ZeroDivisionError: if any cell of ``data`` is zero.
        ValueError: if ``odds_ratio`` is not positive (``math.log`` domain).
    """
    # Sum of the reciprocals of all cells, accumulated in flattened order.
    reciprocal_total = sum(1/float(cell) for cell in data.flatten())

    half_width = 1.96 * math.sqrt(reciprocal_total)
    log_or = math.log(odds_ratio)

    return math.exp(log_or - half_width), math.exp(log_or + half_width)
    
def compute_odds_ratio(data):
    """Return the odds ratio of a 2x2 table indexed as ``data[row][col]``.

    The ratio is ``(data[0][0] * data[1][1]) / (data[0][1] * data[1][0])``.
    When either off-diagonal cell equals zero the function returns ``0``
    instead of dividing.
    """
    if data[0][1] != 0 and data[1][0] != 0:
        diagonal_product = data[0][0] * data[1][1]
        off_diagonal_product = data[0][1] * data[1][0]
        return float(diagonal_product) / off_diagonal_product
    return 0

def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and its two-sided Student-t confidence interval.

    Args:
        data: sequence of numbers.
        confidence: confidence level of the interval (default 0.95).

    Returns:
        Tuple ``(mean, lower, upper)`` where ``lower``/``upper`` are
        ``mean -/+ sem * t_ppf((1 + confidence) / 2, n - 1)``.
    """
    # Imported locally because the names this function relied on (`np`,
    # `scipy`, `sp`) are not imported by the file, which made every call
    # fail with NameError.
    import numpy as np
    from scipy import stats as scipy_stats

    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy_stats.sem(a)
    # Public `ppf` instead of the private `_ppf`, which skips argument checks.
    h = se * scipy_stats.t.ppf((1+confidence)/2., n-1)
    return m, m-h, m+h

# from scipy.stats import chisquare
# obs = np.array([[44, 468], [23, 477]])
# obs = np.array([[180, 90], [60, 170]])
# obs = np.array([[668, 338], [1585, 679]]).T
# # obs = np.array([[1089, 3919], [679, 1585]])
# # obs = np.array([[977, 29], [2069, 483]])
# # obs = np.array([[4214, 794], [2069, 483]])
# # chisq, pval_chi = chisquare(obs,ddof=0)
# # print chisq, pval_chi
# # chisq, pval_chi= sum(chisq) , sum(pval_chi)
# # print chisq, pval_chi


# oddsratio, pval_f = stats.fisher_exact(obs)
# # print oddsratio, pval_f
# # print compute_odds_ratio(obs)
# lci, uci = compute_odds_ratio_CI_95(obs,oddsratio)
# print "{},{},{}-{}".format(pval_f,oddsratio,lci, uci)

In [117]:
def get_stats_from_matrix(data):
    """Format Fisher-exact statistics of a contingency table as one CSV fragment.

    Runs ``stats.fisher_exact`` on ``data``, derives the interval bounds with
    ``compute_odds_ratio_CI_95`` and returns the string
    ``"<p_value>,<odds_ratio>,<lower>-<upper>"``.
    """
    fisher_or, fisher_p = stats.fisher_exact(data)
    ci_low, ci_high = compute_odds_ratio_CI_95(data, fisher_or)
    report_fields = (fisher_p, fisher_or, ci_low, ci_high)
    return "{},{},{}-{}".format(*report_fields)

# For each polymorphism, run the same distribution comparisons between populations.
def compare_between_populations_for_a_polymorphism(data,labels,poly):
    """Write Fisher-exact statistics for a fixed list of population pairs.

    For every pair in the comparison list, the two named columns of ``data``
    are extracted as a matrix and passed to ``get_stats_from_matrix``. One
    line per pair (``group_1,group_2,<stats>``) is written to
    ``result_filepath + '_' + str(poly) + '.txt'`` and echoed to stdout.
    Pairs whose columns are missing from ``data`` are reported and skipped.

    Args:
        data: table indexable by a list of column names and exposing
            ``.values`` (a pandas DataFrame in this file).
        labels: allele labels for the polymorphism; not used here, kept so
            existing callers keep working.
        poly: polymorphism identifier, used in the file name and messages.
    """
    # All desired comparisons for producing stats
    comparison_list = [['GREEKS','AFRICANS'],
                    ['GREEKS','AMERICANS'],
                    ['GREEKS','EAST ASIANS'],
                    ['GREEKS','SOUTH ASIANS'],
                    ['GREEKS','EUROPEANS'],
                    ['GREEKS','GROUP1=CEU+FIN+GBR'],
                    ['GROUP1=CEU+FIN+GBR','GROUP2=IBS+TSI+GREEKS'],
                    ['GREEKS','GROUP3=IBS+TSI'],
                    ['GREEKS','ALL']]

    # `with` guarantees the file is closed even if a comparison raises.
    with open(result_filepath+'_'+str(poly)+'.txt','w') as fout:
        fout.write('Group_1,Group_2,p_value_fischer,odds_ratio,Confidence_Interval_95%\n')

        for comp_tuple in comparison_list:
            try:
                # `.values` replaces `as_matrix()`, which newer pandas removed.
                matrix = data[comp_tuple].values
            except KeyError:
                print("Polymorphism: {} does not contain comparison tuple {}".format(poly,comp_tuple))
                # Skip this pair: falling through would either raise NameError
                # or report the previous pair's matrix under these labels.
                continue

            # Named `pair_stats` so the `scipy.stats` alias is not shadowed.
            pair_stats = get_stats_from_matrix(matrix)
            result_str = str(comp_tuple[0])+','+comp_tuple[1]+','+pair_stats + "\n"
            fout.write(result_str)
            print(result_str)
        
            
# Run the population comparisons once per aggregated polymorphism.
for poly in alleles_agg_data:
    v = alleles_agg_data[poly]
    compare_between_populations_for_a_polymorphism(
        v['data'], v['labels'], poly)


GREEKS,AFRICANS,1.70698019353e-05,1.79951366512,1.38057773434-2.34557558795

GREEKS,AMERICANS,5.10101434857e-08,2.34086392331,1.73745677753-3.15383034464

GREEKS,EAST ASIANS,1.70870663268e-12,2.59662426614,1.9938258091-3.38166832266

GREEKS,SOUTH ASIANS,4.00235988981e-19,3.22057038835,2.49141871906-4.16311940941

GREEKS,EUROPEANS,0.00565238941075,1.52925943105,1.13805791829-2.05493443687

GREEKS,GROUP1=CEU+FIN+GBR,0.000118817171961,1.95895265621,1.40854191658-2.72444537439

GROUP1=CEU+FIN+GBR,GROUP2=IBS+TSI+GREEKS,6.38136917892e-05,0.508583457337,0.369183229509-0.700619942625

GREEKS,GROUP3=IBS+TSI,1.0,0.978615520282,0.616978503352-1.55222318336

GREEKS,ALL,6.96285732777e-16,2.23955647241,1.81653269931-2.76109160877

GREEKS,AFRICANS,7.871754922e-15,1.74126054207,1.51379669335-2.00290322254

GREEKS,AMERICANS,0.000778175506803,1.34921913062,1.13378939135-1.60558237387

GREEKS,EAST ASIANS,8.22696375524e-18,1.95019623234,1.67387904752-2.27212674074

GREEKS,SOUTH ASIANS,0.00132383590642,1.2