In [1]:
import numpy as np
import pandas as pd

In [2]:
# load the reference and schizophrenia tables; the first CSV line is the header
ref, sch = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        'membership-main/reference.csv',
        'membership-main/schizophrenia.csv',
    )
)

In [3]:
# load the four sample tables (sample2 .. sample5); the first CSV line is the header
sample2, sample3, sample4, sample5 = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        'membership-main/sample2.csv',
        'membership-main/sample3.csv',
        'membership-main/sample4.csv',
        'membership-main/sample5.csv',
    )
)

In [4]:
# build the SNP -> minor-allele lookup from the reference table
# (if a SNP id appears more than once, the last row wins)
# NOTE(review): column A2 is taken to be the minor allele here; PLINK-style
# frequency tables list the minor allele under A1 -- confirm against reference.csv.
snp, a2 = (list(ref[column]) for column in ('SNP', 'A2'))
minorAllele = dict(zip(snp, a2))

In [5]:
# per-genotype minor allele frequency, used row-wise on the sample tables
def computeMAF(row):
    """Return the minor-allele frequency of one genotype row: 1, 0.5 or 0.

    row must provide 'SNP', 'a1' and 'a2'. The minor allele for the SNP is
    looked up in the module-level minorAllele mapping, so a SNP missing from
    that mapping raises KeyError.
    """
    minor = minorAllele[row['SNP']]
    first_is_minor = row['a1'] == minor
    second_is_minor = row['a2'] == minor
    if first_is_minor and second_is_minor:
        # both alleles are the minor allele
        return 1
    if first_is_minor or second_is_minor:
        # exactly one allele is the minor allele
        return 0.5
    # neither allele is the minor allele
    return 0

In [6]:
# add a per-row 'MAF' column to each of the 4 samples using computeMAF
for frame in (sample2, sample3, sample4, sample5):
    frame['MAF'] = frame.apply(computeMAF, axis=1)

In [7]:
# spot-check the sample2 rows for a single SNP, including the new MAF column
sample2.loc[sample2['SNP'] == 'rs1000079']

Unnamed: 0,SNP,a1,a2,MAF
4,rs1000079,T,C,0.5


In [8]:
# SNP -> MAF lookup for the reference dataset
# (if a SNP id appears more than once, the last row wins)
snp, maf = (list(ref[column]) for column in ('SNP', 'MAF'))
mafRef = dict(zip(snp, maf))

# SNP -> MAF lookup for the schizophrenia dataset, built the same way
snp, maf = (list(sch[column]) for column in ('SNP', 'MAF'))
mafSch = dict(zip(snp, maf))

In [9]:
# per-SNP distance difference, used row-wise on the sample tables
def computeDelta(row):
    """Return |Y - P| - |Y - M| for one sample row.

    Y is the row's 'MAF' value, P the SNP's entry in mafRef and M its entry
    in mafSch. The result is positive when Y is closer to the schizophrenia
    MAF than to the reference MAF, and negative when it is closer to the
    reference MAF. A SNP missing from either mapping raises KeyError.
    """
    marker = row['SNP']
    observed = row['MAF']
    dist_to_reference = abs(observed - mafRef[marker])
    dist_to_schizophrenia = abs(observed - mafSch[marker])
    return dist_to_reference - dist_to_schizophrenia

In [10]:
# add a per-row 'DELTA' column to each of the 4 samples using computeDelta
for frame in (sample2, sample3, sample4, sample5):
    frame['DELTA'] = frame.apply(computeDelta, axis=1)

In [11]:
# compute the T statistics for the 4 samples
def computeT(delta, ddof=0):
    """Return (mean, std, n, T) for a series of per-SNP deltas.

    T = mean / (std / sqrt(n)).

    ddof is forwarded to np.std. The default of 0 matches what np.std uses
    when ddof is not given (population standard deviation), so the results
    are the same as calling np.std(delta) directly; pass ddof=1 to use the
    sample standard deviation instead.
    """
    mean = np.mean(delta)
    std = np.std(delta, ddof=ddof)
    n = len(delta)
    return mean, std, n, mean / (std / np.sqrt(n))


# one call per sample instead of four copies of the same arithmetic
mean2, std2, len2, T2 = computeT(sample2['DELTA'])
mean3, std3, len3, T3 = computeT(sample3['DELTA'])
mean4, std4, len4, T4 = computeT(sample4['DELTA'])
mean5, std5, len5, T5 = computeT(sample5['DELTA'])

In [12]:
# display the T statistic computed for sample2
T2

-11.455320407537465

In [13]:
# display the T statistic computed for sample3
T3

-1.7792502890506725

In [14]:
# display the T statistic computed for sample4
T4

-2.264107569068995

In [15]:
# display the T statistic computed for sample5
# NOTE(review): the recorded output for this cell matches T4's to every digit --
# confirm sample4.csv and sample5.csv really differ, and re-run from a fresh kernel.
T5

-2.264107569068995