In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [10]:
# Load the example CSV hosted under ucla.edu into a DataFrame (needs network access).
# NOTE(review): this host may no longer serve the file — confirm the URL still resolves.
dataUrl = "http://www.ats.ucla.edu/stat/data/binary.csv"
df = pd.read_csv(dataUrl)

In [11]:
# Relabel the four columns by position; the last one is called 'prestige' from here on.
columnNames = ['admit', 'gre', 'gpa', 'prestige']
df.columns = columnNames

In [12]:
# pd.get_dummies converts the categorical 'prestige' variable into a series of
# dummy (indicator) columns, one per level: prestige_1 ... prestige_4.
# Cast to int so the indicators are numeric 0/1 on every pandas version: newer
# releases return bool columns, which would turn the mixed int/float/bool
# design matrix into an object array when it is handed to statsmodels.
dummy_ranks = pd.get_dummies(df['prestige'], prefix='prestige').astype(int)
dummy_ranks.head()

Unnamed: 0,prestige_1,prestige_2,prestige_3,prestige_4
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1


In [13]:
# Keep the outcome and the two continuous predictors, then append the prestige
# indicators from 'prestige_2' onward; 'prestige_1' is dropped and so acts as
# the baseline level.
# .loc replaces the removed .ix indexer: same label-based, end-inclusive slice.
data = df[['admit','gre','gpa']].join(dummy_ranks.loc[:, 'prestige_2':])

In [14]:
# Predictor columns: every column of `data` except the first one ('admit').
train_cols = data.iloc[:, 1:].columns
train_cols

Index(['gre', 'gpa', 'prestige_2', 'prestige_3', 'prestige_4'], dtype='object')

In [15]:
# How many conditions there are to assign, and their integer codes
# 0 .. numConditions - 1.
numConditions = 5
conditionCodes = [code for code in range(numConditions)]
conditionCodes

[0, 1, 2, 3, 4]

In [16]:
# Draw at random the condition code that starts the rotation.
# A dedicated, seeded generator makes the draw repeatable across kernel
# restarts without touching numpy's global random state; change
# assignmentSeed to obtain a different starting condition.
assignmentSeed = 2016
startCode = int(np.random.RandomState(assignmentSeed).choice(conditionCodes))
startCode

0

In [17]:
# Rotate the codes so the sequence begins at startCode and wraps around to the
# codes that precede it.
codesBeforeStart = conditionCodes[:startCode]
codesFromStart = conditionCodes[startCode:]
assignList = codesFromStart + codesBeforeStart

In [18]:
# Stretch the rotated codes to one entry per row of `data`: as many complete
# cycles as fit, plus a partial cycle for any leftover rows.
# The cycle is re-derived from the first numConditions entries so that
# re-running this cell yields the same list instead of compounding its length.
cyclePattern = assignList[:numConditions]
fullCycles, leftoverRows = divmod(len(data), numConditions)
assignList = (fullCycles * cyclePattern) + cyclePattern[:leftoverRows]

In [19]:
# Sanity check: show how many assignments were generated; this should equal
# the number of rows in `data`, since one code is produced per row.
len(assignList)

400

In [20]:
# Attach the assigned condition code to each row as a new 'condition' column.
conditionColumn = pd.Series(assignList, index=data.index)
data['condition'] = conditionColumn

In [21]:
# Next step: a multinomial logit of condition on gre, gpa, prestige_2,
# prestige_3 and prestige_4. First, preview the first five rows being modelled.
data.head(5)

Unnamed: 0,admit,gre,gpa,prestige_2,prestige_3,prestige_4,condition
0,0,380,3.61,0,1,0,0
1,1,660,3.67,0,1,0,1
2,1,800,4.0,0,0,0,2
3,1,640,3.19,0,0,1,3
4,0,520,2.93,0,0,1,4


In [22]:
# Multinomial Logit with statsmodels -- use when there are more than 2 conditions; works for logit, too :)
mlogit = sm.MNLogit(data['condition'],data[train_cols])
mresult = mlogit.fit()
msummary = mresult.summary()
msummary

Optimization terminated successfully.
         Current function value: 1.602087
         Iterations 5


0,1,2,3
Dep. Variable:,condition,No. Observations:,400.0
Model:,MNLogit,Df Residuals:,380.0
Method:,MLE,Df Model:,16.0
Date:,"Thu, 07 Apr 2016",Pseudo R-squ.:,0.004568
Time:,15:09:32,Log-Likelihood:,-640.83
converged:,True,LL-Null:,-643.78
,,LLR p-value:,0.9893

condition=1,coef,std err,z,P>|z|,[95.0% Conf. Int.]
gre,-0.0008,0.001,-0.562,0.574,-0.004 0.002
gpa,0.2379,0.283,0.840,0.401,-0.317 0.793
prestige_2,-0.2997,0.474,-0.632,0.527,-1.229 0.630
prestige_3,-0.5768,0.499,-1.156,0.248,-1.555 0.402
prestige_4,-0.3096,0.556,-0.556,0.578,-1.400 0.781
condition=2,coef,std err,z,P>|z|,[95.0% Conf. Int.]
gre,-0.0018,0.001,-1.258,0.208,-0.005 0.001
gpa,0.4067,0.281,1.445,0.148,-0.145 0.958
prestige_2,-0.2939,0.479,-0.614,0.539,-1.233 0.645
prestige_3,-0.5715,0.502,-1.137,0.255,-1.556 0.413


In [23]:
# Pull the table of p-values out of the fitted model: one row per predictor,
# one column per non-reference condition.
pVals = mresult.pvalues
pVals
# Columns are labeled by position, which is confusing -- column k corresponds to condition (k + 1)

Unnamed: 0,0,1,2,3
gre,0.574442,0.208422,0.464278,0.238696
gpa,0.40102,0.148429,0.325172,0.139179
prestige_2,0.527267,0.539481,0.592151,0.293933
prestige_3,0.247876,0.255359,0.253663,0.339602
prestige_4,0.57796,0.880537,0.590323,0.404525


In [24]:
# Build the tidy p-value table from the fitted result every time, so this cell
# can be re-run safely (re-labelling an already reshaped frame would fail on a
# column-count mismatch) and the model's own frame is never modified.
pVals = mresult.pvalues.copy()
# Columns arrive labelled 0..3; relabel them with the condition each column
# describes (the first condition code has no column of its own).
pVals.columns = conditionCodes[1:]
# Move the predictor names out of the index into a regular 'variable' column.
pVals = pVals.reset_index().rename(columns={'index': 'variable'})

In [25]:
pVals

Unnamed: 0,variable,1,2,3,4
0,gre,0.574442,0.208422,0.464278,0.238696
1,gpa,0.40102,0.148429,0.325172,0.139179
2,prestige_2,0.527267,0.539481,0.592151,0.293933
3,prestige_3,0.247876,0.255359,0.253663,0.339602
4,prestige_4,0.57796,0.880537,0.590323,0.404525


In [26]:
# P-value threshold used further on.
# NOTE(review): how it is applied is not visible from this cell — confirm
# against the cells that follow.
minPval = 0.25