# Set up notebook
### Import required packages

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

### Import our created package(s) from their .py file(s)

In [2]:
from fieldExAnalysis import fieldExAnalysis

In [3]:
# Explicit %cd magic: a bare `cd` only works while IPython automagic is enabled
# and no variable named `cd` exists in the namespace.
# NOTE(review): presumably this switches to the folder holding
# genericRandomizationCode.py so the import in the next cell resolves; the
# relative path assumes the kernel started in the notebook's own directory.
%cd ../../../Misc/Grace/2016/randomizeCodeTranslate

/t/Misc/Grace/2016/randomizeCodeTranslate


In [4]:
from genericRandomizationCode import randomization

In [5]:
# Explicit %cd magic: a bare `cd` only works while IPython automagic is enabled
# and no variable named `cd` exists in the namespace.
# NOTE(review): presumably this returns to the analysis repository the
# notebook lives in — confirm the relative path against the checkout layout.
%cd ../../../../2016/experiments/analyzeCodeTranslate.git

/t/2016/experiments/analyzeCodeTranslate.git


## Read in and clean up sample data

In [6]:
# Pull the sample data set straight from its hosted CSV into a DataFrame.
# NOTE(review): plain-http URL on a third-party host — confirm it still resolves.
df = pd.read_csv(filepath_or_buffer="http://www.ats.ucla.edu/stat/data/binary.csv")

#### Preview and explore your data

In [7]:
df.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [8]:
df.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


#### Rename column "rank" to "prestige" because "rank" is also the name of a pandas DataFrame method, so attribute access (`df.rank`) would return the method instead of the column

In [9]:
# Rename by label instead of overwriting every column name by position, so a
# change in the CSV's column order cannot silently mislabel the data.
# "rank" is replaced because it is also the name of a DataFrame method.
# Labels missing from the frame are ignored by rename, so re-running this
# cell after the column has already been renamed is a harmless no-op.
df = df.rename(columns={"rank": "prestige"})

#### Create dummy variables where needed

In [10]:
# One indicator column per prestige level, named prestige_<level>.
# dtype=float pins the 0.0/1.0 encoding regardless of pandas version: newer
# releases default to bool, and a mixed bool/float design matrix can be
# rejected by statsmodels when it is converted to a NumPy array.
dummy_ranks = pd.get_dummies(df['prestige'], prefix='prestige', dtype=float)
dummy_ranks.head()

Unnamed: 0,prestige_1,prestige_2,prestige_3,prestige_4
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0


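#### Create a new data frame 'data' with admit, gre, gpa, and three of the four prestige dummy vars. prestige_1 is left out as the baseline category: including all four dummies alongside the intercept column would make the predictors perfectly collinear. We will perform our analysis on this data frame.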

In [11]:
# Keep the outcome plus the two continuous predictors, then append the dummy
# columns from prestige_2 onward (prestige_1 is not carried over).
# .loc performs the label-based column slice; the .ix indexer originally used
# here has been removed from pandas.
data = df[['admit', 'gre', 'gpa']].join(dummy_ranks.loc[:, 'prestige_2':])
data.head()

Unnamed: 0,admit,gre,gpa,prestige_2,prestige_3,prestige_4
0,0,380,3.61,0.0,1.0,0.0
1,1,660,3.67,0.0,1.0,0.0
2,1,800,4.0,0.0,0.0,0.0
3,1,640,3.19,0.0,0.0,1.0
4,0,520,2.93,0.0,0.0,1.0


#### You must add an intercept column equal to 1.0 when using statsmodels Logit

In [12]:
# Append a constant column of 1.0 named 'intercept' as the last column.
data = data.assign(intercept=1.0)

#### Set up your training columns

In [13]:
# Every column except the outcome is a predictor. Dropping 'admit' by label
# (rather than slicing off position 0) stays correct even if the column order
# of `data` changes.
train_cols = data.columns.drop('admit')

### Run a logistic regression using statsmodels
#### This runs a logistic regression of y ('admit') on x (training columns) and saves the output, a python object, to the variable 'logit'. The variable 'logit' by itself won't make much sense to a human, so we'll do some additional fitting below.

In [14]:
# Build the (not yet fitted) logit model: 'admit' is the outcome and the
# training columns are the regressors.
logit = sm.Logit(endog=data['admit'], exog=data[train_cols])

#### Fit the model and save it to a new variable called 'result'

In [15]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6


#### View your results via .summary()

In [16]:
result.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,Logit,Df Residuals:,394.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 02 Aug 2016",Pseudo R-squ.:,0.08292
Time:,17:56:59,Log-Likelihood:,-229.26
converged:,True,LL-Null:,-249.99
,,LLR p-value:,7.578e-08

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
gre,0.0023,0.001,2.070,0.038,0.000 0.004
gpa,0.8040,0.332,2.423,0.015,0.154 1.454
prestige_2,-0.6754,0.316,-2.134,0.033,-1.296 -0.055
prestige_3,-1.3402,0.345,-3.881,0.000,-2.017 -0.663
prestige_4,-1.5515,0.418,-3.713,0.000,-2.370 -0.733
intercept,-3.9900,1.140,-3.500,0.000,-6.224 -1.756


#### Find the odds ratios, which tell you how a 1-unit increase in a variable changes the odds of being admitted, holding the other variables fixed. For example, attending a school of prestige 2 multiplies the odds of admission by about 0.51, i.e. roughly half the odds at the baseline prestige of 1.

In [17]:
np.exp(result.params)

gre           1.002267
gpa           2.234545
prestige_2    0.508931
prestige_3    0.261792
prestige_4    0.211938
intercept     0.018500
dtype: float64

#### Return a data frame showing the confidence interval around each coefficient in our model

In [18]:
result.conf_int() 

Unnamed: 0,0,1
gre,0.00012,0.004409
gpa,0.153684,1.454391
prestige_2,-1.295751,-0.055135
prestige_3,-2.016992,-0.663416
prestige_4,-2.370399,-0.732529
intercept,-6.224242,-1.755716


# Test Using fieldExAnalysis function
### Reminder during testing: When you update fieldExAnalysis.py, you must restart the kernel and re-import the package for the changes to take effect in jupyter notebook.

#### Arguments taken by fieldExAnalysis: universeDf, dv, condition; Optional arguments: control, numConditions, numOutputs, covariates

In [19]:
test = fieldExAnalysis(data, 'admit',condition='prestige_2')

In [20]:
test.condition

'prestige_2'

In [21]:
test.numConditions

2

#### Check that the number of outputs found using fieldExAnalysis matches what we pull from the data frame here

In [22]:
# Count the distinct outcome values in the frame and compare that with the
# number of outputs reported by the fieldExAnalysis object.
len(data['admit'].unique()) == test.numOutputs

True

### Let's experiment with data frame df, pretending prestige was the condition variable!

In [23]:
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [24]:
test = fieldExAnalysis(df,'admit',condition='prestige', control = 1)

In [25]:
test.condition

'prestige'

In [26]:
test.control

1

### Identify the treatment conditions

In [27]:
test.reformatConditions()

Treatvar: 2, n: 1, treatvars list: [2, 3, 4]
Treatvar: 3, n: 2, treatvars list: [2, 3, 4]
Treatvar: 4, n: 3, treatvars list: [2, 3, 4]


In [30]:
test.universeDf.head()

Unnamed: 0,admit,gre,gpa,prestige,treat_1,treat_2,treat_3
0,0,380,3.61,3,,,
1,1,660,3.67,3,,,
2,1,800,4.0,1,,,
3,1,640,3.19,4,,,
4,0,520,2.93,4,,,


In [32]:
new = test.universeDf

In [33]:
new.head()

Unnamed: 0,admit,gre,gpa,prestige,treat_1,treat_2,treat_3
0,0,380,3.61,3,,,
1,1,660,3.67,3,,,
2,1,800,4.0,1,,,
3,1,640,3.19,4,,,
4,0,520,2.93,4,,,


### TODO: Write code that replaces values of treat_n based on prestige