# Set up notebook
### Import required packages

In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

### Import our created package(s) from their .py file(s)

In [2]:
from fieldExAnalysis import fieldExAnalysis

## Read in and clean up sample data

In [3]:
# Load the UCLA graduate-admissions sample (admit, gre, gpa, rank).
# The old www.ats.ucla.edu host has been retired; the same file is served
# from the UCLA IDRE statistics site.
df = pd.read_csv("https://stats.idre.ucla.edu/stat/data/binary.csv")

#### Preview and explore your data

In [4]:
df.head()

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [5]:
df.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.7,3.3899,2.485
std,0.466087,115.516536,0.380567,0.94446
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


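#### Rename column "rank" to "prestige" because `rank` is also the name of a pandas DataFrame method (`df.rank`), so attribute-style access to the column (`df.rank`) would return the method instead of the data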

In [6]:
# Rename by label instead of overwriting every column name positionally, so a
# change in the CSV's column order or count cannot silently mislabel the data.
# The other columns (admit, gre, gpa) keep their names.
df = df.rename(columns={"rank": "prestige"})

#### Create dummy variables where needed

In [7]:
# One indicator column per prestige level (prestige_1 ... prestige_4).
# Cast to float explicitly: newer pandas versions return bool indicators, and
# mixing bool with numeric columns produces an object-dtype design matrix that
# statsmodels rejects. On older versions the cast is a no-op.
dummy_ranks = pd.get_dummies(df['prestige'], prefix='prestige').astype(float)
dummy_ranks.head()

Unnamed: 0,prestige_1,prestige_2,prestige_3,prestige_4
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0


#### Create a new data frame 'data' with admit, gre, gpa, and three of the four prestige dummy vars. We will perform our analysis on this data frame.

In [8]:
# Keep the outcome and the continuous predictors, then attach the dummies from
# prestige_2 onward (prestige_1 is omitted and acts as the baseline level).
# `.loc` does the label-based column slice; the old `.ix` indexer has been
# removed from pandas.
data = df[['admit', 'gre', 'gpa']].join(dummy_ranks.loc[:, 'prestige_2':])
data.head()

Unnamed: 0,admit,gre,gpa,prestige_2,prestige_3,prestige_4
0,0,380,3.61,0.0,1.0,0.0
1,1,660,3.67,0.0,1.0,0.0
2,1,800,4.0,0.0,0.0,0.0
3,1,640,3.19,0.0,0.0,1.0
4,0,520,2.93,0.0,0.0,1.0


In [9]:
# Disabled scratch snippet: the code is wrapped in a string literal so it is
# not executed; the cell merely echoes the string. The quoted code would list
# the distinct prestige values with the value 1 removed.
"""
dumNum =list(set(df['prestige']))
dumNum.remove(1)
dumNum
"""

"\ndumNum =list(set(df['prestige']))\ndumNum.remove(1)\ndumNum\n"

#### You must add an intercept column equal to 1.0 when using statsmodels Logit

In [10]:
# statsmodels' Logit does not add a constant term on its own, so append an
# all-ones column to serve as the intercept.
data = data.assign(intercept=1.0)

#### Set up your training columns

In [11]:
# Every column except the outcome is a predictor. Dropping 'admit' by label
# (rather than slicing off position 0) keeps this correct even if the column
# order of `data` changes.
train_cols = data.columns.drop('admit')

### Run a logistic regression using statsmodels
#### This runs a logistic regression of y ('admit') on x (training columns) and saves the output, a python object, to the variable 'logit'. The variable 'logit' by itself won't make much sense to a human, so we'll do some additional fitting below.

In [12]:
# Build the (not yet fitted) model: 'admit' is the response, the training
# columns (predictors plus intercept) form the design matrix.
logit = sm.Logit(
    endog=data['admit'],
    exog=data[train_cols],
)

#### Fit the model and save it to a new variable called 'result'

In [13]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6


#### View your results via .summary()

In [14]:
result.summary()

0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,Logit,Df Residuals:,394.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 16 Nov 2016",Pseudo R-squ.:,0.08292
Time:,11:10:39,Log-Likelihood:,-229.26
converged:,True,LL-Null:,-249.99
,,LLR p-value:,7.578e-08

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
gre,0.0023,0.001,2.070,0.038,0.000 0.004
gpa,0.8040,0.332,2.423,0.015,0.154 1.454
prestige_2,-0.6754,0.316,-2.134,0.033,-1.296 -0.055
prestige_3,-1.3402,0.345,-3.881,0.000,-2.017 -0.663
prestige_4,-1.5515,0.418,-3.713,0.000,-2.370 -0.733
intercept,-3.9900,1.140,-3.500,0.000,-6.224 -1.756


#### Find the odds ratios, which tell you how a 1 unit increase in your variable multiplies the odds of being admitted. For example, if the prestige of school is 2, your odds are about 50% of what they would be at the baseline prestige of 1.

In [15]:
np.exp(result.params)

gre           1.002267
gpa           2.234545
prestige_2    0.508931
prestige_3    0.261792
prestige_4    0.211938
intercept     0.018500
dtype: float64

#### Return a data frame showing the confidence interval around each coefficient in our model

In [16]:
result.conf_int() 

Unnamed: 0,0,1
gre,0.00012,0.004409
gpa,0.153684,1.454391
prestige_2,-1.295751,-0.055135
prestige_3,-2.016992,-0.663416
prestige_4,-2.370399,-0.732529
intercept,-6.224242,-1.755716


# Test Using fieldExAnalysis function
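### Reminder during testing: When you update fieldExAnalysis.py, you must restart the kernel and re-import the package for the changes to take effect in jupyter notebook (alternatively, run `%load_ext autoreload` and `%autoreload 2` once so that edits are picked up automatically).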

#### Arguments taken by fieldExAnalysis: universeDf, dv, condition; Optional arguments: control, numConditions, covariates

In [17]:
# Treat the prestige_2 indicator as the experimental condition and 'admit'
# as the dependent variable.
test = fieldExAnalysis(
    data,
    dv='admit',
    condition='prestige_2',
)

In [18]:
test.condition

'prestige_2'

In [19]:
test.numConditions

2

### Let's experiment with data frame df, pretending prestige was the condition variable

In [20]:
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [21]:
# Use the raw multi-level 'prestige' column as the condition, with level 1
# designated as the control group.
test = fieldExAnalysis(
    df,
    dv='admit',
    condition='prestige',
    control=1,
)

In [22]:
test.condition

'prestige'

In [23]:
test.control

1

### Identify the treatment conditions

In [24]:
# Ask the analysis object for a frame with per-treatment indicator columns.
# Judging by the preview in the next cell, the result keeps the original
# columns and adds treat_1..treat_3, where control rows (prestige == 1) are
# 0.0 in every treat_* column — TODO confirm against fieldExAnalysis.py.
new = test.reformatConditions()

In [25]:
new.head()

Unnamed: 0,admit,gre,gpa,prestige,treat_1,treat_2,treat_3
0,0,380,3.61,3,,1.0,
1,1,660,3.67,3,,1.0,
2,1,800,4.0,1,0.0,0.0,0.0
3,1,640,3.19,4,,,1.0
4,0,520,2.93,4,,,1.0


In [26]:
len(set(new['prestige']))

4

In [27]:
new.columns.values

array(['admit', 'gre', 'gpa', 'prestige', 'treat_1', 'treat_2', 'treat_3'], dtype=object)

In [28]:
test.logitProcess(new,'admit','prestige')

Optimization terminated successfully.
         Current function value: 0.594639
         Iterations 5


0,1,2,3
Dep. Variable:,admit,No. Observations:,400.0
Model:,Logit,Df Residuals:,398.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 16 Nov 2016",Pseudo R-squ.:,0.04853
Time:,11:10:39,Log-Likelihood:,-237.86
converged:,True,LL-Null:,-249.99
,,LLR p-value:,8.394e-07

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
prestige,-0.5863,0.124,-4.728,0.000,-0.829 -0.343
intercept,0.6366,0.306,2.080,0.038,0.037 1.237


In [29]:
new.head()

Unnamed: 0,admit,gre,gpa,prestige,treat_1,treat_2,treat_3
0,0,380,3.61,3,,1.0,
1,1,660,3.67,3,,1.0,
2,1,800,4.0,1,0.0,0.0,0.0
3,1,640,3.19,4,,,1.0
4,0,520,2.93,4,,,1.0


### Try subsetting data frame and then doing logitProcess
#### Code below subsets data frame where values are not null. It's important to reset the index to avoid splicing error.

In [30]:
# Preview the rows that belong to the treat_1 comparison (control rows plus
# treatment-1 rows). Show only the first few rows instead of dumping the whole
# subset; the original index labels are kept here on purpose so the gaps left
# by the filter are visible.
new.loc[new['treat_1'].notnull()].head(10)

Unnamed: 0,admit,gre,gpa,prestige,treat_1,treat_2,treat_3
2,1,800,4.00,1,0.0,0.0,0.0
5,1,760,3.00,2,1.0,,
6,1,560,2.98,1,0.0,0.0,0.0
7,0,400,3.08,2,1.0,,
9,0,700,3.92,2,1.0,,
11,0,440,3.22,1,0.0,0.0,0.0
12,1,760,4.00,1,0.0,0.0,0.0
13,0,700,3.08,2,1.0,,
14,1,700,4.00,1,0.0,0.0,0.0
18,0,800,3.75,2,1.0,,


In [31]:
# Rows taking part in the treat_1 comparison, renumbered from 0.
# drop=True discards the old index; without it reset_index() would insert the
# old labels as an extra 'index' column in the frame.
treat1Df = new.loc[new['treat_1'].notnull()].reset_index(drop=True)

#### Run logit process on subsetted data frame

In [32]:
test.logitProcess(treat1Df,'admit','treat_1')

Optimization terminated successfully.
         Current function value: 0.662896
         Iterations 4


0,1,2,3
Dep. Variable:,admit,No. Observations:,212.0
Model:,Logit,Df Residuals:,210.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 16 Nov 2016",Pseudo R-squ.:,0.02083
Time:,11:10:39,Log-Likelihood:,-140.53
converged:,True,LL-Null:,-143.52
,,LLR p-value:,0.01448

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
treat_1,-0.7500,0.308,-2.435,0.015,-1.354 -0.146
intercept,0.1643,0.257,0.639,0.523,-0.339 0.668


#### Figure out best way to create list from 1 to length of treat conditions

In [33]:
# Number of distinct condition levels (`new` is the data frame, 'prestige' the
# condition column). nunique() ignores missing values, whereas len(set(...))
# can count each NaN as its own level.
lenCondition = new['prestige'].nunique()
lenCondition

4

In [34]:
listConditions = list(range(lenCondition-1))
listConditions

[0, 1, 2]

In [35]:
# But actually... I want list [1,2,3]
listConditions = list(range(lenCondition))
listConditions.remove(0)
listConditions

[1, 2, 3]

In [36]:
test.analyze()

Optimization terminated successfully.
         Current function value: 0.662896
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  212
Model:                          Logit   Df Residuals:                      210
Method:                           MLE   Df Model:                            1
Date:                Wed, 16 Nov 2016   Pseudo R-squ.:                 0.02083
Time:                        11:10:39   Log-Likelihood:                -140.53
converged:                       True   LL-Null:                       -143.52
                                        LLR p-value:                   0.01448
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
treat_1       -0.7500      0.308     -2.435      0.015        -1.354    -0.146
intercept      0.1643      0.

### Figure out how to add in covariates
#### Can we use an empty list where no covariates are implemented?

In [37]:
covariates = []
covariates

[]

In [38]:
treatList = ['treat_1']
treatList

['treat_1']

#### This doesn't work

In [39]:
# list.append() mutates treatList in place and returns None, so treatListNew
# ends up as None (hence no output below). It also adds `covariates` as a
# single nested element, which leaves treatList as ['treat_1', []].
treatListNew = treatList.append(covariates)
treatListNew

#### This puts the empty list inside of our new list, instead of appending the items inside the list

In [40]:
treatListNew = treatList + covariates
treatListNew

['treat_1', []]

In [41]:
covariatesNotEmpty = ['covar1','covar2']

In [42]:
treatListNew = treatList + covariatesNotEmpty
treatListNew

['treat_1', [], 'covar1', 'covar2']

#### Actually... .append() in cell 39 might be changing treatList. Let's try again with new variables and see if the adding method works

In [43]:
treatList =['treat1']
treatList

['treat1']

In [44]:
covariatesNotEmpty

['covar1', 'covar2']

In [45]:
covariates

[]

In [46]:
treatListNew = treatList + covariates
treatListNew

['treat1']

In [47]:
treatListNew = treatList + covariatesNotEmpty
treatListNew

['treat1', 'covar1', 'covar2']

#### Aha! Just add.

### Try code using a list of covariates

In [48]:
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [49]:
# Same setup as before (prestige as condition, level 1 as control), now
# passing gre and gpa as covariates.
test = fieldExAnalysis(
    df,
    dv='admit',
    condition='prestige',
    covariates=['gre', 'gpa'],
    control=1,
)

In [50]:
test.analyze()

Optimization terminated successfully.
         Current function value: 0.640115
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  212
Model:                          Logit   Df Residuals:                      208
Method:                           MLE   Df Model:                            3
Date:                Wed, 16 Nov 2016   Pseudo R-squ.:                 0.05448
Time:                        11:10:39   Log-Likelihood:                -135.70
converged:                       True   LL-Null:                       -143.52
                                        LLR p-value:                  0.001346
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
treat_1       -0.6732      0.317     -2.127      0.033        -1.294    -0.053
intercept     -4.0663      1.