In [1]:
import pandas as pd
import numpy as np

In [2]:
## Loading data directly from UCLA
df = pd.read_stata("https://stats.idre.ucla.edu/stat/stata/dae/binary.dta")

df.head()


Unnamed: 0,admit,gre,gpa,rank
0,0.0,380.0,3.61,3.0
1,1.0,660.0,3.67,3.0
2,1.0,800.0,4.0,1.0
3,1.0,640.0,3.19,4.0
4,0.0,520.0,2.93,4.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 4 columns):
admit    400 non-null float32
gre      400 non-null float32
gpa      400 non-null float32
rank     400 non-null float32
dtypes: float32(4)
memory usage: 9.4 KB


In [4]:
df.describe()

Unnamed: 0,admit,gre,gpa,rank
count,400.0,400.0,400.0,400.0
mean,0.3175,587.700012,3.389901,2.485
std,0.466087,115.516663,0.380567,0.944462
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.395,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


### Descriptic stats inference

31% of the students get admitted to a graduate program
Average GRE score is 587 with a large standard deviation
Average GPA is 3.39, and the average undergraduate school prestige is 2.49.

variable - rank is a categorical data, this has to be converted for better model accuracy

In [5]:
df['rank'].head()

0    3.0
1    3.0
2    1.0
3    4.0
4    4.0
Name: rank, dtype: float32

### Data Wrangling - Get dummies

In [6]:
df['rank'] = df['rank'].astype('category')


In [7]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,admit,gre,gpa,rank_1.0,rank_2.0,rank_3.0,rank_4.0
0,0.0,380.0,3.61,0,0,1,0
1,1.0,660.0,3.67,0,0,1,0
2,1.0,800.0,4.0,1,0,0,0
3,1.0,640.0,3.19,0,0,0,1
4,0.0,520.0,2.93,0,0,0,1


In [8]:
# rank colum has been converted into 4 new cols

In [9]:
# drop the first column from the dummy variables
del df['rank_1.0']

In [10]:
df.head()

Unnamed: 0,admit,gre,gpa,rank_2.0,rank_3.0,rank_4.0
0,0.0,380.0,3.61,0,1,0
1,1.0,660.0,3.67,0,1,0
2,1.0,800.0,4.0,0,0,0
3,1.0,640.0,3.19,0,0,1
4,0.0,520.0,2.93,0,0,1


In [11]:
df['admit'].head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: admit, dtype: float32

In [12]:
y = df['admit']

In [13]:
y.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: admit, dtype: float32

In [14]:
X = df.drop('admit', axis=1)


# lOGISTIC REGRESSION

In [16]:
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.589306
         Iterations 5
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.057     
Dependent Variable: admit            AIC:              481.4452  
Date:               2019-11-01 16:10 BIC:              501.4025  
No. Observations:   400              Log-Likelihood:   -235.72   
Df Model:           4                LL-Null:          -249.99   
Df Residuals:       395              LLR p-value:      9.7322e-06
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     5.0000                                       
------------------------------------------------------------------
               Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
------------------------------------------------------------------
gre            0.0014    0.0010   1.3287  0.1839  -0.0007   0.0034
gpa           -0.1336    0.1945  -0.6869  0.4922  -0.5147   0.