## Bar Exam - Logistic Regression

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt
import pickle

In [3]:
df = pd.read_csv('../assets/bar_pass_prediction.csv')
df.head().T
#lots of features...will need to narrow it down for the model

Unnamed: 0,0,1,2,3,4
decile1b,10.0,5.0,3.0,7.0,9.0
decile3,10.0,4.0,2.0,4.0,8.0
ID,2,3,36,52,55
decile1,10.0,5.0,3.0,7.0,9.0
sex,1.0,1.0,2.0,2.0,2.0
race,7.0,7.0,7.0,7.0,7.0
cluster,1.0,2.0,3.0,3.0,4.0
lsat,44.0,29.0,36.0,39.0,48.0
ugpa,3.5,3.5,3.5,3.5,3.5
zfygpa,1.33,-0.11,-0.64,0.34,1.02


In [None]:
#notes from data source:
#pass_bar = whether or not the student passed the bar exam (the target variable)
#ugpa = undergraduate gpa
#lsat = lsat score (doesn't seem to follow the usual scoring convention though)
#tier = tier of the law school the student attended (by quintile)
#fulltime = whether the student was a full time law student

## Exploratory Data Analysis

In [7]:
df.shape
#22K records

(22407, 39)

In [4]:
# declare the list of features
#the features noted above seem the most relevant ot me; also excluced demographic features based on comment from previous project
features = ['ugpa','lsat','tier','fulltime']

In [5]:
# check for missing values for the features
print(df[features].isnull().sum())
#some missing values for school tier and fulltime status

ugpa         0
lsat         0
tier        96
fulltime    34
dtype: int64


In [6]:
#make sure the target variable isn't missing any values
print(df['pass_bar'].isnull().sum())

0


In [10]:
#fill in missing values or drop them
#since these are not continuous variables I don't want to use the mean or another aggregation method
#there are also only a few missing records relative to the size of the data set (<1%) so I will drop them
df.dropna(subset=['tier'], inplace=True)
print(df['tier'].value_counts(dropna=False))
#check the math
#22407 - 96 = 22311
#7991+6083+3895+2054+1694+594 = 22311

3.0    7991
4.0    6083
5.0    3895
6.0    2054
2.0    1694
1.0     594
Name: tier, dtype: int64


In [12]:
print(df['fulltime'].value_counts(dropna=False))
#20638+1639+34 = 22311

1.0    20638
2.0     1639
NaN       34
Name: fulltime, dtype: int64


In [13]:
df.dropna(subset=['fulltime'], inplace=True)

In [14]:
print(df['fulltime'].value_counts(dropna=False))
#20638+1639 = 22,277

1.0    20638
2.0     1639
Name: fulltime, dtype: int64


In [15]:
df.shape

(22277, 39)

In [16]:
#make sure there are no more null values
print(df[features].isnull().sum())

ugpa        0
lsat        0
tier        0
fulltime    0
dtype: int64


In [18]:
#make sure features and target variable are all numeric
df.dtypes
#['ugpa','lsat','tier','fulltime','pass_bar']

decile1b                   float64
decile3                    float64
ID                           int64
decile1                    float64
sex                        float64
race                       float64
cluster                    float64
lsat                       float64
ugpa                       float64
zfygpa                     float64
DOB_yr                     float64
grad                        object
zgpa                       float64
bar1                        object
bar1_yr                    float64
bar2                        object
bar2_yr                    float64
fulltime                   float64
fam_inc                    float64
age                        float64
gender                      object
parttime                   float64
male                       float64
race1                       object
race2                       object
Dropout                     object
other                        int64
asian                        int64
black               

In [21]:
#take a look at some basic stats
df['ugpa'].describe()

count    22277.000000
mean         3.217125
std          0.403113
min          1.500000
25%          3.000000
50%          3.200000
75%          3.500000
max          3.900000
Name: ugpa, dtype: float64

In [22]:
df['lsat'].describe()
#not the usual scoring convention out of 180

count    22277.000000
mean        36.796943
std          5.437906
min         11.000000
25%         33.000000
50%         37.000000
75%         41.000000
max         48.000000
Name: lsat, dtype: float64

In [23]:
df['tier'].describe()
#why is there a 6 if it's quintiles?

count    22277.000000
mean         3.768461
std          1.179060
min          1.000000
25%          3.000000
50%          4.000000
75%          5.000000
max          6.000000
Name: tier, dtype: float64

In [24]:
df['tier'].value_counts()
#looks like there are 6 tiers instead of 5

3.0    7985
4.0    6074
5.0    3885
6.0    2050
2.0    1691
1.0     592
Name: tier, dtype: int64

In [25]:
df['fulltime'].describe()

count    22277.000000
mean         1.073574
std          0.261082
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          2.000000
Name: fulltime, dtype: float64

In [26]:
df['fulltime'].value_counts()
#based on the parttime column, 1 = fulltime, 2 = parttime. weird that it's not using the 0 / 1 convention

1.0    20638
2.0     1639
Name: fulltime, dtype: int64

In [27]:
df['parttime'].value_counts()

0.0    20638
1.0     1639
Name: parttime, dtype: int64

In [28]:
df['pass_bar'].describe()

count    22277.000000
mean         0.948198
std          0.221633
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: pass_bar, dtype: float64

In [29]:
df['pass_bar'].value_counts()
#most of them passed...might need to add some kind of penalty

1    21123
0     1154
Name: pass_bar, dtype: int64

## Model Building

In [30]:
# specify X and y
y = df['pass_bar']
X = df[features]

In [31]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .10, random_state=12) #used a smaller value for test_size since the data set is large

In [32]:
# Fit the model to the training dataset
mymodel = LogisticRegression()
mymodel.fit(X_train, y_train)

In [35]:
# coefficients and intercept
print(mymodel.intercept_)
print(mymodel.coef_)

[-5.42078336]
[[ 1.01607683  0.17376031 -0.05039563 -0.54139124]]


In [36]:
# Predict the y-values on the testing dataset
y_preds = mymodel.predict(X_test)
y_probs = mymodel.predict_proba(X_test)

## Model Evalution

In [37]:
#check the accuracy
print(metrics.accuracy_score(y_test, y_preds))
#pretty accurate

0.9443447037701975


In [38]:
# compare that to a baseline score!
# what is the majority class of the target?
print(y_train.value_counts())
#not the best data set because the vast majority of the students passed the bar...

1    19021
0     1028
Name: pass_bar, dtype: int64


In [41]:
# just guess that it's always the majority class.
# how does that effect accuracy?
baseline_preds = np.empty(len(y_test))
baseline_preds.fill(1)
print(metrics.accuracy_score(baseline_preds, y_test))
#basically the same level of accuracy as the model

0.9434470377019749


In [43]:
# improvement over baseline
print(metrics.accuracy_score(y_preds, y_test) - metrics.accuracy_score(baseline_preds, y_test))
#no improvement b/c the pass rate was so high in the original data :(

0.0008976660682226134
