In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the admissions dataset from the working directory; later cells use
# its 'gpa' column as the feature and its 'admit' column as the target.
admissions = pd.read_csv("admissions.csv")

In [4]:
# Fit a logistic regression that predicts admission from GPA alone.
logistic_regression = LogisticRegression(solver='liblinear')
logistic_regression.fit(admissions[['gpa']], admissions['admit'])

# Predictions are made on the same rows the model was fitted on, so any
# metric computed from these labels describes fit on the training data.
labels = logistic_regression.predict(admissions[['gpa']])

admissions['predicted_label'] = labels

# Class balance of the predictions, then a preview of the augmented frame.
print(admissions['predicted_label'].value_counts())
# `head`, not `headd`: the misspelled name raised AttributeError.
print(admissions.head())

0    598
1     46
Name: predicted_label, dtype: int64
   admit       gpa         gre  predicted_label
0      0  3.177277  594.102992                0
1      0  3.412655  631.528607                0
2      0  2.728097  553.714399                0
3      0  3.093559  551.089985                0
4      0  3.141923  537.184894                0


# Accuracy

In [5]:
# Rename the target column so it reads as the counterpart of 'predicted_label'.
admissions = admissions.rename(columns={'admit': 'actual_label'})

In [6]:
# Preview the first rows to confirm the 'admit' column now appears as 'actual_label'.
admissions.head()

Unnamed: 0,actual_label,gpa,gre,predicted_label
0,0,3.177277,594.102992,0
1,0,3.412655,631.528607,0
2,0,2.728097,553.714399,0
3,0,3.093559,551.089985,0
4,0,3.141923,537.184894,0


In [7]:
# Row-wise flag: True where the model's prediction equals the actual label.
matches = admissions['actual_label'].eq(admissions['predicted_label'])

In [8]:
matches

0       True
1       True
2       True
3       True
4       True
       ...  
639    False
640    False
641    False
642    False
643    False
Length: 644, dtype: bool

In [9]:
# Attach the per-row correctness flag as a 'matches' column.
admissions = admissions.assign(matches=matches)

In [10]:
# Preview the frame with the new 'matches' column.
admissions.head()

Unnamed: 0,actual_label,gpa,gre,predicted_label,matches
0,0,3.177277,594.102992,0,True
1,0,3.412655,631.528607,0,True
2,0,2.728097,553.714399,0,True
3,0,3.093559,551.089985,0,True
4,0,3.141923,537.184894,0,True


In [11]:
# Keep only the rows the model classified correctly; the boolean 'matches'
# column serves directly as the row mask.
correct_predictions = admissions[admissions['matches']]

In [12]:
# Preview the subset of correctly classified rows.
correct_predictions.head()

Unnamed: 0,actual_label,gpa,gre,predicted_label,matches
0,0,3.177277,594.102992,0,True
1,0,3.412655,631.528607,0,True
2,0,2.728097,553.714399,0,True
3,0,3.093559,551.089985,0,True
4,0,3.141923,537.184894,0,True


In [13]:
# Summarize the filtered frame; its entry count is the number of correct predictions.
correct_predictions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416 entries, 0 to 636
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   actual_label     416 non-null    int64  
 1   gpa              416 non-null    float64
 2   gre              416 non-null    float64
 3   predicted_label  416 non-null    int64  
 4   matches          416 non-null    bool   
dtypes: bool(1), float64(2), int64(2)
memory usage: 16.7 KB


In [14]:
# Summarize the full frame; its entry count is the total number of observations.
admissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 644 entries, 0 to 643
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   actual_label     644 non-null    int64  
 1   gpa              644 non-null    float64
 2   gre              644 non-null    float64
 3   predicted_label  644 non-null    int64  
 4   matches          644 non-null    bool   
dtypes: bool(1), float64(2), int64(2)
memory usage: 20.9 KB


In [15]:
# Sanity check: the filtered frame should contain only True in 'matches'.
correct_predictions['matches'].value_counts()

True    416
Name: matches, dtype: int64

In [16]:
# Count correct (True) versus incorrect (False) predictions over all rows.
admissions['matches'].value_counts()

True     416
False    228
Name: matches, dtype: int64

In [38]:
# Accuracy = correctly classified rows / all rows.
n_correct = correct_predictions.shape[0]
n_total = admissions.shape[0]
accuracy = n_correct / n_total

In [39]:
accuracy

0.6459627329192547

# More precision 

Accuracy alone (correct predictions / total observations) is not enough to tell whether the model is good or not.

We need to differentiate between four kinds of outcome:
- True positive : prediction 1, label 1
- False positive : prediction 1, label 0
- True negative : prediction 0, label 0
- False negative : prediction 0, label 1

In [21]:
# True positives: predicted 1 and actually 1.
tp_mask = admissions['predicted_label'].eq(1) & admissions['actual_label'].eq(1)
true_positives = int(tp_mask.sum())

In [22]:
true_positives

31

In [23]:
# True negatives: predicted 0 and actually 0.
tn_mask = admissions['predicted_label'].eq(0) & admissions['actual_label'].eq(0)
true_negatives = int(tn_mask.sum())

In [24]:
true_negatives

385

# Sensitivity

$ TPR = \frac{True\ positive}{True\ positive\ +\ False\ negative} $

In [25]:
# False negatives: predicted 0 but actually 1.
fn_mask = admissions['predicted_label'].eq(0) & admissions['actual_label'].eq(1)
false_negative = int(fn_mask.sum())

In [26]:
false_negative

213

In [34]:
# Sensitivity (true positive rate): share of actual positives predicted as positive.
actual_positive_count = true_positives + false_negative
sensitivity = true_positives / actual_positive_count

In [35]:
sensitivity

0.12704918032786885

# Specificity

$ TNR = \frac{True\ negative}{True\ negative\ +\ False\ positive} $

In [31]:
# False positives: predicted 1 but actually 0.
fp_mask = admissions['predicted_label'].eq(1) & admissions['actual_label'].eq(0)
false_positive = int(fp_mask.sum())

In [32]:
false_positive

15

In [36]:
# Specificity (true negative rate): share of actual negatives predicted as negative.
actual_negative_count = true_negatives + false_positive
specificity = true_negatives / actual_negative_count

In [37]:
specificity

0.9625