In [1]:
import os

import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.linear_model  # make sure sk.linear_model resolves as an attribute
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# 2x2 table of counts, written one row per line.
mytable = np.array(
    [
        [5, 15],
        [3, 30],
    ]
)
mytable

array([[ 5, 15],
       [ 3, 30]])

#### The methods below come from the statsmodels package.

##### Learn to read the docs! https://www.statsmodels.org/stable/generated/statsmodels.stats.contingency_tables.Table2x2.html

In [3]:
# Wrap the 2x2 count table in statsmodels' Table2x2 analysis object.
result = sm.stats.Table2x2(mytable)
# Sample odds ratio of the table.
result.oddsratio

3.3333333333333335

In [4]:
# Confidence interval (lower, upper) for the odds ratio, using the method's default settings.
result.oddsratio_confint()

(0.7006053806126255, 15.859300283128432)

In [5]:
# P-value for the odds ratio under the method's default null hypothesis.
result.oddsratio_pvalue()

0.13031366333302757

## Logistic Regression 

### Read the data

In [6]:
# Expand "~" to the current user's home directory, then load the CSV from there.
csv_path = os.path.expanduser("~/gitRepos/cse5243/course_materials/lecture_examples/LogisticRegressionData.csv")
raw_data = pd.read_csv(csv_path)
raw_data

Unnamed: 0,Sick,Age,Sex,Smoker
0,0,24,0,0
1,1,40,1,1
2,0,23,0,0
3,1,39,0,1
4,0,26,0,0
5,1,45,1,1
6,1,42,1,0
7,1,39,1,1
8,0,23,1,0
9,0,38,0,0


In [7]:
# (number of rows, number of columns) of the loaded frame.
raw_data.shape

(20, 4)

### Logistic Regression of One Variable

In [8]:
# Design matrix: the Smoker column plus an explicit constant (intercept) column.
smoker_design = sm.add_constant(raw_data['Smoker'])
my_model = sm.Logit(raw_data['Sick'], smoker_design).fit()
print(my_model.summary())

# If you are familiar with R, also check out the formula interface:
# https://www.statsmodels.org/dev/example_formulas.html

Optimization terminated successfully.
         Current function value: 0.479248
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                   Sick   No. Observations:                   20
Model:                          Logit   Df Residuals:                       18
Method:                           MLE   Df Model:                            1
Date:                Wed, 08 Sep 2021   Pseudo R-squ.:                  0.3036
Time:                        13:38:34   Log-Likelihood:                -9.5850
converged:                       True   LL-Null:                       -13.763
Covariance Type:            nonrobust   LLR p-value:                  0.003845
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9808      0.677     -1.449      0.147      -2.308       0.346
Smoker         3.0603      1.

In [None]:
# Predict from the one-variable model, rebuilding the Smoker-plus-constant design it was fit on.
smoker_with_const = sm.add_constant(raw_data['Smoker'])
predicted_probs = my_model.predict(smoker_with_const)
predicted_probs

In [None]:
def table(predicted_probs, labels, cutoff):
    """Cross-tabulate actual labels against thresholded predictions.

    A replacement for R's ``table`` function, specialised to a binary
    classifier: scores are turned into 0/1 predictions and counted against
    the actual labels.

    Parameters
    ----------
    predicted_probs : array-like
        Predicted scores, one per observation. A pandas Series or NumPy
        array is used as-is; anything else (e.g. a plain list or tuple) is
        converted with ``np.asarray`` so the comparison below is element-wise.
    labels : array-like
        Actual outcomes, one per observation.
    cutoff : float
        Scores strictly greater than ``cutoff`` are predicted as 1, all
        other scores as 0.

    Returns
    -------
    pandas.DataFrame
        Counts with rows indexed by ``actual_outcome`` and columns by
        ``predicted_outcome``, without margin totals. Only outcome values
        that actually occur get a row or column.
    """
    # Plain Python sequences do not support `seq > cutoff`; make them arrays.
    # Series/ndarray inputs are left untouched so their behavior is unchanged.
    if not isinstance(predicted_probs, (pd.Series, np.ndarray)):
        predicted_probs = np.asarray(predicted_probs)
    predicted_outcome = (predicted_probs > cutoff).astype(int)
    df = pd.DataFrame({"predicted_outcome": predicted_outcome, "actual_outcome": labels})
    return pd.crosstab(index=df["actual_outcome"], columns=df["predicted_outcome"], margins=False)

# Actual vs. predicted outcome counts at a 0.5 cutoff.
table(predicted_probs=predicted_probs, labels=raw_data["Sick"], cutoff=0.5)

### Logistic Regression of Multiple Variables

In [9]:
# Logit of Sick on Smoker, Age and Sex, with an explicit constant (intercept) column.
full_design = sm.add_constant(raw_data[['Smoker', 'Age', 'Sex']])
my_model_full = sm.Logit(raw_data['Sick'], full_design).fit()
print(my_model_full.summary())

Optimization terminated successfully.
         Current function value: 0.285272
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                   Sick   No. Observations:                   20
Model:                          Logit   Df Residuals:                       16
Method:                           MLE   Df Model:                            3
Date:                Wed, 08 Sep 2021   Pseudo R-squ.:                  0.5854
Time:                        13:43:46   Log-Likelihood:                -5.7054
converged:                       True   LL-Null:                       -13.763
Covariance Type:            nonrobust   LLR p-value:                  0.001074
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -13.1850      8.527     -1.546      0.122     -29.897       3.527
Smoker         1.1918      1.

In [13]:
# NOTE(review): 0.3672 is presumably a coefficient copied by hand from a model
# summary, scaled by 10 before exponentiating — confirm its source.
np.exp(10*0.3672)

39.33048822819889

### What about all you sklearn folks?
#### Or why it's important to actually know what you're doing AND read the docs!

In [None]:
# penalty=None switches regularization off. The old string spelling 'none' was
# deprecated in scikit-learn 1.2 and is rejected by later releases.
sk_model = sk.linear_model.LogisticRegression(fit_intercept=True, penalty=None).fit(raw_data[['Smoker', 'Age', 'Sex']], raw_data['Sick'])

### Note the difference: you must specify the penalty (such as L1, L2, elasticnet, or None).
### scikit-learn applies an L2 penalty by default, so it has to be switched off
### explicitly to fit a plain (unpenalized) logistic regression.
### Other parameters possible: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
### Also, the syntax is different.
### Moral of the story: understand the THEORY, then figure out the code.


In [None]:
# Fitted coefficients for the predictors (the intercept is stored separately).
sk_model.coef_

In [None]:
# Fitted intercept term (the model was built with fit_intercept=True).
sk_model.intercept_

#### But do they give the same results?

In [None]:
# Both libraries score the same three predictors; only statsmodels needs the
# constant column added by hand.
features = raw_data[['Smoker', 'Age', 'Sex']]
predicted_probs_stats_models = my_model_full.predict(sm.add_constant(features))
predicted_probs_sklearn = sk_model.predict_proba(features)

In [None]:
predicted_probs_stats_models ## One value per row: the probability of a 1

In [None]:
predicted_probs_sklearn ## Two columns per row: the probabilities of classes [0, 1], in that order

In [None]:
# Element-wise difference between the statsmodels probabilities and sklearn's
# class-1 column; values near zero mean the two libraries agree.
predicted_probs_stats_models - predicted_probs_sklearn[:, 1]