# Linear Regression and Logistic Regression (Python)
Yang Xi <br>
14 Nov, 2018

<br>

- Linear Regression: Model Fitting and Interpretation
    - Independent Numeric and Categorical Variables
    - Response to Highly Correlated Numeric Variables
    - Interaction: Numeric Variable Interacts With Boolean Variable
    - Polynomial Numeric Variable
- Logistic Regression: Classification with Imbalanced Class
    - Train Model and Interpret Fitted Coefficients
    - Train Performance
    - Cross-Validation Performance
    - Test Performance
- Appendix: Wrap Statsmodels in Sklearn Estimator for Cross-Validation
    - Train and Interpret GLM Model
    - Train Performance
    - Cross-Validation Performance
    - Test Performance

# Linear Regression: Model Fitting and Interpretation
### Independent Numeric and Categorical Variables

In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LinearRegression as lm
import statsmodels.api as sm

In [2]:
# Simulate data
# Seed NumPy's global generator: every draw below comes from np.random, so
# seeding the stdlib `random` module would not make the run reproducible.
np.random.seed(1)
n = 9000
x1 = np.array(['A', 'B', 'C'] * (n // 3))
x1 = np.random.choice(x1, n, replace=False)  # permutation of n/3 copies of each level
x2 = np.random.uniform(0, 2, n)
x3 = np.random.uniform(0, 1, n)  # not used to build y below
e = np.random.uniform(-0.5, 0.5, n)

# Contribution of each x1 level to y; relative to level 'A' these are 0, 1, 3
b1_map = {'A':-2, 'B':-1, 'C':1}
b2 = 3

X = pd.DataFrame({'x1':x1, 'x2':x2, 'x3':x3})
y = np.array([b1_map[level] for level in x1]) + b2*x2 + e

# Convert categorical variable (the first level, 'A', becomes the baseline)
X = pd.get_dummies(X, columns=['x1'], drop_first=True) # x2, x3, x1_B, x1_C

# Fit a linear model, then report the intercept and one coefficient per column
lmFit = lm().fit(X, y)
coefTable = pd.DataFrame({'coef': lmFit.coef_}, index=X.columns)
print('intercept = {0:.3f}'.format(lmFit.intercept_))
print(coefTable)

intercept = -1.992
          coef
x2    2.995301
x3   -0.003818
x1_B  0.995364
x1_C  3.001635


The **statsmodels** package provides more detailed statistics for the same model (standard errors, t-statistics, p-values and confidence intervals).

In [3]:
# statsmodels needs a purely numeric design matrix. Recent pandas versions
# create boolean dummy columns, which would turn the mixed float/bool frame
# into an object array, so cast everything to float first.
Xc = sm.add_constant(X.astype(float))
sm_fit = sm.OLS(y, Xc).fit()
print(sm_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                 1.230e+05
Date:                Fri, 17 Jul 2020   Prob (F-statistic):               0.00
Time:                        13:21:32   Log-Likelihood:                -1592.5
No. Observations:                9000   AIC:                             3195.
Df Residuals:                    8995   BIC:                             3231.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.9920      0.009   -217.799      0.0

### Response to Highly Correlated Numeric Variables

In [4]:
# Simulate data
# Seed NumPy's global generator: the draws below use np.random, so seeding
# the stdlib `random` module would not make the run reproducible.
np.random.seed(1)
n = 10000
x1 = np.random.uniform(1, 2, n)
x2 = x1  # exact copy of x1: the two predictors are perfectly correlated
e = np.random.uniform(-0.5, 0.5, n)

b1 = 2
b2 = 3

X = pd.DataFrame({'x1':x1, 'x2':x2})
# Because x2 equals x1, only the sum b1 + b2 is identifiable from the data
y = b1*x1 + b2*x2 + e

# Fit a linear model, then report the intercept and one coefficient per column
lmFit = lm().fit(X, y)
coefTable = pd.DataFrame({'coef': lmFit.coef_}, index=X.columns)
print('intercept = {0:.3f}'.format(lmFit.intercept_))
print(coefTable)

intercept = -0.023
        coef
x1  2.506362
x2  2.506362


### Interaction: Numeric Variable Interacts with Boolean Variable
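Take note of how to interpret the result: the `x2` coefficient is the slope for the baseline group `N`, and the `x1_Y:x2` coefficient is the *difference* in slope for group `Y`, so the slope for `Y` is their sum (about -2 + 3 = 1).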

In [5]:
from sklearn.preprocessing import PolynomialFeatures as poly

# Simulate data
# Seed NumPy's global generator: the draws below use np.random, so seeding
# the stdlib `random` module would not make the run reproducible.
np.random.seed(1)
n = 10000
x1 = np.array(['N', 'Y'] * (n // 2))
x1 = np.random.choice(x1, n, replace=False)  # permutation of n/2 copies of each level
x2 = np.random.uniform(0, 1, n)
e = np.random.uniform(-0.1, 0.1, n)

# Slope of y on x2 within group 'N' and within group 'Y'
bn = -2
by = 1

# Convert categorical variable (the first level, 'N', becomes the baseline)
X = pd.DataFrame({'x1':x1, 'x2':x2})
X = pd.get_dummies(X, columns=['x1'], drop_first=True)  # x2, x1_Y

# Interaction: extend the two columns with their pairwise interaction term
interaction = poly(interaction_only=True, include_bias=False)
featureNames = np.append(X.columns.values, 'x1_Y:x2') # x2, x1_Y, x1_Y:x2
X = pd.DataFrame(interaction.fit_transform(X), columns=featureNames)

# Response: slope bn on x2 for group 'N', slope by on x2 for group 'Y'
y = np.where(x1=='N', bn*x2, by*x2) + e

# Fit model
lmFit = lm().fit(X, y)
coefTable = pd.DataFrame({'coef': lmFit.coef_}, index=X.columns)
print('intercept = {0:.3f}'.format(lmFit.intercept_))
print(coefTable)

intercept = 0.001
             coef
x2      -2.003585
x1_Y    -0.002665
x1_Y:x2  3.004301


### Polynomial Numeric Variable

In [6]:
from sklearn.preprocessing import PolynomialFeatures as poly

# Simulate data
# Seed NumPy's global generator: the draws below use np.random, so seeding
# the stdlib `random` module would not make the run reproducible.
np.random.seed(1)
n = 10000
x1 = np.random.uniform(2, 5, n)
e = np.random.uniform(-0.5, 0.5, n)

# b1 multiplies x1 and b2 multiplies x1^2 when y is built
b1 = 2
b2 = 3

# Expand the single predictor into [x1, x1^2]. Feeding x1 as a one-column
# matrix yields exactly these two features, so there is no need to pad a
# dummy constant column and then pick the wanted columns back out.
xPoly = poly(degree=2, include_bias=False).fit_transform(x1.reshape(-1, 1))
X = pd.DataFrame(xPoly, columns=['x1', 'x1^2'])

y = b2*X['x1^2'] + b1*X['x1'] + e

# Fit model
lmFit = lm().fit(X, y)
print('intercept = {0:.3f}'.format(lmFit.intercept_))
print(pd.DataFrame(lmFit.coef_, index=X.columns, columns=['coef']))

intercept = 0.047
          coef
x1    1.967578
x1^2  3.005012


# Logistic Regression: Classification with Imbalanced Class
### Train Model and Interpret Fitted Coefficients

In [9]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score

# Load the raw train and test splits; the paths are relative to the working directory.
dfTrain0 = pd.read_csv("../../data/(2016 UCI) Credit Default/data_train.csv")
dfTest0 = pd.read_csv("../../data/(2016 UCI) Credit Default/data_test.csv")

def prepTrainTest(df):
    """Return a model-ready copy of ``df`` with dummy-encoded categoricals.

    'Sex' and 'Marriage' are dummy-encoded with their first level dropped.
    'Education' and 'SepRepayment' are fully encoded, after which the
    'Education_high school' and 'SepRepayment_paid' columns are removed so
    that those levels act as the baselines. 'Default' is replaced by a
    boolean that is True where the original value equals 1.

    The input frame itself is not modified.
    """
    encoded = pd.get_dummies(df, columns=['Sex', 'Marriage'], drop_first=True)
    encoded = pd.get_dummies(encoded, columns=['Education', 'SepRepayment'])
    encoded = encoded.drop(columns=['Education_high school', 'SepRepayment_paid'])
    encoded['Default'] = encoded['Default'].eq(1)
    return encoded
dfTrain = prepTrainTest(dfTrain0)
XTrain = dfTrain.drop('Default',axis=1)
yTrain = dfTrain['Default']

# Fit model
## Note: LogisticRegression() always applies L1 or L2 regularization. To make the
## penalty negligible, set C (the inverse regularization strength) to a large value.
## The 'l1' penalty is only supported by some solvers, so the solver is named
## explicitly instead of relying on the library default, which differs across versions.
lmModel = LogisticRegression(class_weight="balanced", penalty='l1', C=1000,
                             solver='liblinear')
lmFit = lmModel.fit(XTrain, yTrain)
print('intercept = {0:.3f}'.format(lmFit.intercept_[0]))
print(pd.DataFrame(lmFit.coef_.T, index=XTrain.columns, columns=['coef']))

intercept = -0.345
                            coef
CreditLimit            -0.000002
Age                     0.004691
SepBill                -0.000004
AugBill                 0.000005
SepPay                 -0.000014
AugPay                 -0.000007
Sex_M                   0.153457
Marriage_single        -0.164347
Education_graduate      0.032429
Education_university    0.040215
SepRepayment_1m delay   1.076591
SepRepayment_2m+ delay  2.528001


### Train Performance

In [10]:
# Second column of predict_proba, i.e. the value the original read as x[1] per row
probTrain = list(lmFit.predict_proba(XTrain)[:, 1])
predTrain = lmFit.predict(XTrain)

# Confusion matrix labelled 'actual<k>' on the rows and 'pred<k>' on the columns
cmTrain = pd.DataFrame(confusion_matrix(yTrain, predTrain))
cmTrain.columns = ['pred' + str(c) for c in cmTrain.columns]
cmTrain.index = ['actual' + str(r) for r in cmTrain.index]
print(cmTrain)

# Summary metrics, rounded to three decimals for display
f1Train = round(f1_score(yTrain, predTrain), 3)
aucTrain = round(roc_auc_score(yTrain, probTrain), 3)
perfTrain = pd.DataFrame({'F1':[f1Train], 'AUC':[aucTrain]})
print(perfTrain)

         pred0  pred1
actual0  15530   2590
actual1   2496   2716
      F1    AUC
0  0.516  0.749


### Cross-Validation Performance

In [11]:
# 10-fold cross-validation scored by F1; report the mean over the folds
scores = cross_val_score(lmModel, XTrain, yTrain, scoring='f1', cv=10)
meanF1 = scores.mean()

print('Cross-validation f1 score is {0:.3f}'.format(meanF1))

Cross-validation f1 score is 0.517


### Test Performance

In [12]:
# Apply the same preparation to the test split as to the training split
dfTest = prepTrainTest(dfTest0)
yTest = dfTest['Default']
XTest = dfTest.drop('Default',axis=1)

predTest = lmFit.predict(XTest)

# Confusion matrix labelled 'actual<k>' on the rows and 'pred<k>' on the columns
cmTest = pd.DataFrame(confusion_matrix(yTest, predTest))
cmTest.columns = ['pred' + str(c) for c in cmTest.columns]
cmTest.index = ['actual' + str(r) for r in cmTest.index]
print(cmTest)

f1Test = f1_score(yTest, predTest)
print('Test f1 score = {0:.3f}'.format(f1Test))

         pred0  pred1
actual0   3876    653
actual1    638    664
Test f1 score = 0.507


# Appendix: Wrap Statsmodels in Sklearn Estimator for Cross-Validation
`sklearn.model_selection.cross_val_score` can also be used to cross-validate a customized model.<br>
This section demonstrates how to wrap the `GLM` model from the `statsmodels` package in an sklearn estimator.
### Train and Interpret GLM Model

In [13]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Load the raw train and test splits; the paths are relative to the working directory.
dfTrain0 = pd.read_csv("../../data/(2016 UCI) Credit Default/data_train.csv")
dfTest0 = pd.read_csv("../../data/(2016 UCI) Credit Default/data_test.csv")

def prepTrainTest(df):
    """Return a model-ready copy of ``df`` with dummy-encoded categoricals.

    'Sex' and 'Marriage' are dummy-encoded with their first level dropped.
    'Education' and 'SepRepayment' are fully encoded, after which the
    'Education_high school' and 'SepRepayment_paid' columns are removed so
    that those levels act as the baselines. 'Default' is replaced by a
    boolean that is True where the original value equals 1.

    The input frame itself is not modified.
    """
    encoded = pd.get_dummies(df, columns=['Sex', 'Marriage'], drop_first=True)
    encoded = pd.get_dummies(encoded, columns=['Education', 'SepRepayment'])
    encoded = encoded.drop(columns=['Education_high school', 'SepRepayment_paid'])
    encoded['Default'] = encoded['Default'].eq(1)
    return encoded
dfTrain = prepTrainTest(dfTrain0)
yTrain = dfTrain['Default']
# statsmodels needs a purely numeric design matrix. Recent pandas versions
# create boolean dummy columns, which would turn the mixed frame into an
# object array, so cast everything to float before adding the constant.
XTrain = sm.add_constant(dfTrain.drop('Default',axis=1).astype(float))

# Weight each positive row by (#negatives / #positives) and each negative
# row by 1, so both classes carry the same total weight.
w = (yTrain == 0).sum() / (yTrain == 1).sum()
yWeights = (yTrain == 1).map({True: w, False: 1.0})

# Fit model
lmModel = sm.GLM(yTrain, XTrain, family=sm.families.Binomial(), freq_weights=yWeights)
lmFit = lmModel.fit()
lmFit.params

const                    -0.345101
CreditLimit              -0.000002
Age                       0.004696
SepBill                  -0.000004
AugBill                   0.000005
SepPay                   -0.000014
AugPay                   -0.000007
Sex_M                     0.153460
Marriage_single          -0.164295
Education_graduate        0.032478
Education_university      0.040273
SepRepayment_1m delay     1.076577
SepRepayment_2m+ delay    2.527989
dtype: float64

### Train Performance

In [14]:
probTrain = lmFit.predict(XTrain)
# Label 0 where the predicted value is below 0.5 and 1 everywhere else
predTrain = (~(probTrain < 0.5)).astype(int)

# Confusion matrix labelled 'actual<k>' on the rows and 'pred<k>' on the columns
cmTrain = pd.DataFrame(confusion_matrix(yTrain, predTrain))
cmTrain.columns = ['pred' + str(c) for c in cmTrain.columns]
cmTrain.index = ['actual' + str(r) for r in cmTrain.index]
print(cmTrain)

# Name the four values returned for the positive class (label 1)
prfsTrain = precision_recall_fscore_support(yTrain, predTrain, average='binary', pos_label=1)
prfsTrain = dict(zip(("precision", "recall", "f1-score", "support"), prfsTrain))
prfsTrain

         pred0  pred1
actual0  15530   2590
actual1   2496   2716


{'precision': 0.5118733509234829,
 'recall': 0.5211051419800461,
 'f1-score': 0.5164479939151929,
 'support': None}

### Cross-Validation Performance
Here we wrap the `statsmodels.api.GLM` model in an sklearn-compatible estimator class and pass it to `sklearn.model_selection.cross_val_score` to perform 10-fold cross-validation.

In [15]:
from sklearn.base import BaseEstimator, ClassifierMixin

class LogisticsRegression(BaseEstimator, ClassifierMixin):
    """Class-weighted logistic regression backed by a statsmodels Binomial GLM.

    The class is a thin sklearn-style wrapper so that the GLM can be passed
    to utilities such as ``cross_val_score``.

    Parameters
    ----------
    threshold : float, default 0.5
        Predicted values below this threshold are labelled 0, all others 1.
    """

    def __init__(self, threshold=0.5):
        self.threshold = threshold

    def fit(self, X, y):
        """Fit the GLM on ``X`` and ``y`` with class-balancing weights.

        ``y`` must contain exactly two distinct values. Rows of the larger
        (sorted) value are weighted by the ratio of the two class counts and
        all other rows by 1. ``X`` is passed to the GLM unchanged, so it must
        already contain a constant column if an intercept is wanted.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct values.
        """
        # Work on a plain array so that both pandas and NumPy targets are accepted
        labels = np.asarray(y)
        classes = np.unique(labels)  # sorted distinct values
        if classes.size != 2:
            raise ValueError(
                'y must contain exactly 2 distinct values, got {0}'.format(classes.size))
        negative, positive = classes

        w = (labels == negative).sum() / (labels == positive).sum()
        yWeights = np.where(labels == positive, w, 1.0)
        self.fitted = sm.GLM(y, X, family=sm.families.Binomial(), freq_weights=yWeights).fit()
        return self

    def predict(self, X):
        """Return 0/1 labels obtained by thresholding the fitted GLM's predictions.

        When the GLM returns a pandas Series the result is a Series on the
        same index; otherwise a NumPy array is returned.
        """
        prob = self.fitted.predict(X)
        pred = np.where(np.asarray(prob) < self.threshold, 0, 1)
        if isinstance(prob, pd.Series):
            return pd.Series(pred, index=prob.index)
        return pred

# 10-fold cross-validation of the wrapped GLM scored by F1; report the mean over the folds
glmEstimator = LogisticsRegression()
scores = cross_val_score(glmEstimator, XTrain, yTrain, scoring='f1', cv=10)
meanF1 = scores.mean()

print('Cross-validation f1 score is {0:.3f}'.format(meanF1))

Cross-validation f1 score is 0.517


### Test Performance

In [16]:
dfTest = prepTrainTest(dfTest0)
yTest = dfTest['Default']
# statsmodels needs a purely numeric design matrix. Recent pandas versions
# create boolean dummy columns, so cast to float before adding the constant.
XTest = sm.add_constant(dfTest.drop('Default',axis=1).astype(float))

probTest = lmFit.predict(XTest)
# Label 0 where the predicted value is below 0.5 and 1 everywhere else
predTest = (~(probTest < 0.5)).astype(int)

# Confusion matrix labelled 'actual<k>' on the rows and 'pred<k>' on the columns
cmTest = pd.DataFrame(confusion_matrix(yTest, predTest))
cmTest.columns = ['pred' + str(c) for c in cmTest.columns]
cmTest.index = ['actual' + str(r) for r in cmTest.index]
print(cmTest)

# Name the four values returned for the positive class (label 1)
prfsTest = precision_recall_fscore_support(yTest, predTest, average='binary', pos_label=1)
prfsTest = {"precision": prfsTest[0],
            "recall": prfsTest[1],
            "f1-score": prfsTest[2],
            "support": prfsTest[3]}
prfsTest

         pred0  pred1
actual0   3876    653
actual1    638    664


{'precision': 0.5041761579347,
 'recall': 0.5099846390168971,
 'f1-score': 0.5070637647957236,
 'support': None}