# Prediction

In [1]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *

import statsmodels.api as sm


from sklearn.linear_model import LogisticRegression # Logistic Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv
from sklearn.model_selection import cross_val_score # cross validation metrics
from sklearn.model_selection import cross_val_predict # cross validation metrics

## Framework
1. Model
2. Fit
3. Predict

In [2]:
# Load the main data set from the course repository and preview the first rows
fashionBIG = pd.read_csv(
    "https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionBIG.csv"
)
fashionBIG.head()

Unnamed: 0,age,income,months_subbed,upgrade
0,22,55.89,14,0
1,32,86.03,57,0
2,38,49.22,37,1
3,14,92.71,51,1
4,33,94.06,37,0


In [3]:
# Columns used as model inputs; "upgrade" is the outcome column
predictors = ["age", "income", "months_subbed"]

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split (and every metric computed from it) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    fashionBIG[predictors],
    fashionBIG["upgrade"],
    test_size = 0.2,
    random_state = 42,
)

# z-score: learn mean/sd from the training rows only, then apply that same
# transform to the test rows so no test information leaks into the scaler
zscore = StandardScaler()
Xz_train = zscore.fit_transform(X_train)
Xz_test = zscore.transform(X_test)

In [4]:
# create: an unfitted logistic regression model with scikit-learn's default settings
myLogit = LogisticRegression()

In [5]:
 # fit: estimate the coefficients from the standardized training data
myLogit.fit(X=Xz_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
 # predict: class labels for the standardized test rows
y_pred = myLogit.predict(X=Xz_test)

In [7]:
# accuracy: share of test rows whose predicted class matches the true class
accuracy_score(y_true=y_test, y_pred=y_pred)

0.63

In [8]:
# confusion matrix: rows are the true classes, columns are the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 23,  56],
       [ 18, 103]])

## Predict New Data

In [12]:
# Load the new observations from the course repository
fashionNEW = pd.read_csv(
    "https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionNEW.csv"
)

# Standardize the new predictors with the scaler fitted earlier (transform
# only, no refit), so they are on the scale the model was trained on
Xnew = fashionNEW[predictors]
Xnewz = zscore.transform(X=Xnew)

In [14]:
# Predicted class labels for the new, standardized observations
y_prednew = myLogit.predict(X=Xnewz)
# NOTE(review): the slice starts at position 1, so the first prediction
# (position 0) is not shown — confirm whether [:10] was intended
y_prednew[1:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 0])

In [17]:
# Accuracy of the fitted model on the new data
accuracy_score(y_true=fashionNEW["upgrade"], y_pred=y_prednew)

0.593

In [18]:
# Confusion matrix on the new data: rows are true classes, columns are predicted classes
confusion_matrix(y_true=fashionNEW["upgrade"], y_pred=y_prednew)

array([[ 88, 309],
       [ 98, 505]])

## Logistic Regression with Different Thresholds


In [20]:
# Predicted class probabilities for the new data: one row per observation,
# one column per class
y_pred_prob = myLogit.predict_proba(X=Xnewz)
# NOTE(review): the slice starts at position 1, so the first row
# (position 0) is not shown — confirm whether [:10] was intended
y_pred_prob[1:10]

array([[0.36783694, 0.63216306],
       [0.49695985, 0.50304015],
       [0.46718898, 0.53281102],
       [0.40031463, 0.59968537],
       [0.44647647, 0.55352353],
       [0.48817161, 0.51182839],
       [0.43368209, 0.56631791],
       [0.36554565, 0.63445435],
       [0.55823765, 0.44176235]])

In [21]:
# Keep only column 1 of the probability matrix.
# NOTE(review): columns follow myLogit.classes_; column 1 is presumably P(upgrade == 1) — confirm
y_pred_prob1 = y_pred_prob[:,1]
# NOTE(review): this displays the entire array; consider previewing a slice instead
y_pred_prob1

array([0.46451303, 0.63216306, 0.50304015, 0.53281102, 0.59968537,
       0.55352353, 0.51182839, 0.56631791, 0.63445435, 0.44176235,
       0.58714254, 0.58296231, 0.38065364, 0.52407398, 0.50634059,
       0.50361224, 0.40389766, 0.55943233, 0.71635563, 0.60150194,
       0.53426069, 0.57146995, 0.61125067, 0.59609572, 0.42571684,
       0.64305188, 0.57010735, 0.54201657, 0.6040882 , 0.53666118,
       0.53317216, 0.49043288, 0.46259887, 0.56908189, 0.46925144,
       0.40270868, 0.7202828 , 0.65318475, 0.58376237, 0.39431859,
       0.54750163, 0.68157393, 0.55807904, 0.71876599, 0.53096995,
       0.59360079, 0.57389025, 0.66094908, 0.62197431, 0.57444225,
       0.54296361, 0.60335541, 0.64634083, 0.63151076, 0.59315245,
       0.46422049, 0.54489489, 0.66925014, 0.61163021, 0.5424711 ,
       0.57760224, 0.67805627, 0.56122529, 0.62302153, 0.51819961,
       0.59610335, 0.65468447, 0.62898262, 0.71735318, 0.59389539,
       0.51417775, 0.67420983, 0.37787342, 0.46649462, 0.44950

In [22]:
# Custom decision cutoff: predict 1 only when the column-1 probability is at
# least this value, otherwise predict 0
thresh = 0.7

# The comparison yields booleans; cast them to 0/1 integers
y_pred_prob1_thresh = (y_pred_prob1 >= thresh).astype(int)
y_pred_prob1_thresh

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Logistic Regression with Cross-Validation

In [25]:
# K-fold setup: predictors and outcome taken from the full data set
X = fashionBIG[["age", "income", "months_subbed"]]
y = fashionBIG["upgrade"]


# create k-fold object (5 folds); the folds themselves are generated later,
# when kf.split(X) is iterated
kf = KFold(n_splits = 5)

lr = LogisticRegression() #create model

acc = [] #create empty list to store accuracy for each fold

In [26]:
# Loop through each fold: standardize on that fold's training rows, fit the
# model, and add the held-out accuracy to acc.

acc = [] # reset here so re-running this cell does not append duplicate folds

for train_indices, test_indices in kf.split(X):
    # kf.split yields integer *positions*, so select rows with .iloc on both
    # X and y (plain y[...] would look the positions up as index labels and
    # only works by accident when the index is the default 0..n-1 range)
    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]

    # standardize: fit the scaler on this fold's training rows only
    zscore = StandardScaler()
    Xz_train = zscore.fit_transform(X_train)
    Xz_test = zscore.transform(X_test)

    # model
    model = lr.fit(Xz_train, y_train)

    # record accuracy on the held-out rows of this fold
    acc.append(accuracy_score(y_test, model.predict(Xz_test)))


#print overall acc
print(acc)
print(np.mean(acc))

[0.58, 0.605, 0.54, 0.565, 0.645]
0.587


## Regularization

In [27]:
# Columns used as model inputs; "upgrade" is the outcome column
predictors = ["age", "income", "months_subbed"]

# Fresh 80/20 split for this section. random_state pins the shuffle so the
# coefficients reported below are the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    fashionBIG[predictors],
    fashionBIG["upgrade"],
    test_size = 0.2,
    random_state = 42,
)

# z-score: learn mean/sd from the training rows only, then apply that same
# transform to the test rows
zscore = StandardScaler()
Xz_train = zscore.fit_transform(X_train)
Xz_test = zscore.transform(X_test)

In [31]:
# Default Regularization
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# LogisticRegression() applies an L2 penalty by default (penalty='l2', C=1.0),
# which shrinks the coefficients. Turn the penalty off here to get plain
# maximum-likelihood estimates.

#create
# penalty=None is the supported spelling (scikit-learn >= 1.2); the legacy
# string "none" was deprecated in 1.2 and removed in 1.4.
myLogit = LogisticRegression(penalty = None)
#fit
myLogit.fit(Xz_train,y_train)

print(myLogit.coef_)
print(myLogit.intercept_)

[[0.3715105  0.00347919 0.06299273]]
[0.28630292]


In [32]:
# Add a constant column to the standardized training predictors so the
# statsmodels fit includes an intercept (reported as `const` in the summary)
Xz_train2 = sm.add_constant(Xz_train)

In [33]:
# Fit the logistic model with statsmodels on the same training data and show
# the summary table (coefficients, standard errors, p-values, intervals)
model = sm.Logit(endog=y_train, exog=Xz_train2)
output = model.fit()
output.summary()

Optimization terminated successfully.
         Current function value: 0.667155
         Iterations 4


0,1,2,3
Dep. Variable:,upgrade,No. Observations:,800.0
Model:,Logit,Df Residuals:,796.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 09 Mar 2020",Pseudo R-squ.:,0.02415
Time:,16:58:42,Log-Likelihood:,-533.72
converged:,True,LL-Null:,-546.93
Covariance Type:,nonrobust,LLR p-value:,7.808e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2863,0.073,3.940,0.000,0.144,0.429
x1,0.3715,0.075,4.938,0.000,0.224,0.519
x2,0.0035,0.073,0.048,0.962,-0.139,0.146
x3,0.0630,0.073,0.867,0.386,-0.079,0.205
