# Prediction

In [1]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *

from sklearn.linear_model import LogisticRegression # Logistic Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv
from sklearn.model_selection import cross_val_score # cross validation metrics
from sklearn.model_selection import cross_val_predict # cross validation metrics

## Framework
1. Model
2. Fit
3. Predict

In [2]:
# data
fashionBIG = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionBIG.csv")
fashionBIG.head()

Unnamed: 0,age,income,months_subbed,upgrade
0,22,55.89,14,0
1,32,86.03,57,0
2,38,49.22,37,1
3,14,92.71,51,1
4,33,94.06,37,0


In [3]:
predictors = ["age", "income", "months_subbed"]

X_train, X_test, y_train, y_test = train_test_split(fashionBIG[predictors], fashionBIG["upgrade"], test_size=0.2)

zscore = StandardScaler()
zscore.fit(X_train)
Xz_train = zscore.transform(X_train)
Xz_test = zscore.transform(X_test)

In [4]:
myLogit = LogisticRegression(penalty = "none") #create

In [5]:
myLogit.fit(Xz_train,y_train) #fit

LogisticRegression(penalty='none')

In [6]:
predictedVals = myLogit.predict(Xz_test) #predict

In [7]:
accuracy_score(y_test,predictedVals)

0.6

In [8]:
confusion_matrix(y_test,predictedVals)

array([[ 18,  65],
       [ 15, 102]])

## LR Coef interpretation


In [10]:
coef = pd.DataFrame({"Coefs": myLogit.coef_[0],
                    "Names": predictors})
coef = coef.append({"Coefs": myLogit.intercept_[0],
                    "Names": "intercept"}, ignore_index = True)

In [12]:
coef["Odds Coefs"] = np.exp(coef["Coefs"])
coef

Unnamed: 0,Coefs,Names,Odds Coefs
0,0.356522,age,1.428353
1,0.031958,income,1.032474
2,0.04229,months_subbed,1.043197
3,0.32222,intercept,1.380189


## LR different thresholds


In [13]:
fashionNEW = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionNEW.csv")
Xnew = fashionNEW.iloc[:,0:3]
Xnewz = zscore.transform(Xnew)

In [14]:
Ypred_prob = myLogit.predict_proba(Xnewz)
Ypred_prob[1:10]

array([[0.36729782, 0.63270218],
       [0.48881043, 0.51118957],
       [0.45697279, 0.54302721],
       [0.39829466, 0.60170534],
       [0.44782815, 0.55217185],
       [0.48332643, 0.51667357],
       [0.42381022, 0.57618978],
       [0.36269757, 0.63730243],
       [0.55493969, 0.44506031]])

In [15]:
Ypred_prob1 = Ypred_prob[:, 1]
Ypred_prob1[1:10]

array([0.63270218, 0.51118957, 0.54302721, 0.60170534, 0.55217185,
       0.51667357, 0.57618978, 0.63730243, 0.44506031])

In [16]:
thresh = 0.3

Ypred_prob1_thresh = (Ypred_prob1 > thresh) * 1
Ypred_prob1_thresh[1:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [18]:
accuracy_score(fashionNEW["upgrade"], Ypred_prob1_thresh)

0.603

## Regularization

In [21]:
# Default Regularization
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

myLogit = LogisticRegression() #create
myLogit.fit(Xz_train,y_train) #fit

print(myLogit.coef_)
print(myLogit.intercept_)

[[-0.05664551  0.07862888  0.01025403]]
[0.27227631]


In [22]:
myLogit2 = LogisticRegression(penalty = "none") #create
myLogit2.fit(Xz_train,y_train) #fit

print(myLogit2.coef_)
print(myLogit2.intercept_)

[[-0.05695513  0.07904637  0.01030759]]
[0.27228295]
