# Prediction

In [38]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *

import statsmodels.api as sm


from sklearn.linear_model import LogisticRegression # Logistic Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv
from sklearn.model_selection import cross_val_score # cross validation metrics
from sklearn.model_selection import cross_val_predict # cross validation metrics

## Framework
1. Model
2. Fit
3. Predict

In [39]:
# data
fashionBIG = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionBIG.csv")
fashionBIG.head()

Unnamed: 0,age,income,months_subbed,upgrade
0,22,55.89,14,0
1,32,86.03,57,0
2,38,49.22,37,1
3,14,92.71,51,1
4,33,94.06,37,0


In [40]:
predictors = ["age", "income", "months_subbed"]

X_train, X_test, y_train, y_test = train_test_split(fashionBIG[predictors], fashionBIG["upgrade"], test_size=0.2)
X_train.head()

zscore = StandardScaler()
zscore.fit(X_train)
Xz_train = zscore.transform(X_train)
Xz_test = zscore.transform(X_test)

In [41]:
myLogit = LogisticRegression(penalty = "none") #create

In [42]:
myLogit.fit(Xz_train,y_train) #fit

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
predictedVals = myLogit.predict(Xz_test) #predict

In [44]:
accuracy_score(y_test,predictedVals)

0.635

In [45]:
confusion_matrix(y_test,predictedVals)

array([[ 21,  54],
       [ 19, 106]])

## Predict New Data

In [46]:
fashionNEW = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionNEW.csv")

Xnew = fashionNEW.iloc[:,0:3]
Xnewz = zscore.transform(Xnew)

In [47]:
Ypred = myLogit.predict(Xnewz)

In [48]:
accuracy_score(fashionNEW["upgrade"], Ypred)

0.591

In [49]:
confusion_matrix(fashionNEW["upgrade"], Ypred)

array([[102, 295],
       [114, 489]])

# LR with Cross Validation

In [50]:
# Kfold

X = fashionBIG[["age","income","months_subbed"]]
y = fashionBIG["upgrade"]

# create k-fold object
kf = KFold(n_splits = 5)
kf.split(X)

lr = LogisticRegression() #create model

acc = [] #create empty list to store accuracy for each fold

In [51]:
# Use a for loop to loop through each fold and train a model, then add the accuracy to acc.

for train_indices, test_indices in kf.split(X):
    # Get your train/test for this fold
    X_train = X.iloc[train_indices]
    X_test  = X.iloc[test_indices]
    y_train = y[train_indices]
    y_test  = y[test_indices]
    
    # model
    model = lr.fit(X_train, y_train)
    # record accuracy
    acc.append(accuracy_score(y_test, model.predict(X_test)))
    
#print overall acc
print(acc)
np.mean(acc)

[0.58, 0.605, 0.54, 0.565, 0.645]


0.587

In [61]:
# Default Regularization
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

myLogit = LogisticRegression() #create
# myLogit = LogisticRegression(penalty = "none") #create
myLogit.fit(Xz_train,y_train) #fit

print(myLogit.coef_)
print(myLogit.intercept_)

[[-0.01609591  0.15232243  0.04379799]]
[0.27331459]


In [62]:
Xz_train2 = sm.add_constant(Xz_train)

In [54]:
model = sm.Logit(y_train, Xz_train2)
output = model.fit()
output.summary()

Optimization terminated successfully.
         Current function value: 0.680951
         Iterations 4


0,1,2,3
Dep. Variable:,upgrade,No. Observations:,800.0
Model:,Logit,Df Residuals:,796.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 08 Mar 2020",Pseudo R-squ.:,0.004467
Time:,18:11:14,Log-Likelihood:,-544.76
converged:,True,LL-Null:,-547.21
Covariance Type:,nonrobust,LLR p-value:,0.1801

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2733,0.072,3.818,0.000,0.133,0.414
x1,-0.0163,0.072,-0.226,0.821,-0.157,0.125
x2,0.1531,0.072,2.120,0.034,0.012,0.295
x3,0.0440,0.072,0.615,0.539,-0.096,0.184
