In [1]:
import pandas as PD
import numpy as NP
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import Perceptron

Assumptions made:
1. Company is able to collect the same data from their current and future customers
  a. Implication: No need for evaluation set, only training and test sets
  b. Implication: Test set needs to be extensive and test accuracy should be high

In [2]:
data = PD.DataFrame = PD.read_csv("data/cleaned_final.csv")

In [3]:
shuffled = data.sample(frac=1) # sample 100% of the data
shuffled

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,HOUR_APPR_PROCESS_START,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
178354,0,0.000000,0.001508,0.034792,0.129705,0.333352,0.041837,0.762930,0.661526,0.565217,...,0,0,0,0,0,0,0,0,0,0
89211,0,0.105263,0.000546,0.259259,0.491595,0.765136,0.037207,0.758431,0.452967,0.304348,...,0,0,0,0,0,0,0,0,0,0
196611,0,0.000000,0.000739,0.068462,0.260475,0.087076,1.000000,0.497446,0.350980,0.608696,...,0,0,0,0,0,0,0,0,0,0
195397,0,0.000000,0.001700,0.046016,0.256321,0.846448,0.044810,0.803137,0.579825,0.521739,...,0,0,0,0,0,0,0,0,0,0
7116,0,0.000000,0.001700,0.040404,0.129331,0.444679,0.038073,0.589859,0.871474,0.478261,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207909,0,0.052632,0.000623,0.057239,0.254009,0.507732,0.034145,0.880188,0.434487,0.434783,...,0,0,0,0,0,0,0,0,0,0
85477,0,0.000000,0.000585,0.060606,0.392880,0.304422,0.041187,0.722884,0.528415,0.347826,...,0,0,0,0,0,0,0,0,0,0
102335,0,0.000000,0.001123,0.049383,0.344429,0.638973,0.038561,0.856639,0.870363,0.347826,...,0,0,0,0,0,0,0,0,0,0
12483,1,0.000000,0.001700,0.102132,0.136490,0.815407,0.045352,0.822390,0.821175,0.391304,...,0,0,0,0,0,0,0,0,0,0


In [4]:
shuffled_index_reset = shuffled.reset_index(drop=True)
total = len(shuffled_index_reset)
testPercentage = int(total * 0.2)
test = shuffled_index_reset.iloc[:testPercentage].reset_index(drop=True)
train = shuffled_index_reset.iloc[testPercentage:].reset_index(drop=True)
testX = test.iloc[:, 1:]
trainX = train.iloc[:, 1:]
testY = test.iloc[:,:1]
trainY = train.iloc[:,:1]

In [5]:
print(trainX.shape, trainY.shape, testX.shape, testY.shape)

(195424, 156) (195424, 1) (48856, 156) (48856, 1)


6 Classifiers will be tried:
1. Perceptron
2. Logistic Regression
3. K nearest Neighbours
4. Naiive Bayes Classifier
5. Support Vector Machine
6. Random Forest

In [34]:
def Perceptron_Trainer(x, y):
    penaltys = ["l2", "l1", "elasticnet", None] # default = None
    alphas = [10**i for i in range(-1, 3)] # default =  0.0001 # only if penalty is not none
    l1_ratios = [i/10 for i in range(1, 11, 1)] # default = 0.15, =1 is penalty L1, =0 is penalty L2, only use for elasticnet
    eta0s = [i/10 for i in range(1, 11, 1)] #default = 1
    tol = 1e-3 
    shuffle = True
    means:NP.ndarray = NP.ndarray(shape=(len(penaltys), len(alphas), len(l1_ratios), len(eta0s)))
    sds:NP.ndarray = NP.ndarray(shape=(len(penaltys), len(alphas), len(l1_ratios), len(eta0s)))
    for penalty in ["elasticnet"]:
        for alpha in alphas:
            for l1_ratio in l1_ratios:
                for eta0 in eta0s:
                    model = Perceptron(penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, tol=tol, shuffle=shuffle, 
                                                eta0=eta0,
                                                early_stopping=True, validation_fraction=0.2, 
                                                n_iter_no_change=10, max_iter=1000, verbose=1)
                    # define model evaluation method
                    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
                    # evaluate model
                    scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1)
                    score = (NP.mean(scores), NP.std(scores))
                    # summarize result
                    print(penalty, alpha, l1_ratio, eta0)
                    print('Mean Accuracy: %.3f (%.3f)' % score)
                    means[penaltys.index(penalty), alphas.index(alpha), 
                            l1_ratios.index(l1_ratio), eta0s.index(eta0)] = score[0]
                    sds[penaltys.index(penalty), alphas.index(alpha), 
                            l1_ratios.index(l1_ratio), eta0s.index(eta0)] = score[1]

    return means, sds

In [33]:
results = Perceptron_Trainer(trainX, trainY)

Mean Accuracy: 0.922 (0.000)


In [71]:
# tuned hyperparams
model = Perceptron(penalty="l1", alpha=0.01, tol=1e-3, shuffle=True, 
                    eta0=0.3, l1_ratio=0.5,
                    early_stopping=True, validation_fraction=0.2, 
                    n_iter_no_change=10, max_iter=1000, verbose=0)
model.fit(trainX, trainY)

  y = column_or_1d(y, warn=True)


-- Epoch 1
Norm: 200.82, NNZs: 1, Bias: -0.300000, T: 156339, Avg. loss: 0.019663
Total training time: 0.09 seconds.
-- Epoch 2
Norm: 284.48, NNZs: 1, Bias: 0.000000, T: 312678, Avg. loss: 0.019150
Total training time: 0.26 seconds.
-- Epoch 3
Norm: 348.45, NNZs: 1, Bias: 0.300000, T: 469017, Avg. loss: 0.019202
Total training time: 0.42 seconds.
-- Epoch 4
Norm: 402.28, NNZs: 1, Bias: -0.300000, T: 625356, Avg. loss: 0.019401
Total training time: 0.58 seconds.
-- Epoch 5
Norm: 449.77, NNZs: 0, Bias: -0.300000, T: 781695, Avg. loss: 0.019338
Total training time: 0.74 seconds.
-- Epoch 6
Norm: 492.96, NNZs: 0, Bias: -0.300000, T: 938034, Avg. loss: 0.019103
Total training time: 0.90 seconds.
-- Epoch 7
Norm: 532.50, NNZs: 0, Bias: -0.300000, T: 1094373, Avg. loss: 0.019148
Total training time: 1.06 seconds.
-- Epoch 8
Norm: 569.47, NNZs: 1, Bias: 0.000000, T: 1250712, Avg. loss: 0.019086
Total training time: 1.23 seconds.
-- Epoch 9
Norm: 604.03, NNZs: 1, Bias: -0.300000, T: 1407051, Av

Perceptron(alpha=0.01, early_stopping=True, eta0=0.3, l1_ratio=0.5,
           n_iter_no_change=10, penalty='l1', validation_fraction=0.2,
           verbose=1)

In [74]:
model.score(testX, testY) # accuracy just to ensure that it is correct

0.9237350581300148

In [76]:
import pickle
filename = 'perceptron_learner.sav'
pickle.dump(model, open(filename, 'wb')) # save

# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)