In [17]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [3]:
x_train=pd.read_csv('../processed_data/x_train.csv',header=None)
y_train=pd.read_csv('../processed_data/y_train.csv',header=None)

x_test=pd.read_csv('../processed_data/x_test.csv',header=None)
y_test=pd.read_csv('../processed_data/y_test.csv',header=None)

In [25]:
print('Train percent anamolies:',str(np.round(y_train.mean(),4)*100)+'%')
print('Test percent anamolies:',str(np.round(y_test.mean(),4)*100)+'%')

Train percent anamolies: 2.93%
Test percent anamolies: 2.93%


In [4]:
x_train.apply(np.mean,axis=1)

0        0.005352
1        0.005352
2        0.005352
3        0.054950
4        0.017573
           ...   
402537   0.054950
402538   0.076839
402539   0.005352
402540   0.017573
402541   0.005352
Length: 402542, dtype: float64

In [5]:
x_train.apply(np.var,axis=1)

0        0.001403
1        0.001403
2        0.001403
3        0.125333
4        0.020988
           ...   
402537   0.125333
402538   0.335375
402539   0.001403
402540   0.020988
402541   0.001403
Length: 402542, dtype: float64

In [6]:
## Perform standardization

# Fit StandardScaler on training set
scaler=preprocessing.StandardScaler().fit(x_train)

# Apply standardization to train/test sets
x_train_scaled=scaler.transform(x_train)
x_test_scaled=scaler.transform(x_test)

# Confirm that resulting data has zero mean and unit variance
print('Train mean:',x_train_scaled.mean(axis=0).mean())
print('Test mean:',x_test_scaled.mean(axis=0).mean())

print('Train std:',x_train_scaled.std(axis=0).mean())
print('Test std:',x_test_scaled.std(axis=0).mean())

Train mean: 0.0075187969924811185
Test mean: -0.01743165085141406
Train std: 0.9924812030075186
Test std: 0.5746919839962779


In [7]:
# Reshape y datasets
y_train=y_train.values.reshape(len(y_train),)
y_test=y_test.values.reshape(len(y_test),)

In [8]:
# Perform PCA on scaled data
pca=PCA()
pca.fit(x_train_scaled)

PCA()

In [9]:
# Examine cumulative proportion of variance explained
print(pca.explained_variance_ratio_.cumsum())

[0.04706383 0.08986195 0.11778187 0.14299055 0.16662583 0.18935604
 0.21208337 0.2348107  0.25753803 0.28026536 0.30299269 0.32413372
 0.34437346 0.36425438 0.38408804 0.40343864 0.42225016 0.44072066
 0.45895319 0.47662308 0.49313377 0.50835979 0.52351332 0.53792873
 0.55177291 0.56471777 0.5775296  0.58957772 0.59990168 0.60998765
 0.61965265 0.62906325 0.63821154 0.64661659 0.65469883 0.66266717
 0.67056661 0.67843725 0.68610624 0.69369595 0.70127457 0.70885108
 0.71642695 0.72400276 0.73157855 0.73915434 0.74673013 0.75430591
 0.76188168 0.76945746 0.77703324 0.78460901 0.79218479 0.79976057
 0.80733634 0.81491212 0.8224879  0.83006367 0.83763945 0.84521523
 0.852791   0.86036678 0.86794255 0.87551833 0.88309411 0.89066988
 0.8982448  0.90581125 0.91332167 0.92070405 0.92793982 0.93505656
 0.9418     0.94687458 0.95172236 0.95592208 0.96004638 0.96379312
 0.96747906 0.97098501 0.97426185 0.97751505 0.98041122 0.98330485
 0.98605495 0.98876006 0.99121719 0.99364309 0.99538529 0.9970

In [10]:
num_components=np.argmax(pca.explained_variance_ratio_.cumsum()>=0.99)
print('# Components to explain 99% of variance:',num_components)

# Components to explain 99% of variance: 86


In [11]:
# Get component scores for train/test sets
x_train_scores=pca.transform(x_train_scaled)[:,:num_components]
x_test_scores=pca.transform(x_test_scaled)[:,:num_components]
print(x_train_scores.shape)
print(x_test_scores.shape)

(402542, 86)
(172519, 86)


In [13]:
# Perform logistic regression on component scores
logistic_regressor=LogisticRegression(max_iter=1000).fit(x_train_scores,y_train)

In [14]:
logistic_regressor.n_iter_

array([311], dtype=int32)

In [26]:
# Compute training metrics
print('Train logloss:',metrics.log_loss(y_train,logistic_regressor.predict(x_train_scores)))
print('Train accuracy:',metrics.accuracy_score(y_train,logistic_regressor.predict(x_train_scores)))
print('Train recall:',metrics.recall_score(y_train,logistic_regressor.predict(x_train_scores)))
print('Train precision:',metrics.precision_score(y_train,logistic_regressor.predict(x_train_scores)))
print('\n')
print('Test logloss:',metrics.log_loss(y_test,logistic_regressor.predict(x_test_scores)))
print('Test accuracy:',metrics.accuracy_score(y_test,logistic_regressor.predict(x_test_scores)))
print('Test recall:',metrics.recall_score(y_test,logistic_regressor.predict(x_test_scores)))
print('Test precision:',metrics.precision_score(y_test,logistic_regressor.predict(x_test_scores)))

Train logloss: 0.029687974203607456
Train accuracy: 0.9991404623616914
Train recall: 0.9960970643135924
Train precision: 0.9750830564784053


Test logloss: 0.009009137176562834
Test accuracy: 0.9997391591650775
Test recall: 0.9916864608076009
Test precision: 0.9994015559545183
