In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

### Load the data and rename columns

In [2]:
# The CSV has no header row, so pandas labels the columns 0..8 by position.
# These are the descriptive names assigned to those positions, in file order.
column_names = [
    "mean_int_pf", "std_pf", "ex_kurt_pf", "skew_pf",
    "mean_dm", "std_dm", "kurt_dm", "skew_dm",
    "class",
]

# Plain relative path: the previous ".\HTRU_2.csv" literal contained the
# invalid escape sequence "\H" and only resolved on Windows.
df = (
    pd.read_csv("HTRU_2.csv", header=None)
    .rename(columns=dict(enumerate(column_names)))
)

# Use 1-based row labels instead of the default 0-based ones.
df.index = df.index + 1

# Preview only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,mean_int_pf,std_pf,ex_kurt_pf,skew_pf,mean_dm,std_dm,kurt_dm,skew_dm,class
1,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
2,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
3,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
4,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
5,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
6,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004,0
7,119.484375,48.765059,0.031460,-0.112168,0.999164,9.279612,19.206230,479.756567,0
8,130.382812,39.844056,-0.158323,0.389540,1.220736,14.378941,13.539456,198.236457,0
9,107.250000,52.627078,0.452688,0.170347,2.331940,14.486853,9.001004,107.972506,0
10,107.257812,39.496488,0.465882,1.162877,4.079431,24.980418,7.397080,57.784738,0


In [3]:
# All column labels except the final one ('class', the label) are model inputs.
all_column_labels = df.columns.values
feature_names = all_column_labels[:-1]
print(feature_names)

['mean_int_pf' 'std_pf' 'ex_kurt_pf' 'skew_pf' 'mean_dm' 'std_dm'
 'kurt_dm' 'skew_dm']


In [4]:
# Report the frame dimensions; the column count includes the label column.
n_rows, n_cols = df.shape
print('Dataset has %d rows and %d columns including features and labels' % (n_rows, n_cols))

Dataset has 17898 rows and 9 columns including features and labels


### Separating features and target

In [5]:
# Label vector: the 'class' column.
targets = df['class']
# Input matrix: every remaining column.
features = df.drop(columns='class')

### Splitting dataset into train & test

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.30,
    random_state=66,
)

### Using Random Forest Classifier

In [7]:
# Random forest with only the non-default hyper-parameters spelled out.
# Two arguments from the earlier call were changed for compatibility with
# current scikit-learn releases:
#   * min_impurity_split was deprecated and later removed, so it is no longer passed;
#   * max_features='auto' was deprecated; for classifiers it meant 'sqrt',
#     which is used here explicitly and is accepted by old and new versions.
# Every other dropped keyword was being set to its library default.
RFC = RandomForestClassifier(
    n_estimators=70,
    criterion='entropy',
    max_depth=90,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=False,      # each tree is grown on the full training set
    n_jobs=-1,            # use all available cores
    random_state=1007,    # fixed seed so the fitted forest is repeatable
)
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=90,
                       min_samples_leaf=2, min_samples_split=10,
                       n_estimators=70, n_jobs=-1, random_state=1007)

In [8]:
# Hard class predictions for the held-out rows; reused by the evaluation cell below.
rfc_predict = RFC.predict(X=X_test)

### Assessing accuracy

In [9]:
# 10-fold cross-validated ROC AUC, computed on the full feature/target set
# (each fold refits a clone of RFC, independent of the fit on X_train).
rfc_cv_score = cross_val_score(RFC, features, targets, cv=10, scoring='roc_auc')

# Hold-out metrics are based on the predictions made for X_test.
holdout_report = classification_report(
    y_test, rfc_predict, target_names=['Non Pulsar', 'Pulsar']
)

# Each section is printed as: heading, content, then a blank-line separator.
report_sections = (
    ("=== Confusion Matrix ===", confusion_matrix(y_test, rfc_predict)),
    ("=== Classification Report ===", holdout_report),
    ("=== All AUC Scores ===", rfc_cv_score),
)
for heading, content in report_sections:
    print(heading)
    print(content)
    print('\n')

print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[4849   33]
 [  75  413]]


=== Classification Report ===
              precision    recall  f1-score   support

  Non Pulsar       0.98      0.99      0.99      4882
      Pulsar       0.93      0.85      0.88       488

    accuracy                           0.98      5370
   macro avg       0.96      0.92      0.94      5370
weighted avg       0.98      0.98      0.98      5370



=== All AUC Scores ===
[0.92710677 0.96836468 0.97279723 0.97067283 0.97869416 0.96269463
 0.97983605 0.97172847 0.98366083 0.97055347]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.9686109123408364


### Accuracy 

In [10]:
# Accuracy of the fitted forest on the held-out split.
RFC.score(X=X_test, y=y_test)

0.9798882681564246

### Saving the trained model to disk with joblib

In [11]:
# Persist the fitted forest next to the notebook; dump() returns the list of
# files it wrote, which is shown as the cell output.
from joblib import dump, load
dump(RFC, filename='./pulsar.joblib')

['./pulsar.joblib']

### Converting the scikit-learn model into ONNX (Open Neural Network Exchange) format

In [13]:
!pip install skl2onnx

Collecting skl2onnx
  Using cached https://files.pythonhosted.org/packages/8a/41/47cb3c420d3a1d0a1ad38ef636ac2d4929c938c2f209582bcf3b33440b1a/skl2onnx-1.7.0-py2.py3-none-any.whl
Collecting onnxconverter-common>=1.5.1 (from skl2onnx)
  Using cached https://files.pythonhosted.org/packages/fe/7a/7e30c643cd7d2ad87689188ef34ce93e657bd14da3605f87bcdbc19cd5b1/onnxconverter_common-1.7.0-py2.py3-none-any.whl
Collecting onnx>=1.2.1 (from skl2onnx)
  Downloading https://files.pythonhosted.org/packages/c2/97/ddbddd9ffcaa8930add923378dc8f2abec693cd9ade5d01f2976a6155c48/onnx-1.7.0-cp37-cp37m-win_amd64.whl (6.8MB)
Collecting typing-extensions>=3.6.2.1 (from onnx>=1.2.1->skl2onnx)
  Using cached https://files.pythonhosted.org/packages/0c/0e/3f026d0645d699e7320b59952146d56ad7c374e9cd72cd16e7c74e657a0f/typing_extensions-3.7.4.2-py3-none-any.whl
Installing collected packages: typing-extensions, onnx, onnxconverter-common, skl2onnx
Successfully installed onnx-1.7.0 onnxconverter-common-1.7.0 skl2onnx-1.7.

In [14]:
# Convert the fitted scikit-learn model to ONNX with skl2onnx and save it.
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare one float input tensor: a dynamic batch dimension (None) and one
# column per training feature, taken from the data instead of hard-coding 8.
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

onx = convert_sklearn(RFC, initial_types=initial_type)

# Write the serialized model in binary mode.
with open("pulsar.onnx", "wb") as f:
    f.write(onx.SerializeToString())

### ONNXRuntime for Inferencing

In [None]:
# Open the exported model in ONNX Runtime and look up the tensor names
# that the prediction cell passes to sess.run().
import onnxruntime as rt
import numpy

sess = rt.InferenceSession("pulsar.onnx")
first_input, first_output = sess.get_inputs()[0], sess.get_outputs()[0]
input_name = first_input.name
label_name = first_output.name

In [None]:
# Cast the held-out features to float32 before feeding them to the session.
onnx_batch = numpy.array(X_test).astype(numpy.float32)
# Request only the first model output; run() returns a list, so take element 0.
pred_onx = sess.run([label_name], {input_name: onnx_batch})[0]
# Show the prediction for the first held-out row.
pred_onx[0]