In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
## path to the HTRU2 CSV file that the notebook loads
pulsar_dataset = (
    '/home/d.dasarathan/ds5500/projects/datasets/HTRU2/HTRU_2.csv'
)

In [None]:
## load the CSV; header=None keeps the first row as data instead of column names
df_main = pd.read_csv(filepath_or_buffer=pulsar_dataset, header=None)

In [None]:
## preview the first five rows of the freshly loaded frame
df_main.head(n=5)

In [None]:
## give the nine positional columns descriptive names (the target 'label' is last)
col_names = [
    'mean_int_pf', 'std_pf', 'ex_kurt_pf', 'skew_pf',
    'mean_dm', 'std_dm', 'kurt_dm', 'skew_dm',
    'label',
]
df_main.columns = col_names
df_main.head(n=5)

In [None]:
## number of rows per class label
df_main['label'].value_counts()

In [None]:
## share of each class label as a percentage of all rows
## (uses the built-in float: the np.float alias was removed in NumPy 1.24)
df_main['label'].value_counts() / float(len(df_main)) * 100

In [None]:
## features: every column except the target
X = df_main.drop(columns=['label'])
## target: the class label column
y = df_main['label']

In [None]:
## first five rows of the feature matrix
X.head(n=5)

In [None]:
## first five entries of the target vector
y.head(n=5)

In [None]:
## split X and y into train and test sets
## 80%-20% split, seeded for reproducibility
## stratify=y keeps the class proportions of the full dataset in both
## partitions, which matters when one class is much rarer than the other
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
## keep the feature column labels so they can be re-attached after scaling
## (rebinds col_names, which previously also listed 'label')
col_names = X_train.columns

----------------------------------------------------------------------------------------------------

#### Scaling Features

In [None]:
## min-max scaler mapping each feature to the [0, 1] range (the default range, spelled out)
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
## learn the per-feature min/max from the training data only, then scale it
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
## scale the test data with the statistics learned from the training data
X_test = scaler.transform(X=X_test)

In [None]:
## wrap the scaled training array in a DataFrame with the feature names
## col_names is passed directly: wrapping it in a list would build a
## one-level MultiIndex instead of a flat column index
X_train = pd.DataFrame(X_train, columns=col_names)

In [None]:
## wrap the scaled test array in a DataFrame with the feature names
## col_names is passed directly: wrapping it in a list would build a
## one-level MultiIndex instead of a flat column index
X_test = pd.DataFrame(X_test, columns=col_names)

------------------------------------------------------------------------------------

#### Random Forest Classifier

In [None]:
## random forest with 100 trees
## random_state fixes the forest's internal randomness so the fitted model
## and the metrics derived from it are the same on every run
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
## train the forest on the scaled training features and their labels
rfc.fit(X=X_train, y=y_train)

In [None]:
## predictions on the data the model was trained on
y_pred_rfc_train = rfc.predict(X=X_train)

In [None]:
## predictions on the held-out test data
y_pred_rfc_test = rfc.predict(X=X_test)

In [None]:
## accuracy on the training data, for comparison with the test accuracy
print('Training-set accuracy score: {0:0.4f}'.format(
    accuracy_score(y_true=y_train, y_pred=y_pred_rfc_train)))

In [None]:
## accuracy on the held-out test data
print("Model accuracy : ",
      accuracy_score(y_true=y_test, y_pred=y_pred_rfc_test))

In [None]:
## Confusion Matrix
## scikit-learn layout: rows are the actual class, columns the predicted class,
## both in label order [0, 1]; class 1 is treated as the positive class:
##     cm[0,0] = TN   cm[0,1] = FP
##     cm[1,0] = FN   cm[1,1] = TP
## labels is passed explicitly so the matrix is always 2x2 in that order

cm = confusion_matrix(y_test, y_pred_rfc_test, labels=[0, 1])

print('Confusion matrix\n\n', cm)
print('True Positives(TP) = ', cm[1,1])
print('True Negatives(TN) = ', cm[0,0])
print('False Positives(FP) = ', cm[0,1])
print('False Negatives(FN) = ', cm[1,0])

In [None]:
## Confusion matrix heatmap
## scikit-learn puts the actual class on the rows and the predicted class on
## the columns, in label order 0 then 1, so the axis labels follow that layout

cm_matrix = pd.DataFrame(data=cm,
                         index=['Actual Negative:0', 'Actual Positive:1'],
                         columns=['Predict Negative:0', 'Predict Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
## per-class precision, recall, F1 and support on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_rfc_test))

In [None]:
## unpack the confusion matrix with class 1 as the positive class
## scikit-learn layout: rows = actual, columns = predicted, label order [0, 1]
TP = cm[1,1]  # actual 1, predicted 1
TN = cm[0,0]  # actual 0, predicted 0
FP = cm[0,1]  # actual 0, predicted 1
FN = cm[1,0]  # actual 1, predicted 0

In [None]:
## accuracy: correctly classified samples over all samples
classification_accuracy = float(TP + TN) / (TP + TN + FP + FN)
print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

In [None]:
## error rate: misclassified samples over all samples
classification_error = float(FP + FN) / (TP + TN + FP + FN)
print('Classification error : {0:0.4f}'.format(classification_error))

In [None]:
## precision: TP over everything predicted positive (TP + FP)
precision = float(TP) / (TP + FP)
print('Precision : {0:0.4f}'.format(precision))

In [None]:
## recall (sensitivity): TP over everything actually positive (TP + FN)
recall = float(TP) / (TP + FN)
print('Recall or Sensitivity : {0:0.4f}'.format(recall))

In [None]:
## specificity: TN over everything actually negative (TN + FP)
specificity = float(TN) / (TN + FP)
print('Specificity : {0:0.4f}'.format(specificity))

In [None]:
## F-1 score: harmonic mean of precision and recall
f1 = (2 * precision * recall) / (precision + recall)
print('F-1 score : {0:0.4f}'.format(f1))

-------------------------------------------------------------------------------------------------------------