Dataset from https://archive.ics.uci.edu/dataset/372/htru2

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
htru2 = fetch_ucirepo(id=372) 
  
# data (as pandas dataframes)
# Features and target
X = htru2.data.features 
y = htru2.data.targets 
  
# metadata 
print(htru2.metadata) 
  
# variable information 
print(htru2.variables) 


{'uci_id': 372, 'name': 'HTRU2', 'repository_url': 'https://archive.ics.uci.edu/dataset/372/htru2', 'data_url': 'https://archive.ics.uci.edu/static/public/372/data.csv', 'abstract': 'Pulsar candidates collected during the HTRU survey. Pulsars are a type of star, of considerable scientific interest. Candidates must be classified in to pulsar and non-pulsar classes to aid discovery.', 'area': 'Physics and Chemistry', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 17898, 'num_features': 8, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Wed Apr 03 2024', 'dataset_doi': '10.24432/C5DK6R', 'creators': ['Robert Lyon'], 'intro_paper': {'ID': 460, 'type': 'NATIVE', 'title': 'Fifty years of pulsar candidate selection: from simple filters to a new principled real-time classification approach', 'a

In [4]:
type(X)

pandas.core.frame.DataFrame

In [5]:
X.head()

Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Model: Random Forest (good baseline for classification)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Predictions
y_pred = model.predict(X_test_scaled)

In [32]:
y_pred[:10]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [14]:
# Feature importance
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances}).sort_values(by='Importance', ascending=False)

In [29]:
# Display results
import ace_tools_open as tools; tools.display_dataframe_to_user(name="Feature Importance", dataframe=feature_importance_df)

accuracy, report, conf_matrix

Feature Importance


0
Loading ITables v2.4.4 from the internet...  (need help?)


(0.9804469273743017,
 '              precision    recall  f1-score   support\n\n           0       0.98      0.99      0.99      3259\n           1       0.93      0.84      0.89       321\n\n    accuracy                           0.98      3580\n   macro avg       0.96      0.92      0.94      3580\nweighted avg       0.98      0.98      0.98      3580\n',
 array([[3240,   19],
        [  51,  270]]))