In [None]:
pip install -U altair

# Group project: pulsar

INTRODUCTION

DATA CLEANING & WRANGLING

To analyze a data set accurately, it's crucial to first observe and wrangle the data to prevent formatting issues or null values. This helps in choosing the best analysis method for the data. First, the required packages are imported from their libraries.

In [2]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn import set_config
from sklearn.model_selection import train_test_split # importing necessary libraries

In [3]:
# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays.
set_config(
    transform_output="pandas",
)

The data set is downloaded from the web and read using the pandas function read_csv. The first 5 rows of the data set are shown below:

In [4]:
# Direct-download link to the data file.
htru2 = 'https://drive.google.com/uc?export=download&id=1kLqmyQYnEt5M-stWnzz35p_9Zk2-FOZD'

# Read the file, labelling its nine columns with the placeholder names 1..9
# (they are given descriptive names in a later cell).
pulsar = pd.read_csv(
    htru2,
    names=list(range(1, 10)),
    index_col=False,
)

In [5]:
# Preview the first five rows (five is the default for head()).
pulsar.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


The data is organized but lacks clear variable names and meaningful 'type' column values. Thus, we used the rename function to give the columns meaningful names and the replace function to give the type values meaningful labels. The column names should correspond to the variables, and the 0s and 1s in the type column should correspond to 'others' and 'pulsar', respectively. (See Table 1)

In [6]:
# renaming column names to meaningful names
pulsar=pulsar.rename(columns={
    1:'mean_IP', # Mean of the integrated profile.
    2:'SD_IP', # Standard deviation of the integrated profile.
    3:'EK_IP', # Excess kurtosis of the integrated profile.
    4:'S_IP', # Skewness of the integrated profile.
    5:'mean_DM-SNR', # Mean of the DM-SNR curve.
    6:'SD_DM-SNR', # Standard deviation of the DM-SNR curve.
    7:'EK_DM-SNR',# Excess kurtosis of the DM-SNR curve.
    8:'S_DM-SNR', # Skewness of the DM-SNR curve.
    9:'type'}) # type of star (others or pulsar)
pulsar['type']=pulsar['type'].replace({
    0:'others',
    1:'pulsar'}) # replacing values of type to more meaningful values

In [7]:
# Preview the first five rows after renaming (five is the default for head()).
pulsar.head()

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,others
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,others
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,others
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,others
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,others


Table 1

The data frame is then split into training and testing sets; this allows for accuracy testing in the future. (See Table 2)

In [8]:
# Hold out 25% of the rows for testing. Stratifying on 'type' keeps the
# pulsar/others proportions the same in both parts.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split; without it every run produces a different train/test partition.
pulsar_train, pulsar_test = train_test_split(
    pulsar,
    train_size=0.75,
    stratify=pulsar["type"],
    random_state=1,
)

In [9]:
# Display the training rows with the original row labels moved into an 'index'
# column. The result is only shown, not assigned, so pulsar_train is unchanged.
pulsar_train.reset_index(drop=False)

Unnamed: 0,index,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
0,12702,125.406250,52.689604,0.329296,-0.362887,3.076087,19.298961,8.508906,85.847040,others
1,4821,18.296875,30.716028,5.776291,35.341879,55.082776,63.577743,1.253041,1.000621,pulsar
2,12635,143.617188,41.959825,0.023703,0.503648,0.734114,10.325126,17.803600,367.178236,others
3,15108,158.625000,47.559009,-0.577251,0.571487,0.435619,7.994775,27.968066,863.433229,others
4,14843,110.234375,51.274306,0.210045,-0.210050,1.862876,14.282374,12.166518,179.282124,others
...,...,...,...,...,...,...,...,...,...,...
13418,11307,96.960938,34.547613,0.396593,2.817613,3.075251,21.774560,8.096709,69.148710,others
13419,7281,153.546875,42.572156,-0.581813,0.922306,111.082776,82.954952,-0.549741,-1.601379,others
13420,9828,118.882812,55.162915,-0.024709,-0.471540,2.035117,16.267942,11.124529,141.338692,others
13421,8324,112.265625,46.046933,0.225142,0.554728,1.072742,11.513503,14.852714,262.919883,others


Table 2

To work with the data, we need to know its basic information, so we use the info() function to check some traits of the data set. (See List 1)


In [10]:
# Print a summary of the training set: index range, per-column non-null counts,
# dtypes and memory usage.
pulsar_train.info() # basic information about training data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13423 entries, 12702 to 5121
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mean_IP      13423 non-null  float64
 1   SD_IP        13423 non-null  float64
 2   EK_IP        13423 non-null  float64
 3   S_IP         13423 non-null  float64
 4   mean_DM-SNR  13423 non-null  float64
 5   SD_DM-SNR    13423 non-null  float64
 6   EK_DM-SNR    13423 non-null  float64
 7   S_DM-SNR     13423 non-null  float64
 8   type         13423 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.0+ MB


List 1

We see that all columns have dtype float64, except for the renamed "type" column, which has dtype object. The non-null counts are also the same for all columns.

To further check if there are any null values so we could drop them, the sum of all the null values in each column are calculated. (See List 2)

In [11]:
# Count the missing values in each column of the training set.
count_nan = (
    pulsar_train
    .isna()
    .sum()
)
count_nan

mean_IP        0
SD_IP          0
EK_IP          0
S_IP           0
mean_DM-SNR    0
SD_DM-SNR      0
EK_DM-SNR      0
S_DM-SNR       0
type           0
dtype: int64

List 2

There are no null values in any column, so there is no need to drop any rows.

Calculation of column-wise means for pulsars and other sources to identify any differences between them. (See Table 3)

In [12]:
# Average of every feature column, computed separately for each value of 'type'.
mean_value = (
    pulsar_train
    .groupby('type')
    .mean()
)
mean_value

Unnamed: 0_level_0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
others,116.520224,47.314795,0.212578,0.384103,8.802132,23.22566,8.867284,113.736084
pulsar,56.631452,38.79027,3.116759,15.432986,49.958603,56.268064,2.738327,17.282747


Table 3

The results suggest a significant difference in mean values of all variables for other source and pulsars, 
indicating distinctive characteristics between the two classes.

Compare the number of pulsar and other observations to check for unequal sample sizes, which would need to be corrected by resampling. (See List 3)

In [13]:
# Number of training rows in each class (pulsar vs. others).
count_obs = (
    pulsar_train
    .groupby('type')['type']
    .count()
)
count_obs

type
others    12194
pulsar     1229
Name: type, dtype: int64

List 3

Most observations in the dataset are of origins other than pulsars, which means pulsars are rare. Resampling of pulsar observations during model training is necessary.

Graph displaying the correlation between mean IP and Skewness of IP for pulsars and other stars. The graph shows clear separation between pulsars and other sources, with some overlap in the middle where KNN predictions can be challenging. (See Graph 1)

In [87]:
# Lift altair's default row limit; the training set is larger than the limit.
alt.data_transformers.disable_max_rows()

# Scatter plot of two integrated-profile features, coloured by class.
# A low opacity keeps dense regions readable where many points overlap.
pulsar_mean_plot = (
    alt.Chart(pulsar_train, title='Mean IP versus skewness of IP')
    .mark_point(opacity=0.2)
    .encode(
        x=alt.X('mean_IP', title='Mean of the integrated profile'),
        y=alt.Y('S_IP', title='Skewness of the integrated profile'),
        color=alt.Color('type', title='Type'),
    )
)
pulsar_mean_plot

Graph 1

DATA ANALYSIS

In this section, the training data is separated into two parts: one model is based on the values of the Integrated Profile, and another model is based on the DM-SNR curve. First, all the training data is upsampled to account for the rareness of pulsars and prevent the pulsar class from being under-represented. The table below shows the upsampled training data (See Table 4)

In [15]:
from sklearn.utils import resample

# Seed NumPy's global RNG for any later code that relies on it; the resample
# call below is seeded separately through its own random_state.
np.random.seed(1)

# Split the training set by class.
type_pulsar = pulsar_train[pulsar_train['type'] == 'pulsar']
type_others = pulsar_train[pulsar_train['type'] == 'others']

# Draw pulsar rows with replacement until there are as many as 'others' rows.
type_pulsar_upsampled = resample(
    type_pulsar,
    replace=True,
    n_samples=type_others.shape[0],
    random_state=1,
)

# NOTE(review): the duplicated pulsar rows created here are later fed to
# cross-validation, so copies of the same observation can land in both the
# training and validation folds. This tends to inflate the CV scores,
# especially for small k -- consider resampling inside each fold instead.
upsampled_pulsar = pd.concat((type_pulsar_upsampled, type_others))
upsampled_pulsar

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
7258,68.468750,40.750947,2.093171,5.353433,53.852843,32.922506,-0.731596,0.309300,pulsar
5665,20.406250,27.540152,6.625817,47.840198,29.763378,55.515026,2.237916,4.720125,pulsar
8272,56.640625,49.860075,1.749052,2.374494,156.981605,79.712682,-0.798699,-0.769955,pulsar
7672,34.640625,34.898230,3.799229,16.544542,88.933946,65.976393,0.237147,-0.338388,pulsar
3233,32.945312,28.915099,5.367896,36.069793,25.263378,60.782165,2.378192,4.444621,pulsar
...,...,...,...,...,...,...,...,...,...
11307,96.960938,34.547613,0.396593,2.817613,3.075251,21.774560,8.096709,69.148710,others
7281,153.546875,42.572156,-0.581813,0.922306,111.082776,82.954952,-0.549741,-1.601379,others
9828,118.882812,55.162915,-0.024709,-0.471540,2.035117,16.267942,11.124529,141.338692,others
8324,112.265625,46.046933,0.225142,0.554728,1.072742,11.513503,14.852714,262.919883,others


Now pulsars and other stars have equal amounts of samples.

In [16]:
# Number of rows in each class after upsampling.
count_obs = (
    upsampled_pulsar
    .groupby('type')['type']
    .count()
)
count_obs

type
others    12194
pulsar    12194
Name: type, dtype: int64

Next, the Integrated Profile columns are selected and standardized.

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Integrated-profile features plus the class label.
pulsar_training_IP = upsampled_pulsar[['mean_IP', 'SD_IP', 'EK_IP', 'S_IP', 'type']]

# Standardize the four IP features. 'type' is not listed in the transformer,
# so it does not appear in the transformed output.
IP_preprocessor = make_column_transformer(
    (StandardScaler(), ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP']),
    verbose_feature_names_out=False
)

# Learn the scaling parameters and apply them in a single step.
scaled_training_IP = IP_preprocessor.fit_transform(pulsar_training_IP)
scaled_training_IP

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP
7258,-0.461601,-0.272313,0.209894,-0.210737
5665,-1.697068,-1.856180,2.492074,3.202145
8272,-0.765648,0.819797,0.036631,-0.450030
7672,-1.331167,-0.974006,1.068892,0.688224
3233,-1.374746,-1.691335,1.858713,2.256651
...,...,...,...,...
11307,0.270802,-1.016042,-0.644330,-0.414435
7281,1.725367,-0.053965,-1.136955,-0.566681
9828,0.834313,1.455564,-0.856455,-0.678647
8324,0.664215,0.362632,-0.730655,-0.596208


Table 4

The same is done for DM-SNR values, the resulting table is shown as Table 5

In [18]:
# DM-SNR curve features plus the class label.
pulsar_training_DMSNR = upsampled_pulsar[
    ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR', 'S_DM-SNR', 'type']
]

# Standardize the four DM-SNR features. 'type' is not listed in the transformer,
# so it does not appear in the transformed output.
DMSNR_preprocessor = make_column_transformer(
    (StandardScaler(), ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR', 'S_DM-SNR']),
    verbose_feature_names_out=False,
)

# Learn the scaling parameters and apply them in a single step.
scaled_training_DMSNR = DMSNR_preprocessor.fit_transform(pulsar_training_DMSNR)
scaled_training_DMSNR

Unnamed: 0,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR
7258,0.582505,-0.282383,-1.357388,-0.674855
5665,0.007483,0.639377,-0.739608,-0.629189
8272,3.044219,1.626625,-1.371348,-0.686029
7672,1.419901,1.066193,-1.155850,-0.681561
3233,-0.099934,0.854273,-0.710424,-0.632042
...,...,...,...,...
11307,-0.629571,-0.737211,0.479262,0.037849
7281,1.948600,1.758907,-1.319555,-0.694637
9828,-0.654399,-0.961877,1.109173,0.785242
8324,-0.677371,-1.155855,1.884788,2.043990


Table 5

After the IP and DM-SNR values are standardized, the most appropriate k value for each model is located using cross-validation.

First, we need to get the grid of parameter values.

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# K-nearest-neighbours classifier with default settings; k is tuned afterwards.
IP_knn = KNeighborsClassifier()

# Chain the IP scaler and the classifier so both are fitted together.
IP_tune_pipe = make_pipeline(
    IP_preprocessor,
    IP_knn,
)

# List the pipeline's parameter names; these are the keys a parameter grid can use.
IP_tune_pipe.get_params()


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
                     verbose_feature_names_out=False)),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                  ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
                   verbose_feature_names_out=False),
 'kneighborsclassifier': KNeighborsClassifier(),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('standardscaler',
   StandardScaler(),
   ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
 'columntransformer__verbose': False,
 'columntransformer__verbose_feature_names_out': False,
 'columntransformer

the appropriate parameter value would be 5

Then we tune over the grid and acquire a table of cross-validation results, after which the results are plotted as a line plot. (See Graph 2)

In [73]:
# Candidate numbers of neighbours for the IP model: 1, 6, 11, ..., 46.
IP_parameter_grid = {"kneighborsclassifier__n_neighbors": range(1, 51, 5)}

In [74]:
from sklearn.model_selection import GridSearchCV

# 10-fold cross-validated search over the candidate k values for the IP model.
IP_tune_grid = GridSearchCV(
    estimator=IP_tune_pipe,
    param_grid=IP_parameter_grid,
    cv=10
)

# Fit on the *unscaled* IP features. IP_tune_pipe already contains the
# StandardScaler step, so passing scaled_training_IP would standardize the data
# a second time. With raw input the scaler is fitted inside each fold on that
# fold's training part only.
IP_accuracies_grid = pd.DataFrame(
    IP_tune_grid.fit(
        upsampled_pulsar[['mean_IP', 'SD_IP', 'EK_IP', 'S_IP']],
        upsampled_pulsar["type"]
    ).cv_results_
)

# Mean cross-validation accuracy for each candidate k.
cross_val_plot = alt.Chart(
    IP_accuracies_grid,
    title="IP model: cross-validation accuracy by number of neighbours"
).mark_line(point=True).encode(
    y=alt.Y("mean_test_score", title="Mean cross-validation accuracy").scale(zero=False),
    x=alt.X("param_kneighborsclassifier__n_neighbors", title="Number of neighbours (k)"),
)
cross_val_plot

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 19 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   mean_fit_time                            10 non-null     float64
 1   std_fit_time                             10 non-null     float64
 2   mean_score_time                          10 non-null     float64
 3   std_score_time                           10 non-null     float64
 4   param_kneighborsclassifier__n_neighbors  10 non-null     object 
 5   params                                   10 non-null     object 
 6   split0_test_score                        10 non-null     float64
 7   split1_test_score                        10 non-null     float64
 8   split2_test_score                        10 non-null     float64
 9   split3_test_score                        10 non-null     float64
 10  split4_test_score                        10 non-null 

Graph 2

From this graph, we see that the highest test score is when k=1; however, this would lead to overfitting and provide a less useful model. K=16 could be a useful value as it yields a high test score, the test scores for the k values before and after it do not vary much, and it does not require a significant amount of computational power.

The same process is repeated for DMSNR to find the optimal k value.

In [76]:
# K-nearest-neighbours classifier with default settings; k is tuned afterwards.
DMSNR_knn = KNeighborsClassifier()

# Chain the DM-SNR scaler and the classifier so both are fitted together.
DMSNR_tune_pipe = make_pipeline(
    DMSNR_preprocessor,
    DMSNR_knn,
)

# List the pipeline's parameter names; these are the keys a parameter grid can use.
DMSNR_tune_pipe.get_params()


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR',
                                     'S_DM-SNR'])],
                     verbose_feature_names_out=False)),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                  ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR',
                                   'S_DM-SNR'])],
                   verbose_feature_names_out=False),
 'kneighborsclassifier': KNeighborsClassifier(),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('standardscaler',
   StandardScaler(),
   ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR', 'S_DM-SN

In [77]:
# Candidate numbers of neighbours for the DM-SNR model: 1, 6, 11, ..., 46.
DMSNR_parameter_grid = {"kneighborsclassifier__n_neighbors": range(1, 51, 5)}

In [79]:
# Cross-validated search over the candidate k values for the DM-SNR model.
# NOTE(review): this uses 100 folds while the IP search uses 10. With ten
# candidate k values that is 1000 pipeline fits -- confirm this is intentional.
DMSNR_tune_grid = GridSearchCV(
    estimator=DMSNR_tune_pipe,
    param_grid=DMSNR_parameter_grid,
    cv=100
)

# Fit on the *unscaled* DM-SNR features. DMSNR_tune_pipe already contains the
# StandardScaler step, so passing scaled_training_DMSNR would standardize the
# data a second time. With raw input the scaler is fitted inside each fold on
# that fold's training part only.
DMSNR_accuracies_grid = pd.DataFrame(
    DMSNR_tune_grid.fit(
        upsampled_pulsar[['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR', 'S_DM-SNR']],
        upsampled_pulsar["type"]
    ).cv_results_
)

# Mean cross-validation accuracy for each candidate k.
cross_val_plot = alt.Chart(
    DMSNR_accuracies_grid,
    title="DM-SNR model: cross-validation accuracy by number of neighbours"
).mark_line(point=True).encode(
    y=alt.Y("mean_test_score", title="Mean cross-validation accuracy").scale(zero=False),
    x=alt.X("param_kneighborsclassifier__n_neighbors", title="Number of neighbours (k)"),
)
cross_val_plot

According to the graph, the highest score is again at k=1; however, as mentioned before, this would lead to overfitting. Therefore, other values of k are considered. k=6, k=11, and k=16 are considered, but the difference between the scores for nearby values is quite large. The final k value chosen is k=21: it has a moderate score of around 88%, and the difference between the nearby values is around 1%; furthermore, it would not require a significant amount of computational power.

Afterwards, the k values are used to fit the models for both IP and DM-SNR, as shown in the code below.

In [83]:
# Final IP model, using the k chosen from the cross-validation plot.
IP_knn = KNeighborsClassifier(n_neighbors=16)

# Use the *unscaled* IP features: the pipeline's first step is the scaler, and
# Pipeline.fit re-fits it on whatever X it receives. Fitting on already
# standardized data would teach the scaler a mean of ~0 and a standard
# deviation of ~1, so raw data passed to predict() later would go through
# essentially unscaled and be compared against scaled training points.
X = upsampled_pulsar[['mean_IP', 'SD_IP', 'EK_IP', 'S_IP']]
y = upsampled_pulsar["type"]
IP_fit = make_pipeline(IP_preprocessor, IP_knn).fit(X, y)
IP_fit

In [86]:
# Final DM-SNR model, using the k chosen from the cross-validation plot.
DMSNR_knn = KNeighborsClassifier(n_neighbors=21)

# Use the *unscaled* DM-SNR features: the pipeline's first step is the scaler,
# and Pipeline.fit re-fits it on whatever X it receives. Fitting on already
# standardized data would teach the scaler a mean of ~0 and a standard
# deviation of ~1, so raw data passed to predict() later would go through
# essentially unscaled and be compared against scaled training points.
X = upsampled_pulsar[['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR', 'S_DM-SNR']]
y = upsampled_pulsar["type"]
DMSNR_fit = make_pipeline(DMSNR_preprocessor, DMSNR_knn).fit(X, y)
DMSNR_fit