In [1]:
pip install -U altair

Note: you may need to restart the kernel to use updated packages.


# Group project: pulsar

INTRODUCTION

DATA CLEANING & WRANGLING

To analyze a data set accurately, it's crucial to first observe and wrangle the data to prevent formatting issues or null values. This help choose the best analysis method for the data. First the required packages are imported from library to help perform actions.

In [2]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn import set_config
from sklearn.model_selection import train_test_split # importing necessary libraries

In [3]:
set_config(transform_output="pandas") # set output as dataframes instead of arrays

The data set is downloaded from the web, the files are read using the pandas function read_csv. The first 5 values of the dataset is shown below:

In [4]:
htru2='https://raw.githubusercontent.com/dorni12/DSCI100_GroupProject/main/HTRU_2.csv'
pulsar= pd.read_csv(htru2,names=[1,2,3,4,5,6,7,8,9],index_col=False) # reading dataset from data file

In [5]:
pulsar.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


The data is organized but lacks clear variable names and meaningful 'type' column values. Thus we used the rename function to change the column names and type values to meaningful names. The column name should correspond the variables, 0s and 1s in the type column should correspond to 'others' or 'pulsar'. (See Table 1)

In [6]:
# renaming column names to meaningful names
pulsar=pulsar.rename(columns={
    1:'mean_IP', # Mean of the integrated profile.
    2:'SD_IP', # Standard deviation of the integrated profile.
    3:'EK_IP', # Excess kurtosis of the integrated profile.
    4:'S_IP', # Skewness of the integrated profile.
    5:'mean_DM-SNR', # Mean of the DM-SNR curve.
    6:'SD_DM-SNR', # Standard deviation of the DM-SNR curve.
    7:'EK_DM-SNR',# Excess kurtosis of the DM-SNR curve.
    8:'S_DM-SNR', # Skewness of the DM-SNR curve.
    9:'type'}) # type of star (others or pulsar)
pulsar['type']=pulsar['type'].replace({
    0:'others',
    1:'pulsar'}) # replacing values of type to more meaningful values

In [7]:
pulsar.head(5)

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,others
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,others
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,others
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,others
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,others


Table 1

The data frame is then split into training and testing sets, this allows for accuracy testing in the future. (See Table 2)

In [8]:
pulsar_train, pulsar_test = train_test_split(
    pulsar, train_size=0.75, stratify=pulsar["type"]
) # splitting testing and training data

In [9]:
pulsar_train.reset_index()

Unnamed: 0,index,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
0,7245,92.468750,34.845404,0.743092,3.092163,27.561873,71.845142,2.236180,3.041461,others
1,4732,112.687500,45.733561,0.745413,0.978695,9.751672,39.064663,4.100970,16.000180,others
2,13804,124.531250,52.936864,0.044084,-0.197515,13.196488,46.203371,3.529246,11.307275,others
3,9040,130.070312,48.618924,-0.043825,-0.069758,2.786789,15.527800,8.257814,88.883480,others
4,12334,123.828125,48.722285,-0.065228,-0.440342,6.491639,29.339851,4.820628,23.606183,others
...,...,...,...,...,...,...,...,...,...,...
13418,10985,12.476562,27.754494,7.037478,52.458596,130.265886,70.574052,-0.176803,-0.768256,pulsar
13419,1335,126.296875,45.716921,0.124646,0.180690,2.639632,15.339079,8.022532,85.695481,others
13420,7102,123.609375,51.892352,0.288274,-0.199181,29.039298,67.633561,1.915681,1.718845,others
13421,12929,117.937500,51.842573,0.252481,-0.221498,1.390468,15.009937,12.138534,158.969317,others


Table 2

To work with data, we need to know its basics information, we use the info() function to check for some traits of the data set. (See List 1)


In [10]:
pulsar_train.info() # basic information about training data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13423 entries, 7245 to 13067
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mean_IP      13423 non-null  float64
 1   SD_IP        13423 non-null  float64
 2   EK_IP        13423 non-null  float64
 3   S_IP         13423 non-null  float64
 4   mean_DM-SNR  13423 non-null  float64
 5   SD_DM-SNR    13423 non-null  float64
 6   EK_DM-SNR    13423 non-null  float64
 7   S_DM-SNR     13423 non-null  float64
 8   type         13423 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.0+ MB


List 1

We see that all types are float64, except for the renamed "objects" column. And Non-null values are the same for all columns.

To further check if there are any null values so we could drop them, the sum of all the null values in each column are calculated. (See List 2)

In [11]:
count_nan = pulsar_train.isnull().sum() # total number of null values in each column 
count_nan 

mean_IP        0
SD_IP          0
EK_IP          0
S_IP           0
mean_DM-SNR    0
SD_DM-SNR      0
EK_DM-SNR      0
S_DM-SNR       0
type           0
dtype: int64

List 2

There are no null values in columns,so no need to drop them.

Calculation of column-wise means for pulsars and other sources to identify any differences between them. (See Table 3)

In [12]:
mean_value=pulsar_train.groupby('type').mean() # mean values of each column for pulsars and other stars
mean_value

Unnamed: 0_level_0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
others,116.444206,47.296882,0.213082,0.38717,9.105551,23.473032,8.81131,112.532479
pulsar,56.34586,38.69105,3.154399,15.694917,50.171363,56.689469,2.712591,17.56643


Table 3

The results suggest a significant difference in mean values of all variables for other source and pulsars, 
indicating distinctive characteristics between the two classes.

Compare pulsar and other observations to avoid oversampling due to unequal sample sizes. (See List 3)

In [13]:
count_obs = pulsar_train.groupby('type')['type'].count()  # total number of pulsar observations and other star observations
count_obs 

type
others    12194
pulsar     1229
Name: type, dtype: int64

List 3

Most observations in the dataset are of origins other than pulsars, which means pulsars are rare.Resampling of pulsar observations during model training is necessary.

Graph displaying the correlation between mean IP and Skewness of IP for pulsars and other stars. The graph shows clear separation between pulsars and other sources, with some overlap in the middle where KNN predictions can be challenging. (See Graph 1)

In [14]:
alt.data_transformers.disable_max_rows()
pulsar_mean_plot=alt.Chart(pulsar_train,title='mean IP verses Skewness of IP').mark_point(opacity=0.2).encode(
    x=alt.X('mean_IP'),
    y=alt.Y('S_IP'),
    color='type')
pulsar_mean_plot 

Graph 1

DATA ANALYSIS

In this section, the testing data is separated into two parts, one model based on the values of the Integrated Profile, and another model based on the DM-SNR curve. First all the training data is upsampled to account for the rareness of pulsars and prevent undersampling. The bellow table showed the upsampled training data (See Table 4)

In [15]:
from sklearn.utils import resample
np.random.seed(1)
type_pulsar=pulsar_train[pulsar_train['type']=='pulsar']
type_others=pulsar_train[pulsar_train['type']=='others']
type_others
type_pulsar_upsampled = resample(
    type_pulsar, n_samples=type_others.shape[0],random_state=1
)

upsampled_pulsar = pd.concat((type_pulsar_upsampled ,type_others))
upsampled_pulsar 

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
4713,66.367188,39.477223,2.048420,7.171229,5.870401,30.944482,6.077354,37.481793,pulsar
585,34.664062,34.037038,4.539875,23.640581,39.857860,68.722881,1.722660,1.888185,pulsar
1775,98.359375,51.190626,1.057578,0.759414,134.582776,94.924652,-0.481317,-1.497141,pulsar
11379,22.578125,35.793158,4.650770,21.803837,138.787625,79.152788,-0.445270,-0.909086,pulsar
1493,28.546875,29.131399,5.421874,34.243176,56.295987,85.124408,1.229159,-0.059996,pulsar
...,...,...,...,...,...,...,...,...,...
1603,108.695312,51.755368,0.231442,-0.403158,1.453177,11.451453,13.717288,245.768654,others
1335,126.296875,45.716921,0.124646,0.180690,2.639632,15.339079,8.022532,85.695481,others
7102,123.609375,51.892352,0.288274,-0.199181,29.039298,67.633561,1.915681,1.718845,others
12929,117.937500,51.842573,0.252481,-0.221498,1.390468,15.009937,12.138534,158.969317,others


Now pulsars and other stars have equal amounts of samples.

In [16]:
count_obs = upsampled_pulsar.groupby('type')['type'].count()  # total number of pulsar observations and other star observations
count_obs 

type
others    12194
pulsar    12194
Name: type, dtype: int64

Next 

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

pulsar_training_IP = upsampled_pulsar[['mean_IP','SD_IP','EK_IP','S_IP','type']]
pulsar_training_IP
IP_preprocessor = make_column_transformer(
    (StandardScaler(), ['mean_IP','SD_IP','EK_IP','S_IP']),
    verbose_feature_names_out=False
)
IP_preprocessor.fit(pulsar_training_IP)
scaled_training_IP = IP_preprocessor.transform(pulsar_training_IP)
scaled_training_IP

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP
4713,-0.512217,-0.414111,0.177067,-0.075963
585,-1.328543,-1.062913,1.421809,1.225391
1775,0.311553,0.982841,-0.317962,-0.582604
11379,-1.639744,-0.853477,1.477212,1.080258
1493,-1.486054,-1.647964,1.862459,2.063174
...,...,...,...,...
1603,0.577693,1.050192,-0.730703,-0.674467
1335,1.030917,0.330041,-0.784059,-0.628333
7102,0.961717,1.066529,-0.702310,-0.658349
12929,0.815671,1.060592,-0.720192,-0.660112


Table 4

The same is done for DM-SNR values, the resulting table is shown as Table 5

In [18]:
pulsar_training_DMSNR= upsampled_pulsar[['mean_DM-SNR','SD_DM-SNR','EK_DM-SNR','S_DM-SNR','type']]
DMSNR_preprocessor = make_column_transformer(
    (StandardScaler(), ['mean_DM-SNR','SD_DM-SNR','EK_DM-SNR','S_DM-SNR']),
    verbose_feature_names_out=False
)
DMSNR_preprocessor.fit(pulsar_training_DMSNR)
scaled_training_DMSNR = DMSNR_preprocessor.transform(pulsar_training_DMSNR)
scaled_training_DMSNR

Unnamed: 0,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR
4713,-0.568004,-0.369239,0.064611,-0.286950
585,0.255145,1.166854,-0.843330,-0.656439
1775,2.549307,2.232234,-1.302853,-0.691581
11379,2.651145,1.590941,-1.295338,-0.685476
1493,0.653263,1.833750,-0.946224,-0.676662
...,...,...,...,...
1603,-0.674986,-1.161837,1.657515,1.875226
1335,-0.646251,-1.003764,0.470175,0.213545
7102,-0.006872,1.122562,-0.803086,-0.658197
12929,-0.676504,-1.017147,1.328350,0.974183


Table 5

After the IP and DM-SNR values are standarized, the most appropritate k value for both models are located using cross-validation.

First, we need to get the grid of parameter values.

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

IP_knn = KNeighborsClassifier()
IP_tune_pipe = make_pipeline(IP_preprocessor, IP_knn)
IP_tune_pipe.get_params()


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
                     verbose_feature_names_out=False)),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                  ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
                   verbose_feature_names_out=False),
 'kneighborsclassifier': KNeighborsClassifier(),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('standardscaler',
   StandardScaler(),
   ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
 'columntransformer__verbose': False,
 'columntransformer__verbose_feature_names_out': False,
 'columntransformer

the appropriate parameter value would be 5

Then we tune the grib and aquire a table of cross validation results. Afterwhich it is plotted onto a line plot. (See Graph2)

In [20]:
IP_parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(1,51,5),
}

In [None]:
from sklearn.model_selection import GridSearchCV

IP_tune_grid = GridSearchCV(
    estimator=IP_tune_pipe,
    param_grid=IP_parameter_grid,
    cv=10
)
scaled_training_IP
IP_accuracies_grid = pd.DataFrame(
    IP_tune_grid.fit(
        scaled_training_IP,
        upsampled_pulsar["type"]
    ).cv_results_
)
cross_val_plot=alt.Chart(IP_accuracies_grid).mark_line(point=True).encode(
    y=alt.Y("mean_test_score").scale(zero=False),
    x=alt.X("param_kneighborsclassifier__n_neighbors"),
)
cross_val_plot

Graph 2

From this graph, we see that the highest test score is when k=1, however, this would leave to overfitting and provide a less useful data. K=16 could be a useful value as it yields a high test score, the test scores for k values before and after it does it vary much, and it does not require a significant amount of computational power.

The same process is repeated for DMSNR to find the optimal k value.

In [None]:
DMSNR_knn = KNeighborsClassifier()
DMSNR_tune_pipe = make_pipeline(DMSNR_preprocessor, DMSNR_knn)
DMSNR_tune_pipe.get_params()


In [None]:
DMSNR_parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(1,51,5),
}

In [None]:
DMSNR_tune_grid = GridSearchCV(
    estimator=DMSNR_tune_pipe,
    param_grid=DMSNR_parameter_grid,
    cv=100
)
scaled_training_DMSNR
DMSNR_accuracies_grid = pd.DataFrame(
    DMSNR_tune_grid.fit(
        scaled_training_DMSNR,
        upsampled_pulsar["type"]
    ).cv_results_
)
cross_val_plot=alt.Chart(DMSNR_accuracies_grid).mark_line(point=True).encode(
    y=alt.Y("mean_test_score").scale(zero=False),
    x=alt.X("param_kneighborsclassifier__n_neighbors"),
)
cross_val_plot

Accoding to the graph, the highest score for knn is again knn=1, however as mentioned before, this would lead to overfitting. Therefore other values of k is considered. k=6, k=11, and k=16 are considered, but the difference between the scores for nearby values is quite large. The final k value chosen is k=21, it have a moderate score of around 88%, and the difference between the nearby values is around 1%, furthermore it would not require a significant amount of computational power.

Afterwards, the k values are used to fit the models for both IP and DM-SNR. As shown in the code below

In [None]:
IP_knn = KNeighborsClassifier(n_neighbors=16) 
X = scaled_training_IP[['mean_IP','SD_IP','EK_IP','S_IP']]
y = upsampled_pulsar ["type"]
IP_fit = make_pipeline(IP_preprocessor, IP_knn).fit(X, y)
IP_fit

In [None]:
DMSNR_knn = KNeighborsClassifier(n_neighbors=21) 
X = scaled_training_DMSNR[['mean_DM-SNR','SD_DM-SNR','EK_DM-SNR','S_DM-SNR']]
y = upsampled_pulsar ["type"]
DMSNR_fit = make_pipeline(DMSNR_preprocessor, DMSNR_knn).fit(X, y)
DMSNR_fit

In [None]:
pulsar_IP = pulsar[["mean_IP", "SD_IP", "EK_IP", "S_IP", "type"]]
pulsar_IP
## Made a new Dataframe with only IP data and its type.

In [None]:
pulsar_IP = pulsar_IP.assign(predicted_type = IP_fit.predict(pulsar[["mean_IP", "SD_IP", "EK_IP", "S_IP"]]))
pulsar_IP
## Made a new Column with the predicted pulsar type from our model

In [None]:
pulsar_IP[pulsar_IP["type"] != pulsar_IP["predicted_type"]]
##Found the columns in which the predictions are not True

In [None]:
correct_preds = pulsar_IP[
    pulsar_IP['type'] == pulsar_IP['predicted_type']
]

correct_preds.shape[0] / pulsar_IP.shape[0]

## Finding the Accuracy of the model

In [None]:
confusion_matrix_IP = pd.crosstab(
    pulsar_IP["type"],
    pulsar_IP["predicted_type"]
)
confusion_matrix_IP
## Making Confusion Matrix

In [None]:
pulsar_SNR = pulsar[["mean_DM-SNR", "SD_DM-SNR", "EK_DM-SNR", "S_DM-SNR", "type"]]
pulsar_SNR
## Made a new Dataframe with only SNR data and its type.

In [None]:
pulsar_SNR = pulsar_SNR.assign(predicted_type = DMSNR_fit.predict(pulsar[["mean_DM-SNR", "SD_DM-SNR", "EK_DM-SNR", "S_DM-SNR", "type"]]))
pulsar_SNR
## Made a new Column with the predicted pulsar type from our model

In [None]:
pulsar_SNR[pulsar_SNR["type"] != pulsar_SNR["predicted_type"]]
##Found the columns in which the predictions are not True

In [None]:
correct_preds_SNR = pulsar_SNR[
    pulsar_SNR['type'] == pulsar_SNR['predicted_type']
]

correct_preds_SNR.shape[0] / pulsar_SNR.shape[0]

## Finding the Accuracy of the model

In [None]:
confusion_matrix_SNR = pd.crosstab(
    pulsar_SNR["type"],
    pulsar_SNR["predicted_type"]
)
confusion_matrix_SNR
## Making Confusion Matrix

As seen above, the confusion matrix of SNR data is more balanced and gives out mixed results, whereas most of the predictions of IP data are wrong. Therefore, we should use SNR model to predict the type.

In [None]:
final_model = DMSNR_fit