In [40]:
pip install -U altair

Note: you may need to restart the kernel to use updated packages.


# Group project: pulsar

INTRODUCTION

DATA CLEANING & WRANGLING

To analyze a data set accurately, it's crucial to first observe and wrangle the data to prevent formatting issues or null values. This help choose the best analysis method for the data. First the required packages are imported from library to help perform actions.

In [41]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn import set_config
from sklearn.model_selection import train_test_split # importing necessary libraries
np.random.seed(9)

In [42]:
set_config(transform_output="pandas") # set output as dataframes instead of arrays

The data set is downloaded from the web, the files are read using the pandas function read_csv. The first 5 values of the dataset is shown below:

In [43]:
htru2='https://raw.githubusercontent.com/dorni12/DSCI100_GroupProject/main/HTRU_2.csv'
pulsar= pd.read_csv(htru2,names=[1,2,3,4,5,6,7,8,9],index_col=False) # reading dataset from data file

In [44]:
pulsar.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


The data is organized but lacks clear variable names and meaningful 'type' column values. Thus we used the rename function to change the column names and type values to meaningful names. The column name should correspond the variables, 0s and 1s in the type column should correspond to 'others' or 'pulsar'. (See Table 1)

In [45]:
# renaming column names to meaningful names
pulsar=pulsar.rename(columns={
    1:'mean_IP', # Mean of the integrated profile.
    2:'SD_IP', # Standard deviation of the integrated profile.
    3:'EK_IP', # Excess kurtosis of the integrated profile.
    4:'S_IP', # Skewness of the integrated profile.
    5:'mean_DM-SNR', # Mean of the DM-SNR curve.
    6:'SD_DM-SNR', # Standard deviation of the DM-SNR curve.
    7:'EK_DM-SNR',# Excess kurtosis of the DM-SNR curve.
    8:'S_DM-SNR', # Skewness of the DM-SNR curve.
    9:'type'}) # type of star (others or pulsar)
pulsar['type']=pulsar['type'].replace({
    0:'others',
    1:'pulsar'}) # replacing values of type to more meaningful values

In [46]:
pulsar.head(5)

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,others
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,others
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,others
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,others
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,others


Table 1

The data frame is then split into training and testing sets, this allows for accuracy testing in the future. (See Table 2)

In [47]:
pulsar_train, pulsar_test = train_test_split(
    pulsar, train_size=0.75, stratify=pulsar["type"]
) # splitting testing and training data

In [48]:
pulsar_train.reset_index()

Unnamed: 0,index,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
0,16444,106.742188,50.662524,0.246558,-0.384159,2.449833,18.901213,8.944606,87.049926,others
1,4487,143.789062,51.673956,-0.225833,-0.408937,1.090301,12.543644,16.757088,311.439718,others
2,6869,108.375000,39.052049,0.115301,2.044731,11.939799,48.694733,3.845666,12.847599,others
3,16616,92.953125,46.001844,0.222959,0.361164,1.300167,10.896719,14.323632,273.448704,others
4,8172,73.187500,36.092708,1.145832,4.196603,4.051839,23.249713,6.773630,50.496178,others
...,...,...,...,...,...,...,...,...,...,...
13418,15553,103.125000,53.530949,0.376731,-0.185251,2.382943,14.401685,9.338643,114.031917,others
13419,7175,123.914062,49.306286,0.058630,0.255576,14.983278,56.100928,3.497040,10.309213,others
13420,5021,58.164062,37.856592,2.990382,11.785648,41.121237,62.009584,1.735378,2.354386,pulsar
13421,3674,33.187500,37.080729,4.530471,21.665170,40.076087,64.349537,1.671464,1.994202,pulsar


Table 2

To work with data, we need to know its basics information, we use the info() function to check for some traits of the data set. (See List 1)


In [49]:
pulsar_train.info() # basic information about training data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13423 entries, 16444 to 5995
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mean_IP      13423 non-null  float64
 1   SD_IP        13423 non-null  float64
 2   EK_IP        13423 non-null  float64
 3   S_IP         13423 non-null  float64
 4   mean_DM-SNR  13423 non-null  float64
 5   SD_DM-SNR    13423 non-null  float64
 6   EK_DM-SNR    13423 non-null  float64
 7   S_DM-SNR     13423 non-null  float64
 8   type         13423 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.0+ MB


List 1

We see that all types are float64, except for the renamed "objects" column. And Non-null values are the same for all columns.

To further check if there are any null values so we could drop them, the sum of all the null values in each column are calculated. (See List 2)

In [50]:
count_nan = pulsar_train.isnull().sum() # total number of null values in each column 
count_nan 

mean_IP        0
SD_IP          0
EK_IP          0
S_IP           0
mean_DM-SNR    0
SD_DM-SNR      0
EK_DM-SNR      0
S_DM-SNR       0
type           0
dtype: int64

List 2

There are no null values in columns,so no need to drop them.

Calculation of column-wise means for pulsars and other sources to identify any differences between them. (See Table 3)

In [51]:
mean_value=pulsar_train.groupby('type').mean() # mean values of each column for pulsars and other stars
mean_value

Unnamed: 0_level_0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
others,116.522794,47.347335,0.211515,0.37993,8.811016,23.224048,8.884452,114.392824
pulsar,56.442077,38.546959,3.146628,15.671984,49.224379,56.232506,2.818744,18.795908


Table 3

The results suggest a significant difference in mean values of all variables for other source and pulsars, 
indicating distinctive characteristics between the two classes.

Compare pulsar and other observations to avoid oversampling due to unequal sample sizes. (See List 3)

In [52]:
count_obs = pulsar_train.groupby('type')['type'].count()  # total number of pulsar observations and other star observations
count_obs 

type
others    12194
pulsar     1229
Name: type, dtype: int64

List 3

Most observations in the dataset are of origins other than pulsars, which means pulsars are rare.Resampling of pulsar observations during model training is necessary.

Graph displaying the correlation between mean IP and Skewness of IP for pulsars and other stars. The graph shows clear separation between pulsars and other sources, with some overlap in the middle where KNN predictions can be challenging. (See Graph 1)

In [53]:
alt.data_transformers.disable_max_rows()
pulsar_mean_plot=alt.Chart(pulsar_train,title='mean IP verses Skewness of IP').mark_point(opacity=0.2).encode(
    x=alt.X('mean_IP'),
    y=alt.Y('S_IP'),
    color='type')
pulsar_mean_plot 

Graph 1

DATA ANALYSIS

In this section, the testing data is separated into two parts, one model based on the values of the Integrated Profile, and another model based on the DM-SNR curve. First all the training data is upsampled to account for the rareness of pulsars and prevent undersampling. The bellow table showed the upsampled training data (See Table 4)

In [54]:
from sklearn.utils import resample
np.random.seed(1)
type_pulsar=pulsar_train[pulsar_train['type']=='pulsar']
type_others=pulsar_train[pulsar_train['type']=='others']
type_others
type_pulsar_upsampled = resample(
    type_pulsar, n_samples=type_others.shape[0],random_state=1
)

upsampled_pulsar = pd.concat((type_pulsar_upsampled ,type_others))
upsampled_pulsar 

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
7648,10.578125,35.128027,5.575468,31.867382,94.964883,52.072327,0.977852,0.824462,pulsar
12258,82.382812,46.598035,1.526851,2.545764,153.243311,78.435295,-0.710193,-0.527549,pulsar
5706,15.257812,29.853875,6.274609,41.339106,82.668060,56.172403,1.102932,1.344280,pulsar
1376,35.617188,33.152376,5.168640,29.046024,58.406355,69.247620,1.093189,0.323029,pulsar
4012,48.726562,29.382902,4.408512,26.629986,111.437291,97.072820,-0.076718,-1.715076,pulsar
...,...,...,...,...,...,...,...,...,...
13094,102.835938,44.586604,0.653969,1.447579,2.494147,19.712587,9.235310,92.930280,others
11809,117.242188,65.441021,-0.014939,-1.252590,53.184783,83.614421,1.062169,-0.665545,others
15553,103.125000,53.530949,0.376731,-0.185251,2.382943,14.401685,9.338643,114.031917,others
7175,123.914062,49.306286,0.058630,0.255576,14.983278,56.100928,3.497040,10.309213,others


Now pulsars and other stars have equal amounts of samples.

In [55]:
count_obs = upsampled_pulsar.groupby('type')['type'].count()  # total number of pulsar observations and other star observations
count_obs 

type
others    12194
pulsar    12194
Name: type, dtype: int64

Next 

In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

pulsar_training_IP = upsampled_pulsar[['mean_IP','SD_IP','EK_IP','S_IP','type']]
pulsar_training_IP
IP_preprocessor = make_column_transformer(
    (StandardScaler(), ['mean_IP','SD_IP','EK_IP','S_IP']),
    verbose_feature_names_out=False
)
IP_preprocessor.fit(pulsar_training_IP)
scaled_training_IP = IP_preprocessor.transform(pulsar_training_IP)
scaled_training_IP

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP
7648,-1.957970,-0.926013,1.956658,1.904012
12258,-0.103386,0.441090,-0.081545,-0.443850
5706,-1.837102,-1.554636,2.308629,2.662439
1376,-1.311257,-1.161490,1.751848,1.678098
4012,-0.972665,-1.610771,1.369176,1.484640
...,...,...,...,...
13094,0.424881,0.201349,-0.520982,-0.531785
11809,0.796968,2.686974,-0.857731,-0.747995
15553,0.432347,1.267420,-0.660552,-0.662530
7175,0.969291,0.763885,-0.820695,-0.627232


Table 4

The same is done for DM-SNR values, the resulting table is shown as Table 5

In [57]:
pulsar_training_DMSNR= upsampled_pulsar[['mean_DM-SNR','SD_DM-SNR','EK_DM-SNR','S_DM-SNR','type']]
DMSNR_preprocessor = make_column_transformer(
    (StandardScaler(), ['mean_DM-SNR','SD_DM-SNR','EK_DM-SNR','S_DM-SNR']),
    verbose_feature_names_out=False
)
DMSNR_preprocessor.fit(pulsar_training_DMSNR)
scaled_training_DMSNR = DMSNR_preprocessor.transform(pulsar_training_DMSNR)
scaled_training_DMSNR

Unnamed: 0,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR
7648,1.606828,0.503091,-1.013072,-0.670502
12258,3.023051,1.572052,-1.363560,-0.684306
5706,1.308003,0.669340,-0.987101,-0.665195
1376,0.718420,1.199512,-0.989124,-0.675622
4012,2.007124,2.327763,-1.232032,-0.696430
...,...,...,...,...
13094,-0.640301,-0.809025,0.701424,0.269849
11809,0.591531,1.782054,-0.995565,-0.685714
15553,-0.643003,-1.024371,0.722879,0.485286
7175,-0.336803,0.666442,-0.490013,-0.573668


Table 5

After the IP and DM-SNR values are standarized, the most appropritate k value for both models are located using cross-validation.

First, we need to get the grid of parameter values.

In [58]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

IP_knn = KNeighborsClassifier()
IP_tune_pipe = make_pipeline(IP_preprocessor, IP_knn)
IP_tune_pipe.get_params()


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
                     verbose_feature_names_out=False)),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                  ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
                   verbose_feature_names_out=False),
 'kneighborsclassifier': KNeighborsClassifier(),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('standardscaler',
   StandardScaler(),
   ['mean_IP', 'SD_IP', 'EK_IP', 'S_IP'])],
 'columntransformer__verbose': False,
 'columntransformer__verbose_feature_names_out': False,
 'columntransformer

the appropriate parameter value would be 5

Then we tune the grib and aquire a table of cross validation results. Afterwhich it is plotted onto a line plot. (See Graph2)

In [59]:
IP_parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(1,51,5),
}

In [60]:
from sklearn.model_selection import GridSearchCV

IP_tune_grid = GridSearchCV(
    estimator=IP_tune_pipe,
    param_grid=IP_parameter_grid,
    cv=10
)
scaled_training_IP
IP_accuracies_grid = pd.DataFrame(
    IP_tune_grid.fit(
        scaled_training_IP,
        upsampled_pulsar["type"]
    ).cv_results_
)
cross_val_plot=alt.Chart(IP_accuracies_grid).mark_line(point=True).encode(
    y=alt.Y("mean_test_score").scale(zero=False),
    x=alt.X("param_kneighborsclassifier__n_neighbors"),
)
cross_val_plot

Graph 2

From this graph, we see that the highest test score is when k=1, however, this would leave to overfitting and provide a less useful data. K=16 could be a useful value as it yields a high test score, the test scores for k values before and after it does it vary much, and it does not require a significant amount of computational power.

The same process is repeated for DMSNR to find the optimal k value.

In [61]:
DMSNR_knn = KNeighborsClassifier()
DMSNR_tune_pipe = make_pipeline(DMSNR_preprocessor, DMSNR_knn)
DMSNR_tune_pipe.get_params()


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                    ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR',
                                     'S_DM-SNR'])],
                     verbose_feature_names_out=False)),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                  ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR',
                                   'S_DM-SNR'])],
                   verbose_feature_names_out=False),
 'kneighborsclassifier': KNeighborsClassifier(),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'drop',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('standardscaler',
   StandardScaler(),
   ['mean_DM-SNR', 'SD_DM-SNR', 'EK_DM-SNR', 'S_DM-SN

In [62]:
DMSNR_parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(1,51,5),
}

In [63]:
DMSNR_tune_grid = GridSearchCV(
    estimator=DMSNR_tune_pipe,
    param_grid=DMSNR_parameter_grid,
    cv=100
)
scaled_training_DMSNR
DMSNR_accuracies_grid = pd.DataFrame(
    DMSNR_tune_grid.fit(
        scaled_training_DMSNR,
        upsampled_pulsar["type"]
    ).cv_results_
)
cross_val_plot=alt.Chart(DMSNR_accuracies_grid).mark_line(point=True).encode(
    y=alt.Y("mean_test_score").scale(zero=False),
    x=alt.X("param_kneighborsclassifier__n_neighbors"),
)
cross_val_plot

Accoding to the graph, the highest score for knn is again knn=1, however as mentioned before, this would lead to overfitting. Therefore other values of k is considered. k=6, k=11, and k=16 are considered, but the difference between the scores for nearby values is quite large. The final k value chosen is k=21, it have a moderate score of around 88%, and the difference between the nearby values is around 1%, furthermore it would not require a significant amount of computational power.

Afterwards, the k values are used to fit the models for both IP and DM-SNR. As shown in the code below

In [64]:
IP_knn = KNeighborsClassifier(n_neighbors=16) 
X = scaled_training_IP[['mean_IP','SD_IP','EK_IP','S_IP']]
y = upsampled_pulsar ["type"]
IP_fit = make_pipeline(IP_preprocessor, IP_knn).fit(X, y)
IP_fit

In [65]:
DMSNR_knn = KNeighborsClassifier(n_neighbors=21) 
X = scaled_training_DMSNR[['mean_DM-SNR','SD_DM-SNR','EK_DM-SNR','S_DM-SNR']]
y = upsampled_pulsar ["type"]
DMSNR_fit = make_pipeline(DMSNR_preprocessor, DMSNR_knn).fit(X, y)
DMSNR_fit

In [66]:
pulsar_IP = pulsar[["mean_IP", "SD_IP", "EK_IP", "S_IP", "type"]]
pulsar_IP
## Made a new Dataframe with only IP data and its type.

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,type
0,140.562500,55.683782,-0.234571,-0.699648,others
1,102.507812,58.882430,0.465318,-0.515088,others
2,103.015625,39.341649,0.323328,1.051164,others
3,136.750000,57.178449,-0.068415,-0.636238,others
4,88.726562,40.672225,0.600866,1.123492,others
...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,others
17894,122.554688,49.485605,0.127978,0.323061,others
17895,119.335938,59.935939,0.159363,-0.743025,others
17896,114.507812,53.902400,0.201161,-0.024789,others


In [67]:
pulsar_IP = pulsar_IP.assign(predicted_type = IP_fit.predict(pulsar[["mean_IP", "SD_IP", "EK_IP", "S_IP"]]))
pulsar_IP
## Made a new Column with the predicted pulsar type from our model

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,type,predicted_type
0,140.562500,55.683782,-0.234571,-0.699648,others,others
1,102.507812,58.882430,0.465318,-0.515088,others,others
2,103.015625,39.341649,0.323328,1.051164,others,others
3,136.750000,57.178449,-0.068415,-0.636238,others,others
4,88.726562,40.672225,0.600866,1.123492,others,others
...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,others,others
17894,122.554688,49.485605,0.127978,0.323061,others,others
17895,119.335938,59.935939,0.159363,-0.743025,others,others
17896,114.507812,53.902400,0.201161,-0.024789,others,others


In [68]:
pulsar_IP[pulsar_IP["type"] != pulsar_IP["predicted_type"]]
##Found the columns in which the predictions are not True

Unnamed: 0,mean_IP,SD_IP,EK_IP,S_IP,type,predicted_type
19,99.367188,41.572202,1.547197,4.154106,pulsar,others
42,120.554688,45.549905,0.282924,0.419909,pulsar,others
61,27.765625,28.666042,5.770087,37.419009,pulsar,others
92,23.625000,29.948654,5.688038,35.987172,pulsar,others
93,94.585938,35.779823,1.187309,3.687469,pulsar,others
...,...,...,...,...,...,...
17515,89.867188,47.482295,1.591325,2.505057,pulsar,others
17529,27.039062,33.754722,4.779124,26.255357,pulsar,others
17558,77.070312,39.000638,1.884421,6.372178,pulsar,others
17642,28.375000,27.649311,6.377273,45.944048,pulsar,others


In [69]:
correct_preds = pulsar_IP[
    pulsar_IP['type'] == pulsar_IP['predicted_type']
]

correct_preds.shape[0] / pulsar_IP.shape[0]

## Finding the Accuracy of the model

0.9121689574254107

In [70]:
confusion_matrix_IP = pd.crosstab(
    pulsar_IP["type"],
    pulsar_IP["predicted_type"]
)
confusion_matrix_IP
## Making Confusion Matrix

predicted_type,others,pulsar
type,Unnamed: 1_level_1,Unnamed: 2_level_1
others,16259,0
pulsar,1572,67


In [71]:
pulsar_SNR = pulsar[["mean_DM-SNR", "SD_DM-SNR", "EK_DM-SNR", "S_DM-SNR", "type"]]
pulsar_SNR
## Made a new Dataframe with only SNR data and its type.

Unnamed: 0,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type
0,3.199833,19.110426,7.975532,74.242225,others
1,1.677258,14.860146,10.576487,127.393580,others
2,3.121237,21.744669,7.735822,63.171909,others
3,3.642977,20.959280,6.896499,53.593661,others
4,1.178930,11.468720,14.269573,252.567306,others
...,...,...,...,...,...
17893,1.296823,12.166062,15.450260,285.931022,others
17894,16.409699,44.626893,2.945244,8.297092,others
17895,21.430602,58.872000,2.499517,4.595173,others
17896,1.946488,13.381731,10.007967,134.238910,others


In [72]:
pulsar_SNR = pulsar_SNR.assign(predicted_type = DMSNR_fit.predict(pulsar[["mean_DM-SNR", "SD_DM-SNR", "EK_DM-SNR", "S_DM-SNR", "type"]]))
pulsar_SNR
## Made a new Column with the predicted pulsar type from our model

Unnamed: 0,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type,predicted_type
0,3.199833,19.110426,7.975532,74.242225,others,others
1,1.677258,14.860146,10.576487,127.393580,others,others
2,3.121237,21.744669,7.735822,63.171909,others,others
3,3.642977,20.959280,6.896499,53.593661,others,others
4,1.178930,11.468720,14.269573,252.567306,others,others
...,...,...,...,...,...,...
17893,1.296823,12.166062,15.450260,285.931022,others,others
17894,16.409699,44.626893,2.945244,8.297092,others,others
17895,21.430602,58.872000,2.499517,4.595173,others,others
17896,1.946488,13.381731,10.007967,134.238910,others,others


In [73]:
pulsar_SNR[pulsar_SNR["type"] != pulsar_SNR["predicted_type"]]
##Found the columns in which the predictions are not True

Unnamed: 0,mean_DM-SNR,SD_DM-SNR,EK_DM-SNR,S_DM-SNR,type,predicted_type
19,27.555184,61.719016,2.208808,3.662680,pulsar,others
42,1.358696,13.079034,13.312141,212.597029,pulsar,others
61,73.112876,62.070220,1.268206,1.082920,pulsar,others
92,146.568562,82.394624,-0.274902,-1.121848,pulsar,others
93,6.071070,29.760400,5.318767,28.698048,pulsar,others
...,...,...,...,...,...,...
17642,141.860368,82.893017,-0.477222,-1.067880,pulsar,others
17693,36.806856,60.103759,1.427498,0.758975,others,pulsar
17744,42.750000,73.387551,1.348588,0.196401,others,pulsar
17820,39.866221,66.733375,1.346171,0.318382,others,pulsar


In [74]:
correct_preds_SNR = pulsar_SNR[
    pulsar_SNR['type'] == pulsar_SNR['predicted_type']
]

correct_preds_SNR.shape[0] / pulsar_SNR.shape[0]

## Finding the Accuracy of the model

0.908425522404738

In [75]:
confusion_matrix_SNR = pd.crosstab(
    pulsar_SNR["type"],
    pulsar_SNR["predicted_type"]
)
confusion_matrix_SNR
## Making Confusion Matrix

predicted_type,others,pulsar
type,Unnamed: 1_level_1,Unnamed: 2_level_1
others,16096,163
pulsar,1476,163


As seen above, the confusion matrix of IP data has more Accuracy score, so the IP model will be used to predict the star type.

In [76]:
final_model = IP_fit

In [79]:
## Mean IP v/s SD IP graph


mean_v_sd_ip = alt.Chart(pulsar_IP).mark_point(opacity = 0.2).encode(
    x=alt.X("mean_IP").title("Mean of the integrated profile."),
    y=alt.Y("SD_IP").title("Standard deviation of the integrated profile."),
    color = "predicted_type"
)
mean_v_sd_ip

In [81]:
## EK-IP v/s S-IP graph


ek_v_s_ip = alt.Chart(pulsar_IP).mark_point(opacity = 0.2).encode(
    x=alt.X("EK_IP").title("Excess kurtosis of the integrated profile."),
    y=alt.Y("S_IP").title("Skewness of the integrated profile."),
    color = "predicted_type"
)
ek_v_s_ip