In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [2]:
pulsar_data = pd.read_csv('pulsar_stars.csv', delimiter=',')
#get column names from dataset
cols = pd.read_csv('pulsar_stars.csv', nrows=1).columns

#turn target_class into a 'dummy' variable
dummy_ranks = pd.get_dummies(pulsar_data['target_class'],prefix='target_class')
print(dummy_ranks.head())

#create intercept column
pulsar_data['intercept'] = 1.0

   target_class_0  target_class_1
0               1               0
1               1               0
2               1               0
3               1               0
4               1               0


In [3]:
#remove target_class from columns, so we can use the remaining ones as predictors
cols_to_keep = cols.values[:-1]
#join dummy variable column with other predictor columns
data = pulsar_data[cols_to_keep].join(dummy_ranks.loc[:, 'target_class_1'])
#all columns except target_class go into train_cols
train_cols = data.columns[:-1]
#create a logistical regression model
logit = sm.Logit(pulsar_data['target_class'], pulsar_data[train_cols])
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.075602
         Iterations 9


In [4]:
print(result.summary())
print(result.conf_int())

                           Logit Regression Results                           
Dep. Variable:           target_class   No. Observations:                17898
Model:                          Logit   Df Residuals:                    17890
Method:                           MLE   Df Model:                            7
Date:                Thu, 26 Sep 2019   Pseudo R-squ.:                  0.7531
Time:                        20:42:55   Log-Likelihood:                -1353.1
converged:                       True   LL-Null:                       -5479.8
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------------
 Mean of the integrated profile                  -0.0013      0.005     -0.259      0.796      -0.011       0.008
 Standard deviation of the

In [5]:
#odds ratios
print(np.exp(result.params))

 Mean of the integrated profile                    0.998748
 Standard deviation of the integrated profile      0.937637
 Excess kurtosis of the integrated profile       189.944927
 Skewness of the integrated profile                0.578381
 Mean of the DM-SNR curve                          0.969764
 Standard deviation of the DM-SNR curve            1.014835
 Excess kurtosis of the DM-SNR curve               0.615632
 Skewness of the DM-SNR curve                      1.011522
dtype: float64


In [6]:
params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
#get odds ratios and conf ints
print(np.exp(conf))

                                                     2.5%       97.5%  \
 Mean of the integrated profile                  0.989306    1.008280   
 Standard deviation of the integrated profile    0.919611    0.956017   
 Excess kurtosis of the integrated profile     118.051740  305.620869   
 Skewness of the integrated profile              0.543160    0.615887   
 Mean of the DM-SNR curve                        0.963781    0.975784   
 Standard deviation of the DM-SNR curve          1.002593    1.027226   
 Excess kurtosis of the DM-SNR curve             0.544874    0.695579   
 Skewness of the DM-SNR curve                    1.007093    1.015970   

                                                       OR  
 Mean of the integrated profile                  0.998748  
 Standard deviation of the integrated profile    0.937637  
 Excess kurtosis of the integrated profile     189.944927  
 Skewness of the integrated profile              0.578381  
 Mean of the DM-SNR curve                 

In [41]:
#LDA
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

y = pulsar_data['target_class']
X = pulsar_data[cols_to_keep]

#fit LDA
sklearn_lda = LinearDiscriminantAnalysis()
print(sklearn_lda.fit(X,y))
#sample prediction, one value for each predictor variable
print(sklearn_lda.predict([[1,1,1,1,1,1,1,1]]))

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)
[0]


In [39]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#fit QDA
sklearn_qda = QuadraticDiscriminantAnalysis()
print(sklearn_qda.fit(X,y))
#sample prediction
print(sklearn_qda.predict([[1,1,1,1,1,1,1,1]]))

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)
[0]


In [38]:
#KNN
from sklearn import neighbors

n_neighbors = 200
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
print(clf.fit(X,y))
#predicts 1 vs LDA and QDA predicting 0
print(clf.predict([[1,1,1,1,1,1,1,1]]))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=200, p=2,
                     weights='distance')
[1]
