## **Exercise 13.02**
### Implementing Random Undersampling and Classification on Our Pulsar Star Dataset to Find the Optimal Result

## Importing modules

In [32]:
# Loading the necessary library files
import pandas as pd

### Loading data

In [33]:
# Defining data path of Github repository

filename = 'https://raw.githubusercontent.com/fenago/DSBook/main/Chapter%2013/pulsar_data_train.csv'
# Loading the data using pandas


In [None]:
StarData = pd.read_csv(filename)
StarData.dropna
StarData.head()

### Renamimg columns

In [44]:
 #columns Rename
StarData.rename({' Mean of the integrated profile': 'integrated_profile_Mean',\
                ' Standard deviation of the integrated profile':"integrated_profile_Std",\
                 ' Excess kurtosis of the integrated profile':"integrated_profile_Excess",\
                 ' Skewness of the integrated profile':"integrated_profile_Skewness",\
                 ' Mean of the DM-SNR curve':"DM-SNR_Mean",\
                 ' Standard deviation of the DM-SNR curve':"DM-SNR_Std",\
                 ' Excess kurtosis of the DM-SNR curve':"DM-SNR_Excess",\
                 ' Skewness of the DM-SNR curve':"DM-SNR_Skewness"}, inplace=True,axis=1)


In [None]:
StarData.columns

### Robust Scaler

In [46]:
from sklearn.preprocessing import RobustScaler
rob_scaler = RobustScaler()
# Converting each of the columns to scaled version

StarData['meanScaled'] = rob_scaler.fit_transform(StarData['integrated_profile_Mean'].values.reshape(-1,1))
StarData['st_devScaled'] = rob_scaler.fit_transform(StarData['integrated_profile_Std'].values.reshape(-1,1))
StarData['kurtosisScaled'] = rob_scaler.fit_transform(StarData['integrated_profile_Excess'].values.reshape(-1,1))

In [47]:
# Dropping the original columns
StarData.drop(['integrated_profile_Mean','integrated_profile_Std','integrated_profile_Excess'], axis=1, inplace=True)

In [None]:
# Print the head of the data
StarData.head()


### Dummy variables

In [49]:
# Converting all the categorical variables to dummy variables
StarCat = pd.get_dummies(StarData[['DM-SNR_Mean', 'DM-SNR_Std', 'DM-SNR_Excess','DM-SNR_Skewness']])


In [None]:
# Seperating the numerical data
StarNum = StarData[['meanScaled','st_devScaled','kurtosisScaled']]
StarNum.shape

###  Merging with the original data frame

In [None]:

# Preparing the X variables
X = pd.concat([StarCat, StarNum], axis=1)
print(X.shape)
# Preparing the Y variable
Y = StarData['target_class']
print(Y.shape)
StarData.dropna
X.head()

### Logistic Regression

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)


### Join the train_x and train_y for ease of operation

In [None]:

trainData = pd.concat([X_train,y_train],axis=1)
trainData.head()

###  Finding the indexes of the sample data set where the target class is 1: min class

In [None]:
ind = trainData[trainData['target_class']==1.0].index
print(len(ind))


### Seperate the minority classes

In [None]:

minData = trainData.loc[ind]
print(minData.shape)

### Finding indexes of majority class

In [None]:

ind1 = trainData[trainData['target_class']==0.0].index
print(len(ind1))

In [None]:

# Seperating the majority class
majData = trainData.loc[ind1]
print(majData.shape)
majData.head()

In [85]:
# Take a random sample equal to length of the minority class to make the data set balanced
majSample = majData.sample(n=len(ind1),random_state = 123)

In [86]:
print(majSample.shape)
majSample.head()

(7961, 8)


Unnamed: 0,DM-SNR_Mean,DM-SNR_Std,DM-SNR_Excess,DM-SNR_Skewness,meanScaled,st_devScaled,kurtosisScaled,target_class
755,2.378763,16.859875,8.999389,94.483806,-0.669495,-0.820904,1.387764,0.0
11815,8.749164,38.006345,4.566126,20.392958,-0.157957,0.40945,0.978623,0.0
3209,1.945652,13.265537,11.243473,162.051614,-0.165997,0.913181,0.324561,0.0
2903,2.314381,14.161686,9.423445,117.82871,0.679917,1.105812,-0.385879,0.0
3221,1.782609,12.643738,10.726153,159.580229,-0.221676,-0.914879,0.496758,0.0


In [87]:
# Concatinating both data sets and then shuffling the data set
balData = pd.concat([minData,majSample],axis = 0)
print('balanced data set shape',balData.shape)

balanced data set shape (8769, 8)


In [88]:
# Shuffling the data set
from sklearn.utils import shuffle

balData = shuffle(balData)
balData.head()

Unnamed: 0,DM-SNR_Mean,DM-SNR_Std,DM-SNR_Excess,DM-SNR_Skewness,meanScaled,st_devScaled,kurtosisScaled,target_class
3221,1.782609,12.643738,10.726153,159.580229,-0.221676,-0.914879,0.496758,0.0
9493,3.367893,21.710395,7.447844,60.294453,-0.954444,0.001128,0.501517,0.0
6326,1.662207,12.436512,12.881222,211.918098,0.532827,-0.132666,-0.461475,0.0
757,8.252508,33.767374,4.402245,19.265065,0.609052,0.700955,-0.446255,0.0
11805,10.544314,40.891187,4.427151,19.415112,1.74438,1.114356,-2.025569,0.0


In [None]:
y_trainNew = balData['target_class']
print(y_trainNew.head())

In [None]:
# Making the new X_train and y_train

X_trainNew = balData.iloc[:,0:100]
print(X_trainNew.head())
X_trainNew.dropna

y_trainNew = balData['target_class']
y_trainNew.dropna
print(y_trainNew.head())

In [None]:
from sklearn.linear_model import LogisticRegression
# Defining the LogisticRegression function
StarModel1 = LogisticRegression()
StarModel1.fit(X_trainNew, y_trainNew)

In [None]:
# Predicting on the test
pred = StarModel1.predict(X_test)
print('Accuracy of Logistic regression model prediction on test set for balanced data set: {:.2f}'.format(StarModel1.score(X_test, y_test)))

In [None]:
# Confusion Matrix for the model
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

[[9969 2029]
 [ 278 1288]]
              precision    recall  f1-score   support

          no       0.97      0.83      0.90     11998
         yes       0.39      0.82      0.53      1566

    accuracy                           0.83     13564
   macro avg       0.68      0.83      0.71     13564
weighted avg       0.91      0.83      0.85     13564



In [None]:
pred = bankModel.predict(X_test)
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(bankModel.score(X_test, y_test)))

Accuracy of Logistic regression model prediction on test set: 0.90


In [None]:
# Confusion Matrix for the model
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

[[11696   302]
 [ 1073   493]]


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          no       0.92      0.97      0.94     11998
         yes       0.62      0.31      0.42      1566

    accuracy                           0.90     13564
   macro avg       0.77      0.64      0.68     13564
weighted avg       0.88      0.90      0.88     13564



In [None]:
print('Percentage of negative class :',(y_train[y_train=='yes'].value_counts()/len(y_train) ) * 100)
print('Percentage of positive class :',(y_train[y_train=='no'].value_counts()/len(y_train) ) * 100)

Percentage of negative class : yes    11.764148
Name: y, dtype: float64
Percentage of positive class : no    88.235852
Name: y, dtype: float64
