## **Exercise 13.04**
###  Implementing MSMOTE on the Pulsar Star Dataset to Find the Optimal Result

## Importing modules

In [None]:
# Loading the necessary library files
import pandas as pd

### Loading data

In [None]:
# URL of the pulsar training CSV hosted in the GitHub repository

filename = 'https://raw.githubusercontent.com/fenago/DSBook/main/Chapter%2013/pulsar_data_train.csv'
# The file itself is read with pandas in the next cell


In [None]:
# Read the pulsar training CSV from the location stored in `filename`.
StarData = pd.read_csv(filename)
# DataFrame.dropna is a method that returns a new frame: it has to be called
# and its result kept, otherwise rows with missing values stay in the data
# and flow into the feature matrix built further down.
StarData = StarData.dropna()
# Preview the first rows of the cleaned frame.
StarData.head()

### Renaming columns

In [None]:
 #columns Rename
# Map the long descriptive headers to short identifiers, modifying StarData in place.
# Every key starts with a single space, exactly as written in the mapping below.
StarData.rename(
    columns={
        ' Mean of the integrated profile': 'integrated_profile_Mean',
        ' Standard deviation of the integrated profile': "integrated_profile_Std",
        ' Excess kurtosis of the integrated profile': "integrated_profile_Excess",
        ' Skewness of the integrated profile': "integrated_profile_Skewness",
        ' Mean of the DM-SNR curve': "DM-SNR_Mean",
        ' Standard deviation of the DM-SNR curve': "DM-SNR_Std",
        ' Excess kurtosis of the DM-SNR curve': "DM-SNR_Excess",
        ' Skewness of the DM-SNR curve': "DM-SNR_Skewness",
    },
    inplace=True,
)


In [None]:
# Show the column labels so the result of the rename can be checked
StarData.columns

### Robust Scaler

In [None]:
from sklearn.preprocessing import RobustScaler

rob_scaler = RobustScaler()

# Scale the three integrated-profile columns one at a time. The same scaler
# object is re-fitted for each column, so after the loop it holds the fit of
# the last column only.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook.
for source_column, scaled_column in (
    ('integrated_profile_Mean', 'meanScaled'),
    ('integrated_profile_Std', 'st_devScaled'),
    ('integrated_profile_Excess', 'kurtosisScaled'),
):
    # reshape(-1, 1) turns the 1-D column into a single-column 2-D array
    column_as_2d = StarData[source_column].values.reshape(-1, 1)
    StarData[scaled_column] = rob_scaler.fit_transform(column_as_2d)

In [None]:
# Remove the three unscaled integrated-profile columns from StarData in place
StarData.drop(
    columns=[
        'integrated_profile_Mean',
        'integrated_profile_Std',
        'integrated_profile_Excess',
    ],
    inplace=True,
)

In [None]:
# Display the first rows of the frame in its current state
StarData.head()


### Dummy variables

In [None]:
# Run the four DM-SNR columns through pd.get_dummies and keep the result as StarCat.
# NOTE(review): these look like continuous measurements rather than categories;
# if their dtype is numeric, get_dummies presumably returns them unchanged — confirm.
StarCat = pd.get_dummies(
    StarData[['DM-SNR_Mean', 'DM-SNR_Std', 'DM-SNR_Excess', 'DM-SNR_Skewness']]
)


In [None]:
# Separating the scaled numerical columns into their own frame
StarNum = StarData.loc[:, ['meanScaled', 'st_devScaled', 'kurtosisScaled']]
# Show (rows, columns) of the selection
StarNum.shape

###  Merging with the original data frame

In [None]:

# Preparing the X variables: DM-SNR features side by side with the scaled profile features
X = pd.concat([StarCat, StarNum], axis=1)
# Preparing the Y variable
Y = StarData['target_class']

# Keep only rows without any missing feature or label value. One shared mask
# filters X and Y so the two stay aligned row for row. A bare `StarData.dropna`
# (no call, no assignment) removes nothing, so the filtering is done explicitly.
complete_rows = X.notna().all(axis=1) & Y.notna()
X = X.loc[complete_rows]
Y = Y.loc[complete_rows]

# Shapes of the final feature matrix and target vector
print(X.shape)
print(Y.shape)
X.head()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)


In [None]:
# Report how many training rows belong to each class before oversampling
print("Before OverSampling count of 0: {}".format((y_train == 0).sum()))
print("Before OverSampling count of 1: {} \n".format((y_train == 1).sum()))

In [None]:
!pip install smote-variants

In [None]:
import smote_variants as sv
import numpy as np

# Instantiating the MSMOTE oversampler. A fixed random_state makes the
# synthetic samples reproducible from run to run (same seed value as the
# train/test split).
oversampler = sv.MSMOTE(random_state=123)

# Creating the oversampled training set; features and labels are passed as NumPy arrays
X_train_os, y_train_os = oversampler.sample(np.array(X_train), np.array(y_train))


### Defining and fitting the LogisticRegression model

In [None]:

# Create the classifier with default settings and train it on the oversampled data
# (fit returns the estimator itself, so the assignment keeps the fitted model)
StarModel2 = LogisticRegression().fit(X_train_os, y_train_os)
# Leave the fitted estimator as the cell output
StarModel2

### Predicting on the test set

In [None]:

# The training input came from NumPy arrays, so the test features are handed
# over as a NumPy array too; passing the DataFrame would make scikit-learn warn
# that feature names are present at predict time but were absent at fit time.
X_test_values = X_test.to_numpy()
pred = StarModel2.predict(X_test_values)
# Printing accuracy
print('Accuracy of Logistic regression model prediction on test set for MSmote balanced data set: {:.2f}'.format(StarModel2.score(X_test_values, y_test)))


### Confusion Matrix for the model

In [None]:

from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the model's predictions
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

###  Classification report for the model

In [None]:

from sklearn.metrics import classification_report

# Text summary of the per-class metrics for the test-set predictions
print(classification_report(y_true=y_test, y_pred=pred))