In [1]:
# Fayruz Kibria
# February 18, 2021
# Cardiotocography data

In [2]:
import pandas as pd
import numpy as np

import seaborn as sn
import matplotlib.pyplot as plt

import plotnine
from plotnine import *
from plotnine.data import *
%matplotlib inline

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from imblearn.over_sampling import SMOTENC
from collections import Counter

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_curve, auc
from sklearn.metrics import classification_report

In [3]:
# Load the dataset from disk. `mydata0` is kept as the untouched original;
# `mydata` is an independent working copy.
mydata0 = pd.read_csv("df8.csv")
mydata = mydata0.copy(deep=True)

# Preview the top five rows
mydata.head(n=5)

Unnamed: 0,LB,ASTV,MSTV,ALTV,MLTV,Width,Min,Max,Nmax,Nzeros,...,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
0,0.259259,0.813333,0.044118,0.472527,0.047337,0.344633,0.110092,0.034483,0.111111,0.0,...,0,0,0,0,0,0,1,0,9,2
1,0.481481,0.066667,0.279412,0.0,0.205128,0.717514,0.165138,0.655172,0.333333,0.1,...,0,0,0,1,0,0,0,0,6,1
2,0.5,0.053333,0.279412,0.0,0.2643,0.717514,0.165138,0.655172,0.277778,0.1,...,0,0,0,1,0,0,0,0,6,1
3,0.518519,0.053333,0.323529,0.0,0.453649,0.644068,0.027523,0.413793,0.611111,0.0,...,0,0,0,1,0,0,0,0,6,1
4,0.481481,0.053333,0.323529,0.0,0.392505,0.644068,0.027523,0.413793,0.5,0.0,...,0,0,0,0,0,0,0,0,2,1


In [4]:
# df8 is already normalized in R

LABEL_NAME = 'NSP'

label_col = mydata[LABEL_NAME]  # defining the label column

# Select the features by name (everything except the label) instead of a
# positional slice, so the split stays correct if columns are added/reordered.
# The feature set contains both continuous and categorical columns.
data_col = mydata.drop(columns=[LABEL_NAME])

# NOTE(review): the features keep CLASS and the pattern indicator columns
# (A ... SUSP); confirm they are intended predictors of NSP and not label-derived.

# 70/30 split, stratified on the label so both parts keep the class proportions;
# fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data_col,
    label_col,
    test_size=0.3,
    random_state=42,
    stratify=label_col,
)

In [5]:
# Summarize the training features: entry count, column dtypes and non-null counts
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1488 entries, 480 to 229
Data columns (total 32 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LB        1488 non-null   float64
 1   ASTV      1488 non-null   float64
 2   MSTV      1488 non-null   float64
 3   ALTV      1488 non-null   float64
 4   MLTV      1488 non-null   float64
 5   Width     1488 non-null   float64
 6   Min       1488 non-null   float64
 7   Max       1488 non-null   float64
 8   Nmax      1488 non-null   float64
 9   Nzeros    1488 non-null   float64
 10  Mode      1488 non-null   float64
 11  Mean      1488 non-null   float64
 12  Median    1488 non-null   float64
 13  Variance  1488 non-null   float64
 14  nAC       1488 non-null   float64
 15  nFM       1488 non-null   float64
 16  nUC       1488 non-null   float64
 17  nDL       1488 non-null   float64
 18  nDS       1488 non-null   float64
 19  nDP       1488 non-null   float64
 20  Tendency  1488 non-null   int

In [6]:
# Oversample the training split with SMOTE-NC. The columns at positions 20..31
# (Tendency onward) are declared as categorical features.
categorical_positions = list(range(20, 32))
smote_nc = SMOTENC(categorical_features=categorical_positions, random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)

# Class counts after resampling, followed by the last five resampled rows
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))
print(X_resampled.tail(5))

[(1, 1158), (2, 1158), (3, 1158)]
            LB      ASTV      MSTV      ALTV      MLTV     Width       Min  \
3469  0.296296  0.947233  0.044118  0.065467  0.306015  0.367232  0.110092   
3470  0.575719  0.736535  0.233626  0.015580  0.004195  0.255027  0.379858   
3471  0.537037  0.725817  0.031070  0.760720  0.095845  0.425807  0.160101   
3472  0.777778  0.776449  0.014706  0.818039  0.089859  0.095900  0.756581   
3473  0.259259  0.685818  0.148930  0.000000  0.163654  0.463277  0.174312   

           Max      Nmax  Nzeros  ...  B  C  D  E  AD  DE  LD  FS  SUSP  CLASS  
3469  0.068966  0.000000     0.0  ...  0  0  0  0   0   0   0   1     0      9  
3470  0.151245  0.134320     0.0  ...  0  0  0  0   0   0   1   0     0      8  
3471  0.205334  0.117375     0.0  ...  0  0  0  0   0   0   0   1     0      9  
3472  0.262427  0.073598     0.0  ...  0  0  0  0   0   0   0   1     0      9  
3473  0.275862  0.158080     0.1  ...  0  0  0  0   0   0   1   0     0      8  

[5 rows x 

In [7]:
X_resampled.groupby('SUSP').count() # checking categorical oversampling: rows per SUSP value after SMOTE-NC

Unnamed: 0_level_0,LB,ASTV,MSTV,ALTV,MLTV,Width,Min,Max,Nmax,Nzeros,...,A,B,C,D,E,AD,DE,LD,FS,CLASS
SUSP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2625,2625,2625,2625,2625,2625,2625,2625,2625,2625,...,2625,2625,2625,2625,2625,2625,2625,2625,2625,2625
1,849,849,849,849,849,849,849,849,849,849,...,849,849,849,849,849,849,849,849,849,849


In [8]:
# Rows per SUSP value in the training split before oversampling, for comparison
X_train.groupby('SUSP').count()

Unnamed: 0_level_0,LB,ASTV,MSTV,ALTV,MLTV,Width,Min,Max,Nmax,Nzeros,...,A,B,C,D,E,AD,DE,LD,FS,CLASS
SUSP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1344,1344,1344,1344,1344,1344,1344,1344,1344,1344,...,1344,1344,1344,1344,1344,1344,1344,1344,1344,1344
1,144,144,144,144,144,144,144,144,144,144,...,144,144,144,144,144,144,144,144,144,144


In [9]:
# removing some extra over sampled data randomly
#
# SMOTE-NC brought classes 2 and 3 up to the size of class 1; here each of them
# is trimmed back to TARGET_MINORITY_SIZE rows by dropping randomly chosen rows.
# The generator is seeded so the exported training set is the same on every run.

TARGET_MINORITY_SIZE = 537
TRIM_SEED = 0


def _drop_random_rows(features, labels, cls, keep, rng):
    """Randomly drop rows of class `cls` until only `keep` of them remain.

    Parameters
    ----------
    features, labels : DataFrame and Series sharing the same index.
    cls : label value whose rows are candidates for removal.
    keep : number of rows of class `cls` to retain.
    rng : numpy random Generator used to pick the rows to drop.

    Returns
    -------
    (features, labels, dropped) where `dropped` holds the removed index labels.
    If the class already has `keep` rows or fewer, nothing is dropped.
    """
    candidates = labels.index[labels.eq(cls)].to_numpy()
    n_drop = len(candidates) - keep
    if n_drop <= 0:
        return features, labels, candidates[:0]
    dropped = rng.choice(candidates, n_drop, replace=False)
    return features.drop(dropped), labels.drop(dropped), dropped


trim_rng = np.random.default_rng(TRIM_SEED)

# NOTE(review): candidates are all rows of the class, so original (non-synthetic)
# rows can be dropped too; confirm whether only SMOTE-NC rows should be removed.
X_train2, y_train2, dd2 = _drop_random_rows(
    X_resampled, y_resampled, 2, TARGET_MINORITY_SIZE, trim_rng)

X_train3, y_train3, dd3 = _drop_random_rows(
    X_train2, y_train2, 3, TARGET_MINORITY_SIZE, trim_rng)

# Number of rows removed from class 2 (kept under the original name)
N = len(dd2)

In [10]:
y_train3.eq(2).sum() # checking whether the desired number of class 2 rows remains after trimming

537

In [11]:
y_train3.eq(3).sum()  # checking whether the desired number of class 3 rows remains after trimming

537

In [12]:
print(sorted(Counter(y_train3).items())) # checking the class 1, 2, 3 distribution after removing some of the SMOTE-NC data

[(1, 1158), (2, 537), (3, 537)]


In [13]:
X_train.groupby('SUSP').count() # rows per SUSP value in the original training split (baseline for comparison with X_train3)

Unnamed: 0_level_0,LB,ASTV,MSTV,ALTV,MLTV,Width,Min,Max,Nmax,Nzeros,...,A,B,C,D,E,AD,DE,LD,FS,CLASS
SUSP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1344,1344,1344,1344,1344,1344,1344,1344,1344,1344,...,1344,1344,1344,1344,1344,1344,1344,1344,1344,1344
1,144,144,144,144,144,144,144,144,144,144,...,144,144,144,144,144,144,144,144,144,144


In [14]:
X_train3.groupby('SUSP').count() # checking oversampling of a categorical variable: rows per SUSP value in the trimmed training set

Unnamed: 0_level_0,LB,ASTV,MSTV,ALTV,MLTV,Width,Min,Max,Nmax,Nzeros,...,A,B,C,D,E,AD,DE,LD,FS,CLASS
SUSP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1843,1843,1843,1843,1843,1843,1843,1843,1843,1843,...,1843,1843,1843,1843,1843,1843,1843,1843,1843,1843
1,389,389,389,389,389,389,389,389,389,389,...,389,389,389,389,389,389,389,389,389,389


In [15]:
def _with_label(features, label):
    """Return `features` with `label` appended column-wise (aligned on the index)."""
    return pd.concat([features, label], axis=1)


# Combine the feature and label variables into single frames for CSV export:
# trimmed training set, untouched test set, and the fully resampled training set
X_y_train_combined = _with_label(X_train3, y_train3)
X_y_test_combined = _with_label(X_test, y_test)
X_y_train_fullres_combined = _with_label(X_resampled, y_resampled)

In [16]:
X_y_test_combined.shape # checking samples in the test data: (rows, columns), columns = features + label

(638, 33)

In [19]:
# Write each combined frame to CSV (without the index column) to export back to R
for csv_name, frame in (
    ('X_y_train_combined.csv', X_y_train_combined),
    ('X_y_test_combined.csv', X_y_test_combined),
    ('X_y_train_fullres_combined.csv', X_y_train_fullres_combined),
):
    frame.to_csv(csv_name, index=False)