In [65]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Define the relevant feature set

In [66]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("CTG3.csv")
df = pd.read_csv(file_path)

# Review the DataFrame
df.head()

Unnamed: 0,b,e,AC,FM,UC,DL,DS,DP,DR,LB,...,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
0,240,357,0,0,0,0,0,0,0,120,...,-1,-1,-1,-1,-1,-1,1,-1,9,2
1,5,632,4,0,4,2,0,0,0,132,...,-1,-1,-1,1,-1,-1,-1,-1,6,1
2,177,779,2,0,5,2,0,0,0,133,...,-1,-1,-1,1,-1,-1,-1,-1,6,1
3,411,1192,2,0,6,2,0,0,0,134,...,-1,-1,-1,1,-1,-1,-1,-1,6,1
4,533,1147,4,0,5,0,0,0,0,132,...,-1,-1,-1,-1,-1,-1,-1,-1,2,1


In [67]:
# check the data stats
df.columns

Index(['b', 'e', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'DR', 'LB', 'AC.1',
       'FM.1', 'UC.1', 'DL.1', 'DS.1', 'DP.1', 'ASTV', 'MSTV', 'ALTV', 'MLTV',
       'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median',
       'Variance', 'Tendency', 'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS',
       'SUSP', 'CLASS', 'NSP'],
      dtype='object')

In [68]:
# compare number of values in each column

df.count()

b           2126
e           2126
AC          2126
FM          2126
UC          2126
DL          2126
DS          2126
DP          2126
DR          2126
LB          2126
AC.1        2126
FM.1        2126
UC.1        2126
DL.1        2126
DS.1        2126
DP.1        2126
ASTV        2126
MSTV        2126
ALTV        2126
MLTV        2126
Width       2126
Min         2126
Max         2126
Nmax        2126
Nzeros      2126
Mode        2126
Mean        2126
Median      2126
Variance    2126
Tendency    2126
A           2126
B           2126
C           2126
D           2126
E           2126
AD          2126
DE          2126
LD          2126
FS          2126
SUSP        2126
CLASS       2126
NSP         2126
dtype: int64

In [69]:
# data types
df.dtypes

b             int64
e             int64
AC            int64
FM            int64
UC            int64
DL            int64
DS            int64
DP            int64
DR            int64
LB            int64
AC.1        float64
FM.1        float64
UC.1        float64
DL.1        float64
DS.1        float64
DP.1        float64
ASTV          int64
MSTV        float64
ALTV          int64
MLTV        float64
Width         int64
Min           int64
Max           int64
Nmax          int64
Nzeros        int64
Mode          int64
Mean          int64
Median        int64
Variance      int64
Tendency      int64
A             int64
B             int64
C             int64
D             int64
E             int64
AD            int64
DE            int64
LD            int64
FS            int64
SUSP          int64
CLASS         int64
NSP           int64
dtype: object

In [70]:
# Define feature set and target set to be used for models
# Note most of the columns are not relevant to the required analysis and require removal in order to avoid
# confusing the model
# X = df[["LB", "AC", "FM", "UC", "DL", "DS", "DP", "ASTV", "MSTV", "ALTV", "MLTV", "Width", "Min", "Max", "Nmax", "Nzeros", "Mode", "Mean", "Median", "Variance", "Tendency"]]
data_df = df[["LB", "AC", "FM", "UC", "DL", "DS", "DP", "ASTV", "MSTV", "ALTV", "MLTV", "NSP"]]

data_df.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,MLTV,NSP
0,120,0,0,0,0,0,0,73,0.5,43,2.4,2
1,132,4,0,4,2,0,0,17,2.1,0,10.4,1
2,133,2,0,5,2,0,0,16,2.1,0,13.4,1
3,134,2,0,6,2,0,0,16,2.4,0,23.0,1
4,132,4,0,5,0,0,0,16,2.4,0,19.9,1


In [71]:
# check table dimensions:
data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   LB      2126 non-null   int64  
 1   AC      2126 non-null   int64  
 2   FM      2126 non-null   int64  
 3   UC      2126 non-null   int64  
 4   DL      2126 non-null   int64  
 5   DS      2126 non-null   int64  
 6   DP      2126 non-null   int64  
 7   ASTV    2126 non-null   int64  
 8   MSTV    2126 non-null   float64
 9   ALTV    2126 non-null   int64  
 10  MLTV    2126 non-null   float64
 11  NSP     2126 non-null   int64  
dtypes: float64(2), int64(10)
memory usage: 199.4 KB


In [72]:
# drop NaN data

data_df = data_df.dropna(how='any')
data_df.count()

LB      2126
AC      2126
FM      2126
UC      2126
DL      2126
DS      2126
DP      2126
ASTV    2126
MSTV    2126
ALTV    2126
MLTV    2126
NSP     2126
dtype: int64

In [73]:
count_isna = data_df.isna()
count_isna

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,MLTV,NSP
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2121,False,False,False,False,False,False,False,False,False,False,False,False
2122,False,False,False,False,False,False,False,False,False,False,False,False
2123,False,False,False,False,False,False,False,False,False,False,False,False
2124,False,False,False,False,False,False,False,False,False,False,False,False


---

In [78]:
# select 30 lines of the original data set as 'real world data' to use on the trained and tested model
# this dataset was selected 'nonrandomly' because it contains a selection of the various outcomes 
# to be tested on the trained / tested model (i.e. The NSP column values)
df_real_world = data_df.iloc[1470:1500]
df_real_world[20:30]



Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,MLTV,NSP
1490,132,2,0,10,1,0,1,34,1.2,0,12.4,2
1491,132,0,0,5,1,0,1,30,1.7,0,0.0,3
1492,132,0,0,8,1,0,0,32,1.3,0,14.2,3
1493,132,0,0,3,0,0,0,32,1.1,0,15.5,1
1494,132,6,0,9,0,0,0,33,1.2,0,14.0,1
1495,132,0,0,3,0,0,0,33,1.0,0,14.5,1
1496,132,7,0,11,1,0,0,32,1.2,0,13.5,1
1497,132,7,0,9,0,0,0,32,1.3,0,12.3,1
1498,132,1,0,5,0,0,1,30,1.2,0,14.7,1
1499,132,0,0,4,1,0,0,30,1.2,0,14.6,1


In [60]:
df_real_world.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 1470 to 1499
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   LB      30 non-null     int64  
 1   AC      30 non-null     int64  
 2   FM      30 non-null     int64  
 3   UC      30 non-null     int64  
 4   DL      30 non-null     int64  
 5   DS      30 non-null     int64  
 6   DP      30 non-null     int64  
 7   ASTV    30 non-null     int64  
 8   MSTV    30 non-null     float64
 9   ALTV    30 non-null     int64  
 10  MLTV    30 non-null     float64
 11  NSP     30 non-null     int64  
dtypes: float64(2), int64(10)
memory usage: 2.9 KB


In [61]:
# save the real world dataset as a separate csv file. Note in the 'real world' this data would actually come from a completely
# different dataset

df_real_world.to_csv('data_real_world.csv', index=False)

In [62]:
# remove the real world data (n=30) from the test-train data set

df_reduced = data_df.drop([1470,1500])
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2124 entries, 0 to 2125
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   LB      2124 non-null   int64  
 1   AC      2124 non-null   int64  
 2   FM      2124 non-null   int64  
 3   UC      2124 non-null   int64  
 4   DL      2124 non-null   int64  
 5   DS      2124 non-null   int64  
 6   DP      2124 non-null   int64  
 7   ASTV    2124 non-null   int64  
 8   MSTV    2124 non-null   float64
 9   ALTV    2124 non-null   int64  
 10  MLTV    2124 non-null   float64
 11  NSP     2124 non-null   int64  
dtypes: float64(2), int64(10)
memory usage: 215.7 KB


In [63]:
# save the 'reduced dataset' for future analysis. 
# this dataset comprises the following:
# all the features and target column required for analysis
# a block of 30 removed which will be used as the real world data to be applied to the trained and tested model


df_reduced.to_csv('data_preprocessed.csv', index=False)
