In [1]:
import pandas as pd

In [2]:
# Column names for the data set, in the same order as the fields of each row.
# 'R' is the test result (the label); the remaining columns are the features.
headers = [
    'R',
    'temperature',
    'cough_severity',
    'difficult_breathing',
    'exposure_risk',
    'days_since_onset',
]

In [3]:
# One tuple per observation; field order matches `headers`:
# (R, temperature, cough_severity, difficult_breathing, exposure_risk, days_since_onset)
rows = [
    ('Pos', 103.2, 'Severe', 1, 1, 15),
    ('Neg',  97.5, 'Mild',   0, 1,  7),
    ('Neg',  98.5, 'None',   0, 1,  0),
    ('Pos', 101.3, 'Mild',   1, 1,  4),
    ('Neg',  98.5, 'None',   0, 0, 21),
    ('Pos',  99.0, 'Mild',   0, 1, 10),
    ('Neg',  98.1, 'Mild',   0, 1,  5),
    ('Neg', 102.1, 'Severe', 1, 0,  4),
]

In [4]:
# Build the working frame: one row per tuple, columns labelled from `headers`.
df = pd.DataFrame(data=rows, columns=headers)

In [5]:
# Display the freshly built frame to check the raw values and column order.
df

Unnamed: 0,R,temperature,cough_severity,difficult_breathing,exposure_risk,days_since_onset
0,Pos,103.2,Severe,1,1,15
1,Neg,97.5,Mild,0,1,7
2,Neg,98.5,,0,1,0
3,Pos,101.3,Mild,1,1,4
4,Neg,98.5,,0,0,21
5,Pos,99.0,Mild,0,1,10
6,Neg,98.1,Mild,0,1,5
7,Neg,102.1,Severe,1,0,4


In [6]:
# Reference: scikit-learn preprocessing guide, https://scikit-learn.org/stable/modules/preprocessing.html

In [7]:
# encoding categorical features

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Passing `categories` explicitly pins the integer given to each level:
#   None -> 0, Mild -> 1, Moderate -> 2, Severe -> 3
# With the default (categories='auto') the encoder instead uses the sorted
# unique values found in the column (alphabetical for strings: Mild, None,
# Severe), which would not reflect increasing severity.
enc = OrdinalEncoder(categories=[['None', 'Mild', 'Moderate', 'Severe']])

# Fit and transform once. The encoder returns an (n_rows, 1) float array, so
# flatten it to one value per row and cast to int before writing it back.
# NOTE: this overwrites the column in place, so the cell is not re-runnable
# on its own; rebuild `df` first, since the integer codes are not in
# `categories`.
df['cough_severity'] = enc.fit_transform(df[['cough_severity']]).ravel().astype('int')

In [9]:
# Display the frame to confirm cough_severity now holds integer codes.
df

Unnamed: 0,R,temperature,cough_severity,difficult_breathing,exposure_risk,days_since_onset
0,Pos,103.2,3,1,1,15
1,Neg,97.5,1,0,1,7
2,Neg,98.5,0,0,1,0
3,Pos,101.3,1,1,1,4
4,Neg,98.5,0,0,0,21
5,Pos,99.0,1,0,1,10
6,Neg,98.1,1,0,1,5
7,Neg,102.1,3,1,0,4


In [10]:
# Peek at the first column (the test result, R), which still holds strings.
df.iloc[:,0]

0    Pos
1    Neg
2    Neg
3    Pos
4    Neg
5    Pos
6    Neg
7    Neg
Name: R, dtype: object

In [11]:
# Ordinal-encode the test result as well: Neg -> 0, Pos -> 1.
# The explicit category list pins that mapping.
enc = OrdinalEncoder(categories=[['Neg', 'Pos']])

# A single fit_transform is enough; flatten the (n_rows, 1) result to one
# value per row and cast to int before writing it back over the column.
df['R'] = enc.fit_transform(df[['R']]).ravel().astype('int')

In [12]:
# Display the frame to confirm R is now 0/1.
df

Unnamed: 0,R,temperature,cough_severity,difficult_breathing,exposure_risk,days_since_onset
0,1,103.2,3,1,1,15
1,0,97.5,1,0,1,7
2,0,98.5,0,0,1,0
3,1,101.3,1,1,1,4
4,0,98.5,0,0,0,21
5,1,99.0,1,0,1,10
6,0,98.1,1,0,1,5
7,0,102.1,3,1,0,4


In [13]:
# binarization
# we will create a column, 'fever', based on the temperature
# fever will be recorded as binary - present or not. 
# convert temperature to binary indicator based on a threshold of 101

In [14]:
from sklearn.preprocessing import Binarizer

In [15]:
# Threshold for the fever flag, on the same scale as the temperature column.
# Wrapping the Series in a list hands the binarizer a single row holding all
# the temperatures; the later transform call uses the same layout.
# NOTE(review): a column-shaped input (df[['temperature']]) would be the
# conventional form, but the transform cell would have to change with it.
binarizer = Binarizer(threshold=101.0)
binarizer.fit([df['temperature']])

In [17]:
# Binarize the temperatures (values above the threshold become 1, the rest 0).
# The input is a single row, so take row 0 of the result to get one flag per
# observation, then store it as an integer column.
fever_flags = binarizer.transform([df['temperature']])[0]
df['fever'] = fever_flags.astype('int')

In [18]:
# Display the frame to confirm the new fever column.
df

Unnamed: 0,R,temperature,cough_severity,difficult_breathing,exposure_risk,days_since_onset,fever
0,1,103.2,3,1,1,15,1
1,0,97.5,1,0,1,7,0
2,0,98.5,0,0,1,0,0
3,1,101.3,1,1,1,4,1
4,0,98.5,0,0,0,21,0
5,1,99.0,1,0,1,10,0
6,0,98.1,1,0,1,5,0
7,0,102.1,3,1,0,4,1


In [None]:
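# scaling and standardization
# we will use the min-max scaler here: each column is rescaled to [0, 1] via (x - min) / (max - min)
# (dividing each observation by max(abs(x)) is what MaxAbsScaler does instead)
# the standard scaler centers each column to zero mean and scales it to unit variance
# note that min-max scaling preserves binary 0/1 data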

In [19]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [20]:
# Learn each column's minimum and maximum. The scaler is fit on the whole
# frame, so the result column R is scaled along with the features.
scaler = MinMaxScaler()
scaler.fit(df)

In [21]:
# Apply the fitted scaling to every column of df (R included).
X_scaled = scaler.transform(df)

In [22]:
# Show the scaled values; the result is a plain NumPy array, so the column
# names are gone and columns follow the order of df.
X_scaled

array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        0.71428571, 1.        ],
       [0.        , 0.        , 0.33333333, 0.        , 1.        ,
        0.33333333, 0.        ],
       [0.        , 0.1754386 , 0.        , 0.        , 1.        ,
        0.        , 0.        ],
       [1.        , 0.66666667, 0.33333333, 1.        , 1.        ,
        0.19047619, 1.        ],
       [0.        , 0.1754386 , 0.        , 0.        , 0.        ,
        1.        , 0.        ],
       [1.        , 0.26315789, 0.33333333, 0.        , 1.        ,
        0.47619048, 0.        ],
       [0.        , 0.10526316, 0.33333333, 0.        , 1.        ,
        0.23809524, 0.        ],
       [0.        , 0.80701754, 1.        , 1.        , 0.        ,
        0.19047619, 1.        ]])

In [23]:
# Feature columns only (everything after R). df itself has not been replaced
# by the scaled values yet, so these are still the unscaled numbers.
df.iloc[:,1:]

Unnamed: 0,temperature,cough_severity,difficult_breathing,exposure_risk,days_since_onset,fever
0,103.2,3,1,1,15,1
1,97.5,1,0,1,7,0
2,98.5,0,0,1,0,0
3,101.3,1,1,1,4,1
4,98.5,0,0,0,21,0
5,99.0,1,0,1,10,0
6,98.1,1,0,1,5,0
7,102.1,3,1,0,4,1


In [24]:
# Wrap the scaled array back into a DataFrame. The array carries no labels,
# so reuse both the column names and the row index of the frame it came from;
# without index= the rows would be renumbered 0..n-1, which would misalign
# them if df ever had a non-default index.
df = pd.DataFrame(X_scaled, columns=df.columns, index=df.index)

In [25]:
# Display the scaled frame; every column is now float, including R.
df

Unnamed: 0,R,temperature,cough_severity,difficult_breathing,exposure_risk,days_since_onset,fever
0,1.0,1.0,1.0,1.0,1.0,0.714286,1.0
1,0.0,0.0,0.333333,0.0,1.0,0.333333,0.0
2,0.0,0.175439,0.0,0.0,1.0,0.0,0.0
3,1.0,0.666667,0.333333,1.0,1.0,0.190476,1.0
4,0.0,0.175439,0.0,0.0,0.0,1.0,0.0
5,1.0,0.263158,0.333333,0.0,1.0,0.47619,0.0
6,0.0,0.105263,0.333333,0.0,1.0,0.238095,0.0
7,0.0,0.807018,1.0,1.0,0.0,0.190476,1.0


In [28]:
# Convert the result column R back to integers (scaling turned it into floats).
# Round first: astype('int') truncates toward zero, so a value that came out
# fractionally below 1.0 would otherwise silently become 0.
df['R'] = df['R'].round().astype('int')

In [29]:
# Final frame: R is integer again; the other columns stay as scaled floats.
df

Unnamed: 0,R,temperature,cough_severity,difficult_breathing,exposure_risk,days_since_onset,fever
0,1,1.0,1.0,1.0,1.0,0.714286,1.0
1,0,0.0,0.333333,0.0,1.0,0.333333,0.0
2,0,0.175439,0.0,0.0,1.0,0.0,0.0
3,1,0.666667,0.333333,1.0,1.0,0.190476,1.0
4,0,0.175439,0.0,0.0,0.0,1.0,0.0
5,1,0.263158,0.333333,0.0,1.0,0.47619,0.0
6,0,0.105263,0.333333,0.0,1.0,0.238095,0.0
7,0,0.807018,1.0,1.0,0.0,0.190476,1.0
