In [1]:
# Numerical libs
import numpy as np
import pandas as pd

# ML libs
from sklearn.model_selection import train_test_split

# Plotting libs
import matplotlib.pyplot as plt
import seaborn as sns
# Utils

# Load data

In [2]:
datafile = 'datasets/hypo/hypothyroid2.csv'
df = pd.read_csv(datafile)

In [3]:
df.shape

(3772, 30)

In [4]:
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative


In [5]:
df.describe()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG
count,3771.0,3403.0,3003.0,3541.0,3385.0,3387.0,0.0
mean,51.735879,5.086766,2.0135,108.319345,0.995,110.469649,
std,20.084958,24.52147,0.827434,35.604248,0.195457,33.089698,
min,1.0,0.005,0.05,2.0,0.25,2.0,
25%,36.0,0.5,1.6,88.0,0.88,93.0,
50%,54.0,1.4,2.0,103.0,0.98,107.0,
75%,67.0,2.7,2.4,124.0,1.08,124.0,
max,455.0,530.0,10.6,430.0,2.32,395.0,


# Data preprocessing

- https://en.wikipedia.org/wiki/Data_pre-processing
- https://towardsdatascience.com/data-preprocessing-concepts-fa946d11c825
- https://www.geeksforgeeks.org/data-preprocessing-in-data-mining/
- https://www.youtube.com/watch?v=zVImIQuqjQ0&list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba&index=5

**Data types**
- Categorical (binary, multiple classes). Ex: days in a week, sex
- Numerical (continuous, integer-valued). Ex: measurements, age

**Problems with data**
> Garbage in, garbage out

- Usually the data is messy and problematic. We might get missing values, impossible values, duplicate values where it shouldn't happen etc. 
- It's our job to clean the data and prepare it for our model.
- The better the data we feed to the model, the better the model will be

## Data cleaning

### Human / Expert cleaning

Usually the human can analyze the dataset and see if there is something wrong.

- For example `age` shouldn't be an absurd number (such as 400) nor negative (-2).
- `TTL` shouldn't be negative

If we have domain knowledge we can use it to clear (drop rows) / replace these values.

In [58]:
# We can see that our dataset contains ages that are too big. Let's clear these
df.describe()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG
count,3771.0,3403.0,3003.0,3541.0,3385.0,3387.0,0.0
mean,51.735879,5.086766,2.0135,108.319345,0.995,110.469649,
std,20.084958,24.52147,0.827434,35.604248,0.195457,33.089698,
min,1.0,0.005,0.05,2.0,0.25,2.0,
25%,36.0,0.5,1.6,88.0,0.88,93.0,
50%,54.0,1.4,2.0,103.0,0.98,107.0,
75%,67.0,2.7,2.4,124.0,1.08,124.0,
max,455.0,530.0,10.6,430.0,2.32,395.0,


In [59]:
#Get the wrong values
df[df['age'] > 100]

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
1364,455.0,F,f,f,f,f,f,f,f,f,...,t,118.0,t,1.13,t,104.0,f,,SVI,negative


In [60]:
#drop them
df = df.drop(df[df['age'] > 100].index)

In [61]:
df[df['age'] > 100]

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class


In [62]:
# TSH can be a maximum of 450. Drop the values above that

In [63]:
df[df['TSH'] > 450]['TSH']

1165    478.0
2507    472.0
2772    468.0
3449    530.0
Name: TSH, dtype: float64

In [64]:
print(df.shape)
df = df.drop(df[df['TSH'] > 450].index)
print(df.shape)

(3771, 30)
(3767, 30)


### Missing data

Datasets might have some missing data in them. 
- When it comes to almost empty columns (features) we drop those. 
- When it comes to rows we have 2 options: 
    1. We drop the missing rows
    2. We try to fill them

Let's talk about filling the rows. *How should we fill them?*
- Zero (not reccomended)
- Estimate from the distribution 
- Mean
- Median


#### Pandas methods to check for missing data

In [65]:
df.isna() # Or df.isnull()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,False,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,False
3768,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3769,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3770,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [66]:
np.all(df.isna() == df.isnull())

True

In [67]:
# Let's get the missing values / feature
df.isna().sum()

age                             1
sex                           149
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
TSH_measured                    0
TSH                           369
T3_measured                     0
T3                            768
TT4_measured                    0
TT4                           231
T4U_measured                    0
T4U                           387
FTI_measured                    0
FTI                           385
TBG_measured                    0
TBG                          3767
referral_source                 0
Class         

In [68]:
# We can see that TBG columns is almost empty so let's drop it
df = df.drop(['TBG'], axis = 1)

In [69]:
# Let's drop the rows that don't have the sex attribute since we can't fill them

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
print(df.shape)
df = df.dropna(subset = ['sex'])
print(df.shape)

(3767, 29)
(3618, 29)


In [70]:
# We shall fill the rest

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
df = df.fillna(df.mean())

In [71]:
df.isna().sum()

age                          0
sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
I131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH_measured                 0
TSH                          0
T3_measured                  0
T3                           0
TT4_measured                 0
TT4                          0
T4U_measured                 0
T4U                          0
FTI_measured                 0
FTI                          0
TBG_measured                 0
referral_source              0
Class                        0
dtype: int64

In [72]:
sum(df['sex'] == 'M'), sum(df['sex'] == 'F')

(1142, 2476)

In [73]:
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,referral_source,Class
0,41.0,F,f,f,f,f,f,f,f,f,...,2.5,t,125.0,t,1.14,t,109.0,f,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,2.0,t,102.0,f,0.99531,f,110.357701,f,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,2.011113,t,109.0,t,0.91,t,120.0,f,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,1.9,t,175.0,f,0.99531,f,110.357701,f,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,1.2,t,61.0,t,0.87,t,70.0,f,SVI,negative


## Encoding

Data usually comes in 2 types
1. Numerical data $\to$ Continuous variables
2. Categorical data $\to$ Discrete variables
    - 2 Classes $\to$ Binary data
    - Multiple classes

Sometimes we need to encode this data to numerical data for our algorithm to eat it

### Ordinal Encoding

Each category will get its own number
`['value0', 'value1', 'value2', 'value0', 'value2'] -> [0, 1, 2, 0, 2]`

**Pros and cons**
- $+$ Simple, short
- $-$ The variables are mapped from a space with no order $\to$ a space with order (The number line)
    - This may make some algorithms interpret it as such
    - It might not be a true representation of data
    
Examples: 
- Price: `[cheap, affordable, expensive] ->  [0, 1, 2]` is a good representation
- Animal type: `[mammal, insect, bird]` -> [0, 1, 2]` is not a good one

In [50]:
from sklearn.preprocessing import OrdinalEncoder

In [88]:
enc = OrdinalEncoder()
enc.fit(df[['sex']])
enc.categories_

[array(['F', 'M'], dtype=object)]

In [89]:
enc.transform(df[['sex']])

array([[0.],
       [0.],
       [1.],
       ...,
       [0.],
       [1.],
       [0.]])

### One-hot encoding

Each variable will be encoded with an array of 0s and a 1 that represents the category
```
['value0', 'value1', 'value2', 'value0', 'value2'] -> 
[1, 0, 0]
[0, 1, 0]
[0, 0, 1]
[1, 0, 0]
[0, 0, 1]
```

**Pros and cons**
- $+$ Simple
- $+$ Keeps the variables separated
- $-$  expensive -> big dimensions
- $\pm$ Sparse (depends on algorithm)


In [None]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(df[['sex']])
enc.categories_

In [92]:
enc.transform(df[['sex']])

<3618x2 sparse matrix of type '<class 'numpy.float64'>'
	with 3618 stored elements in Compressed Sparse Row format>

### Back to our data

In [31]:
for feature in df:
    if df[feature].dtype != 'float64':
        print(feature, np.unique(df[feature]))


sex ['F' 'M']
on_thyroxine ['f' 't']
query_on_thyroxine ['f' 't']
on_antithyroid_medication ['f' 't']
sick ['f' 't']
pregnant ['f' 't']
thyroid_surgery ['f' 't']
I131_treatment ['f' 't']
query_hypothyroid ['f' 't']
query_hyperthyroid ['f' 't']
lithium ['f' 't']
goitre ['f' 't']
tumor ['f' 't']
hypopituitary ['f' 't']
psych ['f' 't']
TSH_measured ['f' 't']
T3_measured ['f' 't']
TT4_measured ['f' 't']
T4U_measured ['f' 't']
FTI_measured ['f' 't']
TBG_measured ['f']
referral_source ['STMW' 'SVHC' 'SVHD' 'SVI' 'other']
Class ['compensated_hypothyroid' 'negative' 'primary_hypothyroid'
 'secondary_hypothyroid']


In [32]:
# Let's drop the referral source and TBG measures since they seem to be meaningless
df = df.drop(['TBG_measured', 'referral_source'], axis = 1)

In [93]:
df_enc = df.copy()

In [96]:
for feature in df_enc:
    if df_enc[feature].dtype != 'float64':
        enc = OrdinalEncoder()
        df_enc[feature] = enc.fit_transform(df_enc[[feature]])

In [97]:
df_enc.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,referral_source,Class
0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,1.0,125.0,1.0,1.14,1.0,109.0,0.0,1.0,1.0
1,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,102.0,0.0,0.99531,0.0,110.357701,0.0,4.0,1.0
2,46.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.011113,1.0,109.0,1.0,0.91,1.0,120.0,0.0,4.0,1.0
3,70.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.9,1.0,175.0,0.0,0.99531,0.0,110.357701,0.0,4.0,1.0
4,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.2,1.0,61.0,1.0,0.87,1.0,70.0,0.0,3.0,1.0


# Feature selection

## Tree

In [101]:
from sklearn.tree import DecisionTreeClassifier

In [109]:
X = df_enc.drop(['Class'], axis = 1)
y = df_enc['Class'] # the target value are the class columns

tree = DecisionTreeClassifier(random_state=420) 
tree.fit(X, y)

DecisionTreeClassifier(random_state=420)

In [110]:
fimp_df = pd.DataFrame({"feature_names": X.columns, "feature_importance": tree.feature_importances_}).sort_values(by = "feature_importance", ascending=False)

fimp_df[fimp_df['feature_importance'] > 10e-5]

Unnamed: 0,feature_names,feature_importance
17,TSH,0.590968
25,FTI,0.175651
2,on_thyroxine,0.132206
21,TT4,0.04694
7,thyroid_surgery,0.030891
20,TT4_measured,0.004998
23,T4U,0.003818
0,age,0.003791
19,T3,0.003699
9,query_hypothyroid,0.002559


## K best

**Chi2**
- https://www.youtube.com/watch?v=2QeDRsxSF9M

In [159]:
from sklearn.feature_selection import SelectKBest, chi2

In [160]:
X = df_enc.drop(['Class'], axis = 1)
y = df_enc['Class'] # the target value are the class columns

In [167]:
selector = SelectKBest(chi2, k=3)
selector.fit(X, y)

SelectKBest(k=3, score_func=<function chi2 at 0x000002B19CDC7040>)

In [168]:
X_new = X.iloc[:, selector.get_support(indices=True)]
X_new

Unnamed: 0,TSH,TT4,FTI
0,1.300000,125.000000,109.000000
1,4.100000,102.000000,110.357701
2,0.980000,109.000000,120.000000
3,0.160000,175.000000,110.357701
4,0.720000,61.000000,70.000000
...,...,...,...
3767,4.558648,108.274566,110.357701
3768,1.000000,124.000000,114.000000
3769,5.100000,112.000000,105.000000
3770,0.700000,82.000000,87.000000


### Test the selected features

In [203]:
from sklearn.linear_model import LogisticRegression

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=420)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_new, y, test_size=0.33, random_state=420)

In [205]:
np.all(X_train.index == X2_train.index)

True

In [206]:
reg1 = LogisticRegression(random_state=420)
reg1.fit(X_train, y_train)

reg2 = LogisticRegression(random_state=420)
reg2.fit(X2_train, y2_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=420)

In [207]:
print(f"Accuracy with all features: {reg1.score(X_test, y_test)}")
print(f"Accuracy with selected features: {reg2.score(X2_test, y2_test)}")

Accuracy with all features: 0.9455611390284757
Accuracy with selected features: 0.9447236180904522


In [208]:
#Tree 
tree1 = DecisionTreeClassifier(random_state=420)
tree1.fit(X_train, y_train)
tree2 = DecisionTreeClassifier(random_state=420)
tree2.fit(X2_train, y2_train)
print(f"Accuracy with all features: {tree1.score(X_test, y_test)}")
print(f"Accuracy with selected features: {tree2.score(X2_test, y2_test)}")

Accuracy with all features: 0.992462311557789
Accuracy with selected features: 0.97571189279732
